Skip to content

Commit

Permalink
Fix support for a one single page column with multiple multipage columns
Browse files Browse the repository at this point in the history
  • Loading branch information
dlazesz committed Mar 8, 2021
1 parent bfc4243 commit ac18641
Show file tree
Hide file tree
Showing 6 changed files with 130 additions and 94 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ In the `columns` dictionary, the following features can be set for each column (
- `date_first_article` (optional): The date of the first article on the portal/column (also used for archive crawling)
- `date_last_article` (optional): The date of the last article on the portal/column (also used for archive crawling)
- `initial_pagenum` (optional): The initial page number which could be omitted (an empty string if not set, else it should be `min_pagenum` - 1)
- `min_pagenum` (optional): The "first" page number to increment
- `min_pagenum` (optional): The "first" page number to increment (i.e. `initial_pagenum` + 1 == `min_pagenum` <= `max_pagenum` must hold; for a single-page column only `initial_pagenum` must be specified, while `min_pagenum` and `max_pagenum` must be omitted)
- `max_pagenum` (optional): The upper bound of the number of pages for safety or for stop criteria
- `archive_url_format`: The schema for the archive URL of the portal/column (supply `#year`, `#month`, `#day` and
`#next-year`, `#next-month`, `#next-day` tags which have to be replaced with the actual field of date, and
Expand Down
20 changes: 20 additions & 0 deletions configs/extractors/site_specific_extractor_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1213,6 +1213,17 @@ def next_page_of_article_444(curr_html):
return None


def next_page_of_article_valasz(curr_html):
    """Return the URL of the next page of a multi-page valasz.hu article, or None.

    A link is produced only when the parsed page contains an <article> element with the
    'percro-percre-lista' class and an <a rel="next"> tag that carries an href attribute.
    The href value is appended to the 'http://valasz.hu' prefix.

    :param curr_html: the HTML source of the current article page
    :return: the URL of the next page as a string, or None if there is no next page
    """
    soup = BeautifulSoup(curr_html, 'lxml')

    # Only pages containing the 'percro-percre-lista' article element are considered
    if soup.find('article', class_='percro-percre-lista') is None:
        return None

    next_anchor = soup.find('a', rel='next')
    if next_anchor is None or 'href' not in next_anchor.attrs:
        return None

    next_link = next_anchor.attrs['href']
    return f'http://valasz.hu{next_link}'


def next_page_of_article_test(filename, test_logger):
"""Quick test for extracting URLs form an archive page"""
# This function is intended to be used from this file only as the import of WarcCachingDownloader is local to main()
Expand All @@ -1236,6 +1247,15 @@ def next_page_of_article_test(filename, test_logger):
text = w.download_url('https://444.hu/2014/03/23/real-madrid-barcelona-elo?page=7')
assert next_page_of_article_444(text) is None

text = w.download_url('http://valasz.hu/itthon/percrol-percre-az-onkormanyzati-valasztasokrol-105350?page=3')
assert next_page_of_article_valasz(text) == 'http://valasz.hu/itthon/percrol-percre-az-onkormanyzati' \
'-valasztasokrol-105350?page=4'
text = w.download_url('http://valasz.hu/vilag/kelet-ukrajna-percrol-percre-103699?page=3')
assert next_page_of_article_valasz(text) is None
text = w.download_url('http://valasz.hu/itthon/humboldt-dijas-kvantumfizikus-alapkutatas-nelkul'
'-nincs-fejlodes-129197')
assert next_page_of_article_valasz(text) is None

test_logger.log('INFO', 'Test OK!')


Expand Down
178 changes: 93 additions & 85 deletions configs/site_schemas/valasz_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
"min_pagenum": 2
"max_pagenum": 243

# "budapest/pest-budai-latkep":
# "archive_url_format": "http://valasz.hu/pest_budai_latkep?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 162
# Duplicate: part of "budapest"
"budapest/pest-budai-latkep":
"archive_url_format": "http://valasz.hu/pest_budai_latkep?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 162

"gadasag":
"archive_url_format": "http://valasz.hu/uzlet?page=#pagenum"
Expand Down Expand Up @@ -49,9 +50,8 @@
"gazdasag/magyarok-a-piacon":
"archive_url_format": "http://valasz.hu/magyarokapiacon?page=#pagenum"

# Only one page for this column
"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 1

"gazdasag/pr":
"archive_url_format": "http://valasz.hu/pr?page=#pagenum"
Expand All @@ -63,9 +63,8 @@
"gazdasag/techvilag":
"archive_url_format": "http://valasz.hu/techvilag?page=#pagenum"

# Only one page for this column
"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 1

"itthon":
"archive_url_format": "http://valasz.hu/itthon?page=#pagenum"
Expand All @@ -74,33 +73,36 @@
"min_pagenum": 2
"max_pagenum": 2341

# "itthon/csalad":
# "archive_url_format": "http://valasz.hu/csalad?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 1
#
# "itthon/eletmod":
# "archive_url_format": "http://valasz.hu/eletmod?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 187
#
# "itthon/felsooktatasi-rangsor":
# "archive_url_format": "http://valasz.hu/felsooktatasi_rangsor?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 11
#
# "itthon/reflektor":
# "archive_url_format": "http://valasz.hu/reflektor?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 86
# Duplicate: part of "itthon"
"itthon/csalad":
"archive_url_format": "http://valasz.hu/csalad?page=#pagenum"

# Only one page for this column
"initial_pagenum": 1

# Duplicate: part of "itthon"
"itthon/eletmod":
"archive_url_format": "http://valasz.hu/eletmod?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 187

# Duplicate: part of "itthon"
"itthon/felsooktatasi-rangsor":
"archive_url_format": "http://valasz.hu/felsooktatasi_rangsor?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 11

# Duplicate: part of "itthon"
"itthon/reflektor":
"archive_url_format": "http://valasz.hu/reflektor?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 86

"kultura":
"archive_url_format": "http://valasz.hu/kultura?page=#pagenum"
Expand All @@ -109,47 +111,52 @@
"min_pagenum": 2
"max_pagenum": 1693

# "kultura/esszencia":
# "archive_url_format": "http://valasz.hu/esszencia?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 11
#
# "kultura/gasztronomia":
# "archive_url_format": "http://valasz.hu/gasztronomia?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 62
#
# "kultura/kult-tura":
# "archive_url_format": "http://valasz.hu/kult_tura?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 1
#
# "kultura/mozi":
# "archive_url_format": "http://valasz.hu/mozi?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 21
#
# "kultura/sport":
# "archive_url_format": "http://valasz.hu/sport?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 437
#
# "kultura/szinhaz":
# "archive_url_format": "http://valasz.hu/szinhaz?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 22
# Duplicate: part of "kultura"
"kultura/esszencia":
"archive_url_format": "http://valasz.hu/esszencia?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 11

# Duplicate: part of "kultura"
"kultura/gasztronomia":
"archive_url_format": "http://valasz.hu/gasztronomia?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 62

# Duplicate: part of "kultura"
"kultura/kult-tura":
"archive_url_format": "http://valasz.hu/kult_tura?page=#pagenum"

# Only one page for this column
"initial_pagenum": 1

# Duplicate: part of "kultura"
"kultura/mozi":
"archive_url_format": "http://valasz.hu/mozi?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 21

# Duplicate: part of "kultura"
"kultura/sport":
"archive_url_format": "http://valasz.hu/sport?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 437

# Duplicate: part of "kultura"
"kultura/szinhaz":
"archive_url_format": "http://valasz.hu/szinhaz?page=#pagenum"

"initial_pagenum": 1
"min_pagenum": 2
"max_pagenum": 22

"publi": # Publi column is special!
"archive_url_format": "http://valasz.hu/publi?page=#pagenum"
Expand All @@ -165,15 +172,16 @@
"min_pagenum": 2
"max_pagenum": 1453

# "vilag/amerika-visszainteget":
# "archive_url_format": "http://valasz.hu/amerika_visszainteget?page=#pagenum"
#
# "initial_pagenum": 1
# "min_pagenum": 2
# "max_pagenum": 1
# Duplicate: part of "vilag"
"vilag/amerika-visszainteget":
"archive_url_format": "http://valasz.hu/amerika_visszainteget?page=#pagenum"

# Only one page for this column
"initial_pagenum": 1

"portal_specific_exctractor_functions_file": "../extractors/site_specific_extractor_functions.py"
"extract_article_urls_from_page_fun": "extract_article_urls_from_page_valasz"
"next_page_of_article_fun": "next_page_of_article_valasz"

"next_url_by_pagenum": true
"infinite_scrolling": false
Expand All @@ -185,4 +193,4 @@
# "new_article_url_threshold": 0

"corpus_converter_file": "../extractors/corpus_converters.py"
"corpus_converter": "dummy-converter"
"corpus_converter": "MultiPageArticleConverter"
Binary file modified tests/next_page_of_article.warc.gz
Binary file not shown.
22 changes: 15 additions & 7 deletions webarticlecurator/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,22 @@ def wrap_input_consants(current_task_config_filename):
max_pagenum = column_settings.get('max_pagenum')

if settings['next_url_by_pagenum']:
# If a column consists of only one page, min_pagenum and max_pagenum must be omitted
# while initial_pagenum must be set
if 'min_pagenum' not in column_settings:
raise ValueError('min_pagenum must be set, if next_url_by_pagenum is true!')

# If initial_pagenum is explicitly set, initial_pagenum + 1 == min_pagenum <= max_pagenum must be satisfied!
if ((isinstance(initial_pagenum, int) and min_pagenum != initial_pagenum + 1) or
(isinstance(max_pagenum, int) and min_pagenum > max_pagenum)):
raise ValueError('If initial_pagenum or max_pagenum is set,'
' initial_pagenum + 1 == min_pagenum <= max_pagenum must be satisfied!')
if 'max_pagenum' in column_settings or 'initial_pagenum' not in column_settings:
raise ValueError('min_pagenum can be omited iff max_pagenum is not present'
' and initial_pagenum is set, when next_url_by_pagenum is true!')
else: # min_pagenum and max_pagenum are not set, but initial_pagenum is. -> One page column!
min_pagenum = 2 # so max < min and exit immediately
max_pagenum = 1
else:
# If initial_pagenum is explicitly set as int and min_pagenum is set,
# initial_pagenum + 1 == min_pagenum <= max_pagenum must be satisfied!
if ((isinstance(initial_pagenum, int) and min_pagenum != initial_pagenum + 1) or
(isinstance(max_pagenum, int) and min_pagenum > max_pagenum)):
raise ValueError('If two or more from initial_pagenum, min_pagenum and max_pagenum are set,'
' initial_pagenum + 1 == min_pagenum <= max_pagenum must be satisfied!')

# If initial_pagenum is implicit, then it will be substituted with empty string. e.g. in &page=
column_settings['INITIAL_PAGENUM'] = str(initial_pagenum)
Expand Down
2 changes: 1 addition & 1 deletion webarticlecurator/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env pyhton3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

__version__ = '1.2.0'
__version__ = '1.2.1'

0 comments on commit ac18641

Please sign in to comment.