Fix archive page crawler ending sequence
dlazesz committed Feb 10, 2022
1 parent 7987a58 commit 518a374
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions src/webarticlecurator/news_crawler.py
@@ -186,7 +186,6 @@ def _gen_article_urls_including_subpages(self, archive_page_url_base):
         first_page = True
         next_page_url = archive_page_url_base.replace('#pagenum', self._initial_page_num)
         while next_page_url is not None or tries_left > 0:
-            article_urls = []
             archive_page_raw_html = self._downloader.download_url(next_page_url, self._ignore_archive_cache)
             tries_left -= 1
             curr_page_url = next_page_url
@@ -200,7 +199,10 @@ def _gen_article_urls_including_subpages(self, archive_page_url_base):
             # 2) Generate next-page URL or None if there should not be any
             next_page_url = self._find_next_page_url(archive_page_url_base, page_num, archive_page_raw_html,
                                                      article_urls)
-            tries_left = self._max_tries  # Restore tries_left
+            if next_page_url is not None:
+                tries_left = self._max_tries  # Restore tries_left
+            else:
+                tries_left = 0  # We have arrived at the end
             page_num += 1  # Bump pagenum for next round
             first_page = False
             self._logger.log('DEBUG', 'URLs/ARCHIVE PAGE', curr_page_url, len(article_urls), sep='\t')
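Note on the fix, for context: before this change, tries_left was restored to self._max_tries unconditionally after every archive page, so when _find_next_page_url returned None the loop condition (next_page_url is not None or tries_left > 0) stayed true and the crawler spent extra rounds trying to fetch a nonexistent page before stopping. Zeroing tries_left on the last page makes both halves of the condition false, so the crawl ends immediately. Below is a minimal, self-contained sketch of that ending sequence under stated assumptions: fetch, extract_article_urls, and find_next_page_url are hypothetical stand-ins for the real downloader and portal-specific methods, and only the tries_left bookkeeping mirrors the diff above.

    MAX_TRIES = 3  # stands in for self._max_tries

    def gen_article_urls(archive_page_url_base, initial_page_num,
                         fetch, extract_article_urls, find_next_page_url):
        """Yield article URLs page by page, stopping cleanly after the last page."""
        tries_left = MAX_TRIES
        page_num = initial_page_num
        next_page_url = archive_page_url_base.replace('#pagenum', str(page_num))
        while next_page_url is not None or tries_left > 0:
            raw_html = fetch(next_page_url)
            tries_left -= 1
            if raw_html is None:
                if tries_left > 0:
                    continue  # Download failed: retry the same URL
                break  # Retries exhausted: give up on this page
            article_urls = extract_article_urls(raw_html)
            yield from article_urls
            next_page_url = find_next_page_url(archive_page_url_base, page_num,
                                               raw_html, article_urls)
            if next_page_url is not None:
                tries_left = MAX_TRIES  # More pages follow: restore the retry budget
            else:
                tries_left = 0  # Last page reached: both loop conditions now fail
            page_num += 1  # Bump pagenum for the next round

Judging from the commit title, the design intent is that the retry budget only guards against transient download failures, not against the normal end of the archive, which is why the two cases are now handled separately.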
