Fix get_records, add get_records_offset()

dlazesz committed Jan 1, 2022
1 parent 0486581 commit 742adff

Showing 4 changed files with 12 additions and 5 deletions.
3 changes: 2 additions & 1 deletion webarticlecurator/__init__.py
@@ -10,4 +10,5 @@
 from webarticlecurator.news_crawler import NewsArchiveCrawler, NewsArticleCrawler
 from webarticlecurator.version import __version__
 
-__all__ = ['NewsArchiveCrawler', 'NewsArticleCrawler', 'DummyConverter', 'WarcCachingDownloader', 'Logger', 'wrap_input_constants', __version__]
+__all__ = ['NewsArchiveCrawler', 'NewsArticleCrawler', 'DummyConverter', 'WarcCachingDownloader', 'Logger',
+           'wrap_input_constants', __version__]
2 changes: 1 addition & 1 deletion webarticlecurator/__main__.py
@@ -126,7 +126,7 @@ def parse_args_sample(parser):
     parser.add_argument('-n', '--negative', type=str2bool, nargs='?', const=True, default=False, metavar='True/False',
                         help='Sample input-urls URLs which are not present in the source archive (default False)')
     parser.add_argument('-c', '--config', type=str, default=None, metavar='CONFIG_FILE_NAME',
-                        help='Portal configfile (see configs folder for examples!)')
+                        help='Portal configfile (see configs folder for examples!)', required=True)
     parser.add_argument('--allow-cookies', type=str2bool, nargs='?', const=True, default=False, metavar='True/False',
                         help='Allow session cookies')
     parser.add_argument('--max-tries', type=int, help='No of maximal tries if the download fails because duplicate '
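
For context, a minimal standalone sketch of what `required=True` changes (hypothetical parser and config path, not the project's actual CLI wiring): omitting `-c`/`--config` now makes argparse exit with an error instead of leaving the value as None.

    import argparse

    parser = argparse.ArgumentParser()
    # Mirrors the changed option; the help text is copied from the diff, the rest is illustrative.
    parser.add_argument('-c', '--config', type=str, default=None, metavar='CONFIG_FILE_NAME',
                        help='Portal configfile (see configs folder for examples!)', required=True)

    # parser.parse_args([]) would now exit with:
    #   error: the following arguments are required: -c/--config
    args = parser.parse_args(['-c', 'configs/example.yaml'])  # hypothetical config path
    print(args.config)  # -> configs/example.yaml
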
10 changes: 8 additions & 2 deletions webarticlecurator/enhanced_downloader.py
@@ -128,7 +128,7 @@ def download_url(self, url, ignore_cache=False, return_warc_records_wo_writing=F
         # 3) Check if the URL presents in the cached_content...
         elif url in self.url_index:
             # 3a) ...retrieve it! (from the last source WARC where the URL is found in)
-            cache, reqv, resp = self.get_records(url)
+            cache, reqv, resp = self.get_records_offset(url)
             # 3b) Get content even if the URL is a duplicate, because ignore_cache knows better what to do with it
             cached_content = cache.download_url(url)
             # 3c) Decide to return the records with the content XOR write the records and return the content only
@@ -155,7 +155,7 @@ def download_url(self, url, ignore_cache=False, return_warc_records_wo_writing=F
     def write_records_for_url(self, url, rec):
         self._new_downloads.write_records_for_url(url, rec)
 
-    def get_records(self, url):
+    def get_records_offset(self, url):
         for cache in reversed(self._cached_downloads):
             if url in cache.url_index:
                 reqv, resp = cache.get_record_data(url)
@@ -165,6 +165,12 @@ def get_records(self, url):
                                 ' but is in the URL index!'.format(url))
         return cache, reqv, resp
 
+    def get_records(self, url):
+        cache, reqv, resp = self.get_records_offset(url)
+        reqv_rec = cache.get_record(reqv[0])
+        resp_rec = cache.get_record(resp[0])
+        return cache, reqv_rec, resp_rec
+
     @property
     def bad_urls(self):  # Ready-only property for shortcut
         return self._new_downloads.bad_urls
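
A hedged usage sketch of the split introduced above, assuming an already-initialised WarcCachingDownloader instance (here called `downloader`) and that the per-URL index stores tuples whose first element is a WARC offset, as `reqv[0]`/`resp[0]` in the diff suggest; the URL is illustrative.

    # Sketch only: `downloader` is assumed to be a WarcCachingDownloader set up elsewhere.
    url = 'https://example.com/article'  # illustrative URL
    if url in downloader.url_index:
        # Old behaviour, now under a new name: returns the source cache plus the
        # raw index entries (offset tuples) for the request and response records.
        cache, reqv, resp = downloader.get_records_offset(url)

        # The new get_records() resolves those offsets into parsed WARC records
        # via cache.get_record(offset), so callers receive record objects directly.
        cache, reqv_rec, resp_rec = downloader.get_records(url)
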
2 changes: 1 addition & 1 deletion webarticlecurator/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8, vim: expandtab:ts=4 -*-
 
-__version__ = '1.8.0'
+__version__ = '1.9.0'
