diff --git a/webarticlecurator/__init__.py b/webarticlecurator/__init__.py
index a7920ec..75c4b7f 100644
--- a/webarticlecurator/__init__.py
+++ b/webarticlecurator/__init__.py
@@ -10,4 +10,5 @@
 from webarticlecurator.news_crawler import NewsArchiveCrawler, NewsArticleCrawler
 from webarticlecurator.version import __version__
 
-__all__ = ['NewsArchiveCrawler', 'NewsArticleCrawler', 'DummyConverter', 'WarcCachingDownloader', 'Logger', 'wrap_input_constants', __version__]
+__all__ = ['NewsArchiveCrawler', 'NewsArticleCrawler', 'DummyConverter', 'WarcCachingDownloader', 'Logger',
+           'wrap_input_constants', __version__]
diff --git a/webarticlecurator/__main__.py b/webarticlecurator/__main__.py
index 739ee09..4402fab 100644
--- a/webarticlecurator/__main__.py
+++ b/webarticlecurator/__main__.py
@@ -126,7 +126,7 @@ def parse_args_sample(parser):
     parser.add_argument('-n', '--negative', type=str2bool, nargs='?', const=True, default=False, metavar='True/False',
                         help='Sample input-urls URLs which are not present in the source archive (default False)')
     parser.add_argument('-c', '--config', type=str, default=None, metavar='CONFIG_FILE_NAME',
-                        help='Portal configfile (see configs folder for examples!)')
+                        help='Portal configfile (see configs folder for examples!)', required=True)
     parser.add_argument('--allow-cookies', type=str2bool, nargs='?', const=True, default=False, metavar='True/False',
                         help='Allow session cookies')
     parser.add_argument('--max-tries', type=int, help='No of maximal tries if the download fails because duplicate '
diff --git a/webarticlecurator/enhanced_downloader.py b/webarticlecurator/enhanced_downloader.py
index eb7ade8..3679cdb 100644
--- a/webarticlecurator/enhanced_downloader.py
+++ b/webarticlecurator/enhanced_downloader.py
@@ -128,7 +128,7 @@ def download_url(self, url, ignore_cache=False, return_warc_records_wo_writing=F
         # 3) Check if the URL presents in the cached_content...
         elif url in self.url_index:
             # 3a) ...retrieve it! (from the last source WARC where the URL is found in)
-            cache, reqv, resp = self.get_records(url)
+            cache, reqv, resp = self.get_records_offset(url)
             # 3b) Get content even if the URL is a duplicate, because ignore_cache knows better what to do with it
             cached_content = cache.download_url(url)
             # 3c) Decide to return the records with the content XOR write the records and return the content only
@@ -155,7 +155,7 @@ def download_url(self, url, ignore_cache=False, return_warc_records_wo_writing=F
     def write_records_for_url(self, url, rec):
         self._new_downloads.write_records_for_url(url, rec)
 
-    def get_records(self, url):
+    def get_records_offset(self, url):
         for cache in reversed(self._cached_downloads):
             if url in cache.url_index:
                 reqv, resp = cache.get_record_data(url)
@@ -165,6 +165,12 @@
                              ' but is in the URL index!'.format(url))
         return cache, reqv, resp
 
+    def get_records(self, url):
+        cache, reqv, resp = self.get_records_offset(url)
+        reqv_rec = cache.get_record(reqv[0])
+        resp_rec = cache.get_record(resp[0])
+        return cache, reqv_rec, resp_rec
+
     @property
     def bad_urls(self):  # Ready-only property for shortcut
         return self._new_downloads.bad_urls
diff --git a/webarticlecurator/version.py b/webarticlecurator/version.py
index ae3737d..06d9fea 100644
--- a/webarticlecurator/version.py
+++ b/webarticlecurator/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8, vim: expandtab:ts=4 -*-
 
-__version__ = '1.8.0'
+__version__ = '1.9.0'
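
Judging from the hunks above, the renamed get_records_offset() returns the per-URL data as stored in the URL index (what cache.get_record_data() yields), while the new get_records() wrapper resolves that data into full WARC records via cache.get_record(). Below is a minimal usage sketch under that assumption; it is not part of the patch, and the function name show_both_views, the downloader argument and the variable names are illustrative only.

def show_both_views(downloader, url):
    # Illustrative sketch: `downloader` is assumed to be an already constructed
    # WarcCachingDownloader whose cache contains `url`.
    # Index-level data, as returned by the renamed method.
    cache, reqv, resp = downloader.get_records_offset(url)
    # Full WARC records, resolved from that data by the new wrapper.
    cache, reqv_rec, resp_rec = downloader.get_records(url)
    return (reqv, resp), (reqv_rec, resp_rec)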