Fix get_records, add get_records_offset()

dlazesz committed Jan 1, 2022
1 parent 0486581 commit 742adff

Showing 4 changed files with 12 additions and 5 deletions.
3 changes: 2 additions & 1 deletion webarticlecurator/__init__.py
@@ -10,4 +10,5 @@
 from webarticlecurator.news_crawler import NewsArchiveCrawler, NewsArticleCrawler
 from webarticlecurator.version import __version__
 
-__all__ = ['NewsArchiveCrawler', 'NewsArticleCrawler', 'DummyConverter', 'WarcCachingDownloader', 'Logger', 'wrap_input_constants', __version__]
+__all__ = ['NewsArchiveCrawler', 'NewsArticleCrawler', 'DummyConverter', 'WarcCachingDownloader', 'Logger',
+           'wrap_input_constants', __version__]
2 changes: 1 addition & 1 deletion webarticlecurator/__main__.py
@@ -126,7 +126,7 @@ def parse_args_sample(parser):
     parser.add_argument('-n', '--negative', type=str2bool, nargs='?', const=True, default=False, metavar='True/False',
                         help='Sample input-urls URLs which are not present in the source archive (default False)')
     parser.add_argument('-c', '--config', type=str, default=None, metavar='CONFIG_FILE_NAME',
-                        help='Portal configfile (see configs folder for examples!)')
+                        help='Portal configfile (see configs folder for examples!)', required=True)
     parser.add_argument('--allow-cookies', type=str2bool, nargs='?', const=True, default=False, metavar='True/False',
                         help='Allow session cookies')
     parser.add_argument('--max-tries', type=int, help='No of maximal tries if the download fails because duplicate '
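
For context, a minimal standalone sketch of what `required=True` changes (hypothetical parser and config path, not the project's actual CLI wiring): omitting `-c`/`--config` now makes argparse exit with an error instead of leaving the value as None.

    import argparse

    parser = argparse.ArgumentParser()
    # Mirrors the changed option; the help text is copied from the diff, the rest is illustrative.
    parser.add_argument('-c', '--config', type=str, default=None, metavar='CONFIG_FILE_NAME',
                        help='Portal configfile (see configs folder for examples!)', required=True)

    # parser.parse_args([]) would now exit with:
    #   error: the following arguments are required: -c/--config
    args = parser.parse_args(['-c', 'configs/example.yaml'])  # hypothetical config path
    print(args.config)  # -> configs/example.yaml
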
10 changes: 8 additions & 2 deletions webarticlecurator/enhanced_downloader.py
@@ -128,7 +128,7 @@ def download_url(self, url, ignore_cache=False, return_warc_records_wo_writing=F
         # 3) Check if the URL presents in the cached_content...
         elif url in self.url_index:
             # 3a) ...retrieve it! (from the last source WARC where the URL is found in)
-            cache, reqv, resp = self.get_records(url)
+            cache, reqv, resp = self.get_records_offset(url)
             # 3b) Get content even if the URL is a duplicate, because ignore_cache knows better what to do with it
             cached_content = cache.download_url(url)
             # 3c) Decide to return the records with the content XOR write the records and return the content only
@@ -155,7 +155,7 @@ def download_url(self, url, ignore_cache=False, return_warc_records_wo_writing=F
     def write_records_for_url(self, url, rec):
         self._new_downloads.write_records_for_url(url, rec)
 
-    def get_records(self, url):
+    def get_records_offset(self, url):
         for cache in reversed(self._cached_downloads):
             if url in cache.url_index:
                 reqv, resp = cache.get_record_data(url)
@@ -165,6 +165,12 @@ def get_records(self, url):
                                 ' but is in the URL index!'.format(url))
         return cache, reqv, resp
 
+    def get_records(self, url):
+        cache, reqv, resp = self.get_records_offset(url)
+        reqv_rec = cache.get_record(reqv[0])
+        resp_rec = cache.get_record(resp[0])
+        return cache, reqv_rec, resp_rec
+
     @property
     def bad_urls(self):  # Ready-only property for shortcut
         return self._new_downloads.bad_urls
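
A hedged usage sketch of the split introduced above, assuming an already-initialised WarcCachingDownloader instance (here called `downloader`) and that the per-URL index stores tuples whose first element is a WARC offset, as `reqv[0]`/`resp[0]` in the diff suggest; the URL is illustrative.

    # Sketch only: `downloader` is assumed to be a WarcCachingDownloader set up elsewhere.
    url = 'https://example.com/article'  # illustrative URL
    if url in downloader.url_index:
        # Old behaviour, now under a new name: returns the source cache plus the
        # raw index entries (offset tuples) for the request and response records.
        cache, reqv, resp = downloader.get_records_offset(url)

        # The new get_records() resolves those offsets into parsed WARC records
        # via cache.get_record(offset), so callers receive record objects directly.
        cache, reqv_rec, resp_rec = downloader.get_records(url)
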
2 changes: 1 addition & 1 deletion webarticlecurator/version.py
@@ -1,4 +1,4 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8, vim: expandtab:ts=4 -*-
 
-__version__ = '1.8.0'
+__version__ = '1.9.0'
