Skip to content

Commit

Permalink
Add get_records() function to WarcCachingDownloader
Browse files Browse the repository at this point in the history
With this new function, the user can extract the raw WARC records
from the archive, or interact directly with the archive that contains the
selected URL.
  • Loading branch information
dlazesz committed Feb 10, 2021
1 parent 4c1b828 commit bfc4243
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 10 deletions.
22 changes: 13 additions & 9 deletions webarticlecurator/enhanced_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,7 @@ def download_url(self, url, ignore_cache=False):
# 3) Check if the URL presents in the cached_content...
elif url in self.url_index:
# 3a) ...copy it! (from the last source WARC where the URL is found in)
for cache in reversed(self._cached_downloads):
if url in cache.url_index:
reqv, resp = cache.get_record(url)
break
else:
raise ValueError('INTERNAL ERROR: {0} not found in any supplied source WARC file,'
' but is in the URL index!'.format(url))
cache, reqv, resp = self.get_records(url)
self._new_downloads.write_record(reqv, url)
self._new_downloads.write_record(resp, url)
# 3b) Get content even if the URL is a duplicate, because ignore_cache knows better what to do with it
Expand All @@ -100,6 +94,16 @@ def download_url(self, url, ignore_cache=False):
# 5) Really download the URL! (url not in cached_content or cached_content is ignored)
return self._new_downloads.download_url(url) # Still check if the URL is already downloaded!

def get_records(self, url):
    """Locate *url* in the source WARC caches and return its raw records.

    The caches in ``self._cached_downloads`` are scanned in reverse order,
    so the last source whose URL index contains the URL is the one used.

    :param url: the URL whose request/response record pair is wanted
    :return: a ``(cache, request_record, response_record)`` tuple, where
        ``cache`` is the source object the records were read from
    :raises ValueError: if none of the sources has the URL in its index
    """
    for source in reversed(self._cached_downloads):
        if url not in source.url_index:
            continue  # This source does not hold the URL, try the previous one
        request_record, response_record = source.get_record(url)
        return source, request_record, response_record

    # Every source was checked without a hit
    raise ValueError('INTERNAL ERROR: {0} not found in any supplied source WARC file,'
                     ' but is in the URL index!'.format(url))

@property
def bad_urls(self):
    """Read-only shortcut to the ``bad_urls`` attribute of ``self._new_downloads``."""
    downloader = self._new_downloads
    return downloader.bad_urls
Expand Down Expand Up @@ -325,7 +329,7 @@ def write_record(self, record, url):

class WarcReader:
def __init__(self, filename, _logger, strict_mode=False, check_digest=False):
self._inp_filename = filename
self.filename = filename
self._stream = open(filename, 'rb')
self._internal_url_index = {}
self._logger = _logger
Expand All @@ -350,7 +354,7 @@ def url_index(self): # Ready-only property for shortcut
return self._internal_url_index.keys()

def _create_index(self):
self._logger.log('INFO', 'Creating index for {0}...'.format(self._inp_filename))
self._logger.log('INFO', 'Creating index for {0}...'.format(self.filename))
archive_it = ArchiveIterator(self._stream, check_digests=self._check_digest)
info_rec = next(archive_it)
# First record should be an info record, then it should be followed by the request-response pairs
Expand Down
2 changes: 1 addition & 1 deletion webarticlecurator/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

__version__ = '1.1.0'
__version__ = '1.2.0'

0 comments on commit bfc4243

Please sign in to comment.