Skip to content

Commit

Permalink
Fix cat mode
Browse files (browse the repository at this point in the history)
  • Loading branch information
dlazesz committed Dec 14, 2021
1 parent a8da9e0 commit 0486581
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions webarticlecurator/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,19 +210,27 @@ def main_validate_and_list(args):

def main_cat_and_sample(args):
    """Entry point shared by the ``sample`` and ``cat`` commands.

    Usage: __file__ sample [source warcfiles or None] [urls list file or stdin] [target warcfile] [Online or Offline]

    The parsed namespace is forwarded to ``sample_warc_by_urls``. Only the ``sample`` command reads
    ``args.config`` (to obtain ``EXTRACT_ARTICLE_URLS_FROM_PAGE_PLUS_FUN``); every other command runs
    with ``just_cache=True`` and no URL-extractor function.

    :param args: parsed command line namespace. ``command``, ``source_warcfile`` and ``url_input_stream``
        are required (``config`` as well when ``command == 'sample'``); ``out_dir``, ``target_warcfile``,
        ``offline``, ``negative``, ``max_tries`` and ``allow_cookies`` are optional and fall back to the
        defaults below when the attribute is missing.
    """
    if args.command == 'sample':
        # Only sampling needs the portal configuration
        extract_article_urls_from_page_plus_fun = \
            wrap_input_constants(args.config)['EXTRACT_ARTICLE_URLS_FROM_PAGE_PLUS_FUN']
        just_cache = False
    else:
        # NOTE(review): presumably the 'cat' command, which has no config argument -- confirm against the parser
        extract_article_urls_from_page_plus_fun = None
        just_cache = True

    main_logger = Logger()
    out_dir = getattr(args, 'out_dir', None)
    target_warcfile = getattr(args, 'target_warcfile', None)
    # The output directory takes precedence over the target WARC file in the log message
    target = out_dir if out_dir is not None else target_warcfile
    main_logger.log('INFO', 'Adding URLs to', target, ':')
    offline = getattr(args, 'offline', True)  # Sample can be online or offline, but we write warc only when sampling!
    negative = getattr(args, 'negative', False)  # Sample URLs from warc not in input_stream
    # Read defensively: these attributes may be absent from the namespace depending on the command
    max_tries = getattr(args, 'max_tries', 1)
    allow_cookies = getattr(args, 'allow_cookies', False)
    sample_warc_by_urls(args.source_warcfile, args.url_input_stream, main_logger, target_warcfile=target_warcfile,
                        offline=offline, out_dir=out_dir, just_cache=just_cache, negative=negative,
                        extract_article_urls_from_page_plus_fun=extract_article_urls_from_page_plus_fun,
                        max_tries=max_tries, allow_cookies=allow_cookies)
    main_logger.log('INFO', 'Done!')


Expand Down

0 comments on commit 0486581

Please sign in to comment.