diff --git a/webarticlecurator/__main__.py b/webarticlecurator/__main__.py index ac468ce..739ee09 100644 --- a/webarticlecurator/__main__.py +++ b/webarticlecurator/__main__.py @@ -210,8 +210,14 @@ def main_validate_and_list(args): def main_cat_and_sample(args): """ __file__ sample [source warcfiles or None] [urls list file or stdin] [target warcfile] [Online or Offline] """ - extract_article_urls_from_page_plus_fun = \ - wrap_input_constants(args.config)['EXTRACT_ARTICLE_URLS_FROM_PAGE_PLUS_FUN'] + if args.command == 'sample': + extract_article_urls_from_page_plus_fun = \ + wrap_input_constants(args.config)['EXTRACT_ARTICLE_URLS_FROM_PAGE_PLUS_FUN'] + just_cache = False + else: + extract_article_urls_from_page_plus_fun = None + just_cache = True + main_logger = Logger() out_dir = getattr(args, 'out_dir', None) target_warcfile = getattr(args, 'target_warcfile', None) @@ -219,10 +225,12 @@ def main_cat_and_sample(args): main_logger.log('INFO', 'Adding URLs to', target, ':') offline = getattr(args, 'offline', True) # Sample can be online or offline, but we write warc only when sampling! negative = getattr(args, 'negative', False) # Sample URLs from warc not in input_stream + max_tries = getattr(args, 'max_tries', 1) + allow_cookies = getattr(args, 'allow_cookies', False) sample_warc_by_urls(args.source_warcfile, args.url_input_stream, main_logger, target_warcfile=target_warcfile, - offline=offline, out_dir=out_dir, just_cache=args.command == 'cat', negative=negative, + offline=offline, out_dir=out_dir, just_cache=just_cache, negative=negative, extract_article_urls_from_page_plus_fun=extract_article_urls_from_page_plus_fun, - max_tries=args.max_tries, allow_cookies=args.allow_cookies) + max_tries=max_tries, allow_cookies=allow_cookies) main_logger.log('INFO', 'Done!')