Skip to content

Commit

Permalink
Fix: Make Newspaper3k an optional dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
dlazesz committed Nov 30, 2021
1 parent 8cdf0c1 commit 4a4420d
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 9 deletions.
11 changes: 6 additions & 5 deletions configs/extractors/corpus_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
from datetime import datetime

import yaml
from newspaper import Article

"""Here comes the stuff to extract data from a specific downloaded webpage (article)"""


class CorpusConverterNewspaper: # Mimic CorpusConverter
def __init__(self, settings):
from newspaper import Article
self._article = Article

self._logger = Namespace(log=print) # Hack to be able to monkeypatch logger

# Set output_file handle
Expand All @@ -35,13 +37,12 @@ def identify_site_scheme(self, url, article_raw_html):
'NO MATCHING COLUMN_RE PATTERN! IGNORING ARTICLE!', sep='\t', file=sys.stderr)
return None

@staticmethod
def extract_article_date(url, article_raw_html, scheme):
def extract_article_date(self, url, article_raw_html, scheme):
"""
extracts and returns the date of the article from its raw HTML code if there is one...
"""
_ = url, scheme # Silence dummy IDE
article = Article(url, memoize_articles=False, language='hu')
article = self._article(url, memoize_articles=False, language='hu')
article.download(input_html=article_raw_html)
article.parse()
article.nlp()
Expand All @@ -50,7 +51,7 @@ def extract_article_date(url, article_raw_html, scheme):

def article_to_corpus(self, url, article_raw_html, scheme):
_ = scheme # Silence dummy IDE
article = Article(url, memoize_articles=False, language='hu')
article = self._article(url, memoize_articles=False, language='hu')
article.download(input_html=article_raw_html)
article.parse()
article.nlp()
Expand Down
3 changes: 2 additions & 1 deletion configs/site_schemas/24hu_belfold1_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"portal_specific_exctractor_functions_file": "../extractors/site_specific_extractor_functions_news_ngvmt.py"
"extract_next_page_url_fun": "extract_next_page_url_24hu"
"extract_article_urls_from_page_fun": "extract_article_urls_from_page_24hu"
"next_page_of_article_fun": "next_page_of_article_24hu"
"extract_article_urls_from_page_plus_fun": "extract_article_urls_from_page_plus_24hu"

"next_url_by_pagenum": false
Expand All @@ -48,4 +49,4 @@
# "new_article_url_threshold": 0

"corpus_converter_file": "../extractors/corpus_converters.py"
"corpus_converter": "dummy-converter"
"corpus_converter": "MultiPageArticleConverter"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ urllib3
warcio
ratelimit
beautifulsoup4
lxml
yamale
# Optional
# newspaper3k
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
'Operating System :: POSIX :: Linux',
],
python_requires='>=3.7',
install_requires=['pyyaml', 'chardet', 'requests', 'urllib3', 'warcio', 'ratelimit', 'beautifulsoup4', 'yamale',
'newspaper3k'], # Newspaper3k is optional!
install_requires=['pyyaml', 'chardet', 'requests', 'urllib3', 'warcio', 'ratelimit', 'beautifulsoup4', 'lxml',
'yamale'],
# pip install webarticlecurator[newspaper3k]
extras_require={'newspaper3k': ['newspaper3k>=0.2.8,<1.0.0']},
include_package_data=True,
Expand Down
2 changes: 1 addition & 1 deletion webarticlecurator/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

__version__ = '1.7.2'
__version__ = '1.7.4'

0 comments on commit 4a4420d

Please sign in to comment.