Skip to content

Commit

Permalink
Fix: Make Newspaper3k an optional dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
dlazesz committed Nov 30, 2021
1 parent 8cdf0c1 commit 4a4420d
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 9 deletions.
11 changes: 6 additions & 5 deletions configs/extractors/corpus_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,15 @@
from datetime import datetime

import yaml
from newspaper import Article

"""Here comes the stuff to extract data from a specific downloaded webpage (article)"""


class CorpusConverterNewspaper: # Mimic CorpusConverter
def __init__(self, settings):
from newspaper import Article
self._article = Article

self._logger = Namespace(log=print) # Hack to be able to monkeypatch logger

# Set output_file handle
Expand All @@ -35,13 +37,12 @@ def identify_site_scheme(self, url, article_raw_html):
'NO MATCHING COLUMN_RE PATTERN! IGNORING ARTICLE!', sep='\t', file=sys.stderr)
return None

@staticmethod
def extract_article_date(url, article_raw_html, scheme):
def extract_article_date(self, url, article_raw_html, scheme):
"""
extracts and returns the date of the article from its raw HTML code if there is one...
"""
_ = url, scheme # Silence dummy IDE
article = Article(url, memoize_articles=False, language='hu')
article = self._article(url, memoize_articles=False, language='hu')
article.download(input_html=article_raw_html)
article.parse()
article.nlp()
Expand All @@ -50,7 +51,7 @@ def extract_article_date(url, article_raw_html, scheme):

def article_to_corpus(self, url, article_raw_html, scheme):
_ = scheme # Silence dummy IDE
article = Article(url, memoize_articles=False, language='hu')
article = self._article(url, memoize_articles=False, language='hu')
article.download(input_html=article_raw_html)
article.parse()
article.nlp()
Expand Down
3 changes: 2 additions & 1 deletion configs/site_schemas/24hu_belfold1_schema.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"portal_specific_exctractor_functions_file": "../extractors/site_specific_extractor_functions_news_ngvmt.py"
"extract_next_page_url_fun": "extract_next_page_url_24hu"
"extract_article_urls_from_page_fun": "extract_article_urls_from_page_24hu"
"next_page_of_article_fun": "next_page_of_article_24hu"
"extract_article_urls_from_page_plus_fun": "extract_article_urls_from_page_plus_24hu"

"next_url_by_pagenum": false
Expand All @@ -48,4 +49,4 @@
# "new_article_url_threshold": 0

"corpus_converter_file": "../extractors/corpus_converters.py"
"corpus_converter": "dummy-converter"
"corpus_converter": "MultiPageArticleConverter"
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ urllib3
warcio
ratelimit
beautifulsoup4
lxml
yamale
# Optional
# newspaper3k
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
'Operating System :: POSIX :: Linux',
],
python_requires='>=3.7',
install_requires=['pyyaml', 'chardet', 'requests', 'urllib3', 'warcio', 'ratelimit', 'beautifulsoup4', 'yamale',
'newspaper3k'], # Newspaper3k is optional!
install_requires=['pyyaml', 'chardet', 'requests', 'urllib3', 'warcio', 'ratelimit', 'beautifulsoup4', 'lxml',
'yamale'],
# pip install webarticlecurator[newspaper3k]
extras_require={'newspaper3k': ['newspaper3k>=0.2.8,<1.0.0']},
include_package_data=True,
Expand Down
2 changes: 1 addition & 1 deletion webarticlecurator/version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

__version__ = '1.7.2'
__version__ = '1.7.4'

0 comments on commit 4a4420d

Please sign in to comment.