Skip to content

Commit

Permalink
Update README.md and rebrand package
Browse files Browse the repository at this point in the history
  • Loading branch information
dlazesz committed Jul 1, 2020
1 parent 15cc546 commit 5c0e261
Show file tree
Hide file tree
Showing 12 changed files with 136 additions and 119 deletions.
212 changes: 114 additions & 98 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion configs/extractors/site_specific_extractor_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,7 +769,7 @@ def extract_article_urls_from_page_test(filename, test_logger):


if __name__ == '__main__':
from corpusbuilder import WarcCachingDownloader, Logger
from webarticlecurator import WarcCachingDownloader, Logger

main_logger = Logger()

Expand Down
11 changes: 0 additions & 11 deletions corpusbuilder/__init__.py

This file was deleted.

13 changes: 7 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,20 @@
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

import setuptools
from corpusbuilder import __version__
from webarticlecurator import __version__

with open('README.md') as fh:
long_description = fh.read()

setuptools.setup(
name='corpusbuilder',
name='webarticlecurator',
version=__version__,
author='dlazesz', # Will warn about missing e-mail
description='Corpus generator program to download content of Hungarian online newspapers',
description='A crawler program to download content from portals (news, forums, blogs) and convert it'
' to the desired output format according to the configuration.',
long_description=long_description,
long_description_content_type='text/markdown',
url='https://github.com/dlazesz/corpusbuilder',
url='https://github.com/ELTE-DH/WebArticleCurator',
# license='GNU Lesser General Public License v3 (LGPLv3)', # Never really used in favour of classifiers
# platforms='any', # Never really used in favour of classifiers
packages=setuptools.find_packages(exclude=['tests']),
Expand All @@ -24,12 +25,12 @@
'Operating System :: POSIX :: Linux',
],
python_requires='>=3.6',
install_requires=['pyyaml', 'chardet', 'requests', 'urllib3', 'warcio', 'ratelimit', 'beautifulsoup4',
install_requires=['pyyaml', 'chardet', 'requests', 'urllib3', 'warcio', 'ratelimit', 'beautifulsoup4', 'yamale',
'newspaper3k'], # Newspaper3k is optional!
include_package_data=True,
entry_points={
'console_scripts': [
'corpusbuilder=corpusbuilder.__main__:main',
'webarticlecurator=webarticlecurator.__main__:main',
]
},
)
11 changes: 11 additions & 0 deletions webarticlecurator/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-

"""Package root for webarticlecurator.

Re-exports the public crawler, downloader, and utility entry points so that
callers can import everything from the package root, e.g.
``from webarticlecurator import WarcCachingDownloader, Logger``.
"""

# NOTE: 'wrap_input_consants' [sic] — the misspelled name is the actual
# identifier defined in webarticlecurator.utils; keep it for compatibility.
from webarticlecurator.utils import Logger, wrap_input_consants, DummyConverter
from webarticlecurator.enhanced_downloader import WarcCachingDownloader, sample_warc_by_urls, validate_warc_file, \
    online_test
from webarticlecurator.news_crawler import NewsArchiveCrawler, NewsArticleCrawler
from webarticlecurator.version import __version__

# __all__ must contain attribute *names* (strings). The original listed the
# __version__ value itself (e.g. '1.0.0'), which would break
# ``from webarticlecurator import *`` with an AttributeError; export the
# name '__version__' instead.
__all__ = ['NewsArchiveCrawler', 'NewsArticleCrawler', 'DummyConverter', 'WarcCachingDownloader', 'sample_warc_by_urls',
           'validate_warc_file', 'online_test', 'Logger', 'wrap_input_consants', '__version__']
4 changes: 2 additions & 2 deletions corpusbuilder/__main__.py → webarticlecurator/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
from argparse import ArgumentParser, ArgumentTypeError, FileType

from corpusbuilder import wrap_input_consants, NewsArchiveCrawler, NewsArticleCrawler, sample_warc_by_urls, \
from webarticlecurator import wrap_input_consants, NewsArchiveCrawler, NewsArticleCrawler, sample_warc_by_urls, \
validate_warc_file, online_test, Logger, __version__


Expand Down Expand Up @@ -45,7 +45,7 @@ def parse_args_crawl(parser):
parser.add_argument('--strict', type=str2bool, nargs='?', const=True, default=False, metavar='True/False',
help='Set strict-mode in WARCReader to enable validation')
parser.add_argument('--crawler-name', type=str, help='The name of the crawler for the WARC info record',
default='corpusbuilder {0}'.format(__version__))
default='WebArticleCurator {0}'.format(__version__))
parser.add_argument('--user-agent', type=str, help='The User-Agent string to use in headers while downloading')
parser.add_argument('--no-overwrite-warc', help='Do not overwrite --{archive,articles}-warc if needed',
action='store_false')
Expand Down
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from datetime import timedelta

from corpusbuilder import WarcCachingDownloader, Logger
from webarticlecurator import WarcCachingDownloader, Logger


class NewsArchiveCrawler:
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 5c0e261

Please sign in to comment.