diff --git a/config.yml b/config.yml index 53fac4863..7d8790195 100644 --- a/config.yml +++ b/config.yml @@ -25,16 +25,24 @@ network: # 设置代理服务器地址,支持 http, socks5/socks5h 代理,比如'http://127.0.0.1:1080' # null表示禁用代理 proxy_server: null - # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置 - proxy_free: - avsox: 'https://avsox.click' - javbus: 'https://www.seedmm.help' - javdb: 'https://javdb368.com' - javlib: 'https://www.y78k.com' # 网络问题导致抓取数据失败时的重试次数,通常3次就差不多了 - retry: 3 + retries: 3 # https://en.wikipedia.org/wiki/ISO_8601#Durations timeout: PT10S + # 对列表中的地址不使用梯子(如果启用了的话) + unproxied: [ + 'https://www.seedmm.help', + 'https://javdb368.com', + 'https://www.y78k.com', + 'https://www.javbus.one', + 'https://www.tellme.pw', + ] + # 各个站点的代替地址。 + # JavSP会按顺序尝试列表里的每一个服务器,如果都不行会使用默认的主站点地址 + fallback: + javbus: ['https://www.seedmm.help'] + javdb: ['https://javdb368.com'] + javlib: ['https://www.y78k.com'] ################################ crawler: @@ -52,8 +60,6 @@ crawler: hardworking: true # 使用网页番号作为最终番号(启用时会对番号大小写等进行更正) respect_site_avid: true - # fc2fan已关站。如果你有镜像,请设置本地镜像文件夹的路径,此文件夹内要有类似'FC2-12345.html'的网页文件 - fc2fan_local_path: null # 刮削一部电影后的等待时间(设置为0禁用此功能) # https://en.wikipedia.org/wiki/ISO_8601#Durations sleep_after_scraping: PT1S diff --git a/javsp/__main__.py b/javsp/__main__.py index 7771170e7..456bbebf8 100644 --- a/javsp/__main__.py +++ b/javsp/__main__.py @@ -3,13 +3,14 @@ import sys import json import time +import asyncio import logging from PIL import Image from pydantic import ValidationError +from pydantic_core import Url from pydantic_extra_types.pendulum_dt import Duration -import requests -import threading -from typing import Dict, List +from typing import Any, Coroutine, Dict, List +from javsp.crawlers.all import crawlers sys.stdout.reconfigure(encoding='utf-8') @@ -23,7 +24,7 @@ from javsp.print import TqdmOut -from javsp.cropper import Cropper, get_cropper +from javsp.cropper import get_cropper # 将StreamHandler的stream修改为TqdmOut,以与Tqdm协同工作 @@ -41,11 +42,11 @@ from javsp.func import * from javsp.image import * from javsp.datatype import Movie, MovieInfo -from javsp.web.base import download -from javsp.web.exceptions import * -from javsp.web.translate import translate_movie_info +from javsp.network.utils import url_download +from javsp.crawlers.exceptions import * +from javsp.translate import translate_movie_info -from javsp.config import Cfg, CrawlerID +from javsp.config import Cfg, CrawlerID, UseJavDBCover actressAliasMap = {} @@ -57,86 +58,49 @@ def resolve_alias(name): return name # 如果找不到别名对应的固定名字,则返回原名 -def import_crawlers(): - """按配置文件的抓取器顺序将该字段转换为抓取器的函数列表""" - unknown_mods = [] - for _, mods in Cfg().crawler.selection.items(): - valid_mods = [] - for name in mods: - try: - # 导入fc2fan抓取器的前提: 配置了fc2fan的本地路径 - # if name == 'fc2fan' and (not os.path.isdir(Cfg().Crawler.fc2fan_local_path)): - # logger.debug('由于未配置有效的fc2fan路径,已跳过该抓取器') - # continue - import_name = 'javsp.web.' 
+ name - __import__(import_name) - valid_mods.append(import_name) # 抓取器有效: 使用完整模块路径,便于程序实际使用 - except ModuleNotFoundError: - unknown_mods.append(name) # 抓取器无效: 仅使用模块名,便于显示 - if unknown_mods: - logger.warning('配置的抓取器无效: ' + ', '.join(unknown_mods)) - - # 爬虫是IO密集型任务,可以通过多线程提升效率 -def parallel_crawler(movie: Movie, tqdm_bar=None): +async def parallel_crawler(movie: Movie, tqdm_bar=None) -> dict[CrawlerID, MovieInfo]: """使用多线程抓取不同网站的数据""" - def wrapper(parser, info: MovieInfo, retry): + + async def wrapper(id: CrawlerID, movie: MovieInfo) -> None: """对抓取器函数进行包装,便于更新提示信息和自动重试""" - crawler_name = threading.current_thread().name - task_info = f'Crawler: {crawler_name}: {info.dvdid}' - for cnt in range(retry): - try: - parser(info) - movie_id = info.dvdid or info.cid - logger.debug(f"{crawler_name}: 抓取成功: '{movie_id}': '{info.url}'") - setattr(info, 'success', True) - if isinstance(tqdm_bar, tqdm): - tqdm_bar.set_description(f'{crawler_name}: 抓取完成') - break - except MovieNotFoundError as e: - logger.debug(e) - break - except MovieDuplicateError as e: - logger.exception(e) - break - except (SiteBlocked, SitePermissionError, CredentialError) as e: - logger.error(e) - break - except requests.exceptions.RequestException as e: - logger.debug(f'{crawler_name}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}') - if isinstance(tqdm_bar, tqdm): - tqdm_bar.set_description(f'{crawler_name}: 网络错误,正在重试') - except Exception as e: - logger.exception(e) + try: + crawler = await crawlers[id].create() + await crawler.crawl_and_fill(movie) + movie_id = info.dvdid or info.cid + logger.debug(f"{crawler.id.value}: 抓取成功: '{movie_id}': '{info.url}'") + setattr(info, 'success', True) + if isinstance(tqdm_bar, tqdm): + tqdm_bar.set_description(f'{crawler.id.value}: 抓取完成') + except MovieNotFoundError as e: + logger.debug(e) + except MovieDuplicateError as e: + logger.exception(e) + except (SiteBlocked, SitePermissionError, CredentialError) as e: + logger.error(e) + except Exception as e: + logger.exception(e) # 根据影片的数据源获取对应的抓取器 - crawler_mods: List[CrawlerID] = Cfg().crawler.selection[movie.data_src] + crawler_to_use: List[CrawlerID] = Cfg().crawler.selection[movie.data_src] + + all_info: Dict[CrawlerID, MovieInfo] = {i: MovieInfo(movie) for i in crawler_to_use} - all_info = {i.value: MovieInfo(movie) for i in crawler_mods} # 番号为cid但同时也有有效的dvdid时,也尝试使用普通模式进行抓取 if movie.data_src == 'cid' and movie.dvdid: - crawler_mods = crawler_mods + Cfg().crawler.selection.normal + crawler_to_use += Cfg().crawler.selection.normal for i in all_info.values(): i.dvdid = None for i in Cfg().crawler.selection.normal: all_info[i] = MovieInfo(movie.dvdid) - thread_pool = [] - for mod_partial, info in all_info.items(): - mod = f"javsp.web.{mod_partial}" - parser = getattr(sys.modules[mod], 'parse_data') - # 将all_info中的info实例传递给parser,parser抓取完成后,info实例的值已经完成更新 - # TODO: 抓取器如果带有parse_data_raw,说明它已经自行进行了重试处理,此时将重试次数设置为1 - if hasattr(sys.modules[mod], 'parse_data_raw'): - th = threading.Thread(target=wrapper, name=mod, args=(parser, info, 1)) - else: - th = threading.Thread(target=wrapper, name=mod, args=(parser, info, Cfg().network.retry)) - th.start() - thread_pool.append(th) - # 等待所有线程结束 - timeout = Cfg().network.retry * Cfg().network.timeout.total_seconds() - for th in thread_pool: - th: threading.Thread - th.join(timeout=timeout) + + co_pool: list[Coroutine[Any, Any, None]] = [] + for crawler_id, info in all_info.items(): + co_pool.append(wrapper(crawler_id, info)) + + # 等待所有协程结束 + await asyncio.gather(*co_pool) + # 根据抓取结果更新影片类型判定 if movie.data_src == 
'cid' and movie.dvdid: titles = [all_info[i].title for i in Cfg().crawler.selection[movie.data_src]] @@ -148,22 +112,22 @@ def wrapper(parser, info: MovieInfo, retry): movie.data_src = 'normal' movie.cid = None all_info = {k: v for k, v in all_info.items() if k not in Cfg().crawler.selection['cid']} + # 删除抓取失败的站点对应的数据 all_info = {k:v for k,v in all_info.items() if hasattr(v, 'success')} for info in all_info.values(): del info.success - # 删除all_info中键名中的'web.' - all_info = {k[4:]:v for k,v in all_info.items()} + return all_info -def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): +def info_summary(movie: Movie, all_info: Dict[CrawlerID, MovieInfo]): """汇总多个来源的在线数据生成最终数据""" final_info = MovieInfo(movie) ########## 部分字段配置了专门的选取逻辑,先处理这些字段 ########## # genre - if 'javdb' in all_info and all_info['javdb'].genre: - final_info.genre = all_info['javdb'].genre + if 'javdb' in all_info and all_info[CrawlerID.javdb].genre: + final_info.genre = all_info[CrawlerID.javdb].genre ########## 移除所有抓取器数据中,标题尾部的女优名 ########## if Cfg().summarizer.title.remove_trailing_actor_name: @@ -197,7 +161,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): setattr(final_info, attr, incoming) absorbed.append(attr) if absorbed: - logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed)) + logger.debug(f"从'{name.value}'中获取了字段: " + ' '.join(absorbed)) # 使用网站的番号作为番号 if Cfg().crawler.respect_site_avid: id_weight = {} @@ -216,7 +180,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]): else: final_info.cid = final_id # javdb封面有水印,优先采用其他站点的封面 - javdb_cover = getattr(all_info.get('javdb'), 'cover', None) + javdb_cover = getattr(all_info.get(CrawlerID.javdb), 'cover', None) if javdb_cover is not None: match Cfg().crawler.use_javdb_cover: case UseJavDBCover.fallback: @@ -402,7 +366,7 @@ def should_use_ai_crop_match(label): fanart_cropped = add_label_to_poster(fanart_cropped, UNCENSORED_MARK_FILE, LabelPostion.BOTTOM_LEFT) fanart_cropped.save(movie.poster_file) -def RunNormalMode(all_movies): +async def RunNormalMode(all_movies): """普通整理模式""" def check_step(result, msg='步骤错误'): """检查一个整理步骤的结果,并负责更新tqdm的进度""" @@ -427,7 +391,7 @@ def check_step(result, msg='步骤错误'): inner_bar = tqdm(total=total_step, desc='步骤', ascii=True, leave=False) # 依次执行各个步骤 inner_bar.set_description(f'启动并发任务') - all_info = parallel_crawler(movie, inner_bar) + all_info = await parallel_crawler(movie, inner_bar) msg = f'为其配置的{len(Cfg().crawler.selection[movie.data_src])}个抓取器均未获取到影片信息' check_step(all_info, msg) @@ -447,9 +411,9 @@ def check_step(result, msg='步骤错误'): inner_bar.set_description('下载封面图片') if Cfg().summarizer.cover.highres: - cover_dl = download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers) + cover_dl = await download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers) else: - cover_dl = download_cover(movie.info.covers, movie.fanart_file) + cover_dl = await download_cover(movie.info.covers, movie.fanart_file) check_step(cover_dl, '下载封面图片失败') cover, pic_path = cover_dl # 确保实际下载的封面的url与即将写入到movie.info中的一致 @@ -476,12 +440,12 @@ def check_step(result, msg='步骤错误'): fanart_destination = f"{extrafanartdir}/{id}.png" try: - info = download(pic_url, fanart_destination) + info = await url_download(Url(pic_url), fanart_destination) if valid_pic(fanart_destination): filesize = get_fmt_size(pic_path) width, height = get_pic_size(pic_path) - elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed'])) - speed = get_fmt_size(info['rate']) + '/s' + elapsed = str(info.elapsed) + speed = 
f"{info.get_rate()}Mbps" logger.info(f"已下载剧照{pic_url} {id}.png: {width}x{height}, {filesize} [{elapsed}, {speed}]") else: check_step(False, f"下载剧照{id}: {pic_url}失败") @@ -512,38 +476,29 @@ def check_step(result, msg='步骤错误'): return return_movies -def download_cover(covers, fanart_path, big_covers=[]): +async def download_cover(covers, fanart_path, big_covers=[]): """下载封面图片""" # 优先下载高清封面 for url in big_covers: pic_path = get_pic_path(fanart_path, url) - for _ in range(Cfg().network.retry): - try: - info = download(url, pic_path) - if valid_pic(pic_path): - filesize = get_fmt_size(pic_path) - width, height = get_pic_size(pic_path) - elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed'])) - speed = get_fmt_size(info['rate']) + '/s' - logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]") - return (url, pic_path) - except requests.exceptions.HTTPError: - # HTTPError通常说明猜测的高清封面地址实际不可用,因此不再重试 - break + info = await url_download(Url(url), pic_path) + if valid_pic(pic_path): + filesize = get_fmt_size(pic_path) + width, height = get_pic_size(pic_path) + elapsed = str(info.elapsed) + speed = f"{info.get_rate()}Mbps" + logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]") + return (url, pic_path) # 如果没有高清封面或高清封面下载失败 for url in covers: pic_path = get_pic_path(fanart_path, url) - for _ in range(Cfg().network.retry): - try: - download(url, pic_path) - if valid_pic(pic_path): - logger.debug(f"已下载封面: '{url}'") - return (url, pic_path) - else: - logger.debug(f"图片无效或已损坏: '{url}',尝试更换下载地址") - break - except Exception as e: - logger.debug(e, exc_info=True) + await url_download(Url(url), pic_path) + if valid_pic(pic_path): + logger.debug(f"已下载封面: '{url}'") + return (url, pic_path) + else: + logger.debug(f"图片无效或已损坏: '{url}',尝试更换下载地址") + break logger.error(f"下载封面图片失败") logger.debug('big_covers:'+str(big_covers) + ', covers'+str(covers)) return None @@ -558,14 +513,7 @@ def get_pic_path(fanart_path, url): pic_path = fanart_base + "." 
+ pic_extend return pic_path -def error_exit(success, err_info): - """检查业务逻辑是否成功完成,如果失败则报错退出程序""" - if not success: - logger.error(err_info) - sys.exit(1) - - -def entry(): +async def aentry(): try: Cfg() except ValidationError as e: @@ -583,22 +531,28 @@ def entry(): # 检查更新 version_info = 'JavSP ' + getattr(sys, 'javsp_version', '未知版本/从代码运行') logger.debug(version_info.center(60, '=')) - check_update(Cfg().other.check_update, Cfg().other.auto_update) + await check_update(Cfg().other.check_update, Cfg().other.auto_update) root = get_scan_dir(Cfg().scanner.input_directory) - error_exit(root, '未选择要扫描的文件夹') + if root is None: + logger.error('未选择要扫描的文件夹') + sys.exit(1) # 导入抓取器,必须在chdir之前 - import_crawlers() os.chdir(root) print(f'扫描影片文件...') recognized = scan_movies(root) movie_count = len(recognized) recognize_fail = [] - error_exit(movie_count, '未找到影片文件') + if movie_count == 0: + logger.error('未找到影片文件') + sys.exit(1) logger.info(f'扫描影片文件:共找到 {movie_count} 部影片') - RunNormalMode(recognized + recognize_fail) + await RunNormalMode(recognized + recognize_fail) sys.exit(0) +def entry(): + asyncio.run(aentry(), debug=True) + if __name__ == "__main__": entry() diff --git a/javsp/config.py b/javsp/config.py index 3fbc8f071..e87b5dc28 100644 --- a/javsp/config.py +++ b/javsp/config.py @@ -39,9 +39,10 @@ class CrawlerID(str, Enum): class Network(BaseConfig): proxy_server: Url | None - retry: NonNegativeInt = 3 + retries: NonNegativeInt = 3 timeout: Duration - proxy_free: Dict[CrawlerID, Url] + unproxied: List[Url] + fallback: Dict[CrawlerID, List[str]] class CrawlerSelect(BaseConfig): def items(self) -> List[tuple[str, list[CrawlerID]]]: @@ -109,7 +110,6 @@ class Crawler(BaseConfig): required_keys: list[MovieInfoField] hardworking: bool respect_site_avid: bool - fc2fan_local_path: Path | None sleep_after_scraping: Duration use_javdb_cover: UseJavDBCover normalize_actress_name: bool diff --git a/javsp/crawlers/all.py b/javsp/crawlers/all.py new file mode 100644 index 000000000..8c262ecc1 --- /dev/null +++ b/javsp/crawlers/all.py @@ -0,0 +1,30 @@ +from collections.abc import Coroutine +from typing import Any, Dict +from javsp.config import CrawlerID +from javsp.crawlers.interface import Crawler +from javsp.crawlers.sites import \ + airav, arzon, arzon_iv, avsox, avwiki, dl_getchu, fanza, fc2, fc2ppvdb, \ + gyutto, jav321, javbus, javdb, javlib, javmenu, mgstage, njav, prestige + +__all__ = ['crawlers'] + +crawlers: Dict[CrawlerID, type[Crawler]] = { + CrawlerID.airav: airav. AiravCrawler, + CrawlerID.arzon: arzon. ArzonCrawler, + CrawlerID.arzon_iv: arzon_iv. ArzonIvCrawler, + CrawlerID.avsox: avsox. AvsoxCrawler, + CrawlerID.avwiki: avwiki. AvWikiCrawler, + CrawlerID.dl_getchu: dl_getchu.DlGetchuCrawler, + CrawlerID.fanza: fanza. FanzaCrawler, + CrawlerID.fc2: fc2. Fc2Crawler, + CrawlerID.fc2ppvdb: fc2ppvdb. Fc2PpvDbCrawler, + CrawlerID.gyutto: gyutto. GyuttoCrawler, + CrawlerID.jav321: jav321. Jav321Crawler, + CrawlerID.javbus: javbus. JavbusCrawler, + CrawlerID.javdb: javdb. JavDbCrawler, + CrawlerID.javlib: javlib. JavLibCrawler, + CrawlerID.javmenu: javmenu. JavMenuCrawler, + CrawlerID.mgstage: mgstage. MgstageCrawler, + CrawlerID.njav: njav. NjavCrawler, + CrawlerID.prestige: prestige. 
PrestigeCrawler, +} diff --git a/javsp/web/exceptions.py b/javsp/crawlers/exceptions.py similarity index 100% rename from javsp/web/exceptions.py rename to javsp/crawlers/exceptions.py diff --git a/javsp/crawlers/interface.py b/javsp/crawlers/interface.py new file mode 100644 index 000000000..a641b0a27 --- /dev/null +++ b/javsp/crawlers/interface.py @@ -0,0 +1,21 @@ +from httpx import AsyncClient +from javsp.config import CrawlerID +from javsp.datatype import MovieInfo +from abc import ABC, abstractmethod +from typing import Self + + +class Crawler(ABC): + base_url: str + client: AsyncClient + id: CrawlerID + + + @classmethod + @abstractmethod + async def create(cls) -> Self: + pass + + @abstractmethod + async def crawl_and_fill(self, movie: MovieInfo) -> None: + pass diff --git a/javsp/crawlers/proxyfree.py b/javsp/crawlers/proxyfree.py new file mode 100644 index 000000000..381eeb7af --- /dev/null +++ b/javsp/crawlers/proxyfree.py @@ -0,0 +1,98 @@ +"""获取各个网站的免代理地址""" +from collections.abc import Callable, Coroutine +import re +from typing import Any, Dict + +from pydantic_core import Url +from pydantic_extra_types.pendulum_dt import Duration +from lxml import html + +from javsp.config import CrawlerID +from javsp.network.utils import test_connect, choose_one_connectable +from javsp.network.client import get_client + + +async def _get_avsox_urls() -> list[str]: + link = 'https://tellme.pw/avsox' + client = get_client(Url(link)) + resp = await client.get(link) + tree = html.fromstring(resp.text) + urls = tree.xpath('//h4/strong/a/@href') + return urls + + +async def _get_javbus_urls() -> list[str]: + link = 'https://www.javbus.one/' + client = get_client(Url(link)) + resp = await client.get(link) + text = resp.text + urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A) + return urls + + +async def _get_javlib_urls() -> list[str]: + link = 'https://github.com/javlibcom' + client = get_client(Url(link)) + resp = await client.get(link) + tree = html.fromstring(resp.text) + text = tree.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content() + match = re.search(r'[\w\.]+', text, re.A) + if match: + domain = f'https://www.{match.group(0)}.com' + return [domain] + return [] + + +async def _get_javdb_urls() -> list[str]: + root_link = 'https://jav524.app' + client = get_client(Url(root_link)) + resp = await client.get(root_link) + tree = html.fromstring(resp.text) + js_links = tree.xpath("//script[@src]/@src") + for link in js_links: + if '/js/index' in link: + link = root_link + link + resp = await client.get(link) + text = resp.text + match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A) + if match: + return [match.group(1)] + return [] + +proxy_free_fns: Dict[CrawlerID, Callable[[], Coroutine[Any, Any, list[str]]]]= { + CrawlerID.avsox: _get_avsox_urls, + CrawlerID.javdb: _get_javdb_urls, + CrawlerID.javbus: _get_javbus_urls, + CrawlerID.javlib: _get_javlib_urls, +} + +async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None = None) -> str | None: + """获取指定网站的免代理地址 + Args: + site_name (str): 站点名称 + prefer_url (str, optional): 优先测试此url是否可用 + Returns: + str: 指定站点的免代理地址(失败时为空字符串) + """ + if prefer_url and await test_connect(prefer_url, Duration(seconds=5)): + return prefer_url + + if site_name in proxy_free_fns: + try: + urls = await proxy_free_fns[site_name]() + return await choose_one_connectable(urls) + except: + return 
None + else: + raise Exception("Dont't know how to get proxy-free url for " + site_name) + + + +if __name__ == "__main__": + + async def test_main(): + print('javdb:\t', await _get_javdb_urls()) + print('javlib:\t', await _get_javlib_urls()) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/airav.py b/javsp/crawlers/sites/airav.py new file mode 100644 index 000000000..5afd46998 --- /dev/null +++ b/javsp/crawlers/sites/airav.py @@ -0,0 +1,124 @@ +"""从airav抓取数据""" +import re +from html import unescape + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.network.client import get_client +from javsp.network.utils import resolve_site_fallback +from javsp.config import Cfg, CrawlerID +from javsp.datatype import MovieInfo +from javsp.crawlers.interface import Crawler + + +class AiravCrawler(Crawler): + id = CrawlerID.airav + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.airav.wiki') + self.base_url = str(url) + self.client = get_client(url) + self.client.headers['Accept-Language'] = 'zh-TW,zh;q=0.9' + return self + + async def search_movie(self, dvdid: str): + """通过搜索番号获取指定的影片在网站上的ID""" + # 部分影片的ID并不直接等于番号(如012717-360),此时需要尝试通过搜索来寻找影片 + page = 0 + count = 1 + result = [] + while len(result) < count: + url = f'{self.base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}' + response = await self.client.get(url) + resp = response.json() + # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"} + if resp['result']: + result.extend(resp['result']) + count = resp['count'] + page += 1 + else: # 结果为空,结束循环 + break + # 如果什么都没搜索到,直接返回 + if not result: + raise MovieNotFoundError(__name__, dvdid) + # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472') + result.sort(key=lambda x:x['barcode']) + # 从所有搜索结果中选择最可能的番号,返回它的URL + target = dvdid.replace('-', '_') + for item in result: + # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''} + barcode = item['barcode'].replace('-', '_') + if target in barcode: + return item['barcode'] + raise MovieNotFoundError(__name__, dvdid, result) + + + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据 + url = f'{self.base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW' + response = await self.client.get(url) + resp_json = response.json() + # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息 + if resp_json['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid): + barcode = await self.search_movie(movie.dvdid) + if barcode: + url = f'{self.base_url}/api/video/barcode/{barcode}?lng=zh-TW' + response = await self.client.get(url) + resp_json = response.json() + + if resp_json['count'] == 0: + raise MovieNotFoundError(__name__, movie.dvdid, resp_json) + + # 从API返回的数据中提取需要的字段 + # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展 + data = resp_json['result'] + dvdid = data['barcode'] + movie.dvdid = dvdid + movie.url = self.base_url + '/video/' + dvdid + # plot和title中可能含有HTML的转义字符,需要进行解转义处理 + movie.plot = unescape(data['description']) or None + movie.cover = data['img_url'] + # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id + movie.genre = [i['name'] for i in data['tags']] + movie.title = unescape(data['name']) + movie.actress = [i['name'] for i in data['actors']] + movie.publish_date = data['publish_date'] + movie.preview_pics = data['images'] or [] + if data['factories']: + movie.producer = data['factories'][0]['name'] + + if 
Cfg().crawler.hardworking: + # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472') + video_url = f"{self.base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}" + response = await self.client.get(video_url) + resp = response.json() + # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'} + if 'data' in resp: + # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址 + # TODO: 发现部分影片(如080719-976)的传统格式预览片错误 + movie.preview_video = resp['data'].get('url') + + # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确 + for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'): + if movie.title and keyword in movie.title: + movie.title = None + movie.genre = [] + if movie.plot and keyword in movie.plot: + movie.plot = None + movie.genre = [] + if not any([movie.title, movie.plot, movie.genre]): + break + +if __name__ == "__main__": + + async def test_main(): + crawler = await AiravCrawler.create() + movie = MovieInfo("DSAD-938") + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/arzon.py b/javsp/crawlers/sites/arzon.py new file mode 100644 index 000000000..f4887f4d7 --- /dev/null +++ b/javsp/crawlers/sites/arzon.py @@ -0,0 +1,105 @@ +"""从arzon抓取数据""" +import re + +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from javsp.crawlers.exceptions import * +from javsp.datatype import MovieInfo +from lxml import html + +class ArzonCrawler(Crawler): + id = CrawlerID.arzon + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, "https://www.arzon.jp") + self.base_url = str(url) + self.client = get_client(url) + # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F + skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1" + await self.client.get(skip_verify_url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + full_id = movie.dvdid + url = f'{self.base_url}/itemlist.html?t=&m=all&s=&q={full_id}' + # url = f'{base_url}/imagelist.html?q={full_id}' + + r = await self.client.get(url) + if r.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported + data = html.fromstring(r.content) + + urls = data.xpath("//h2/a/@href") + if len(urls) == 0: + raise MovieNotFoundError(__name__, movie.dvdid) + + item_url = self.base_url + urls[0] + e = await self.client.get(item_url) + item = html.fromstring(e.content) + + title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0] + cover = item.xpath("//td[@align='center']//a/img/@src")[0] + item_text = item.xpath("//div[@class='item_text']/text()") + plot = [item.strip() for item in item_text if item.strip() != ''][0] + preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src") + # 使用列表推导式添加 "http:" 并去除 "m_" + preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr] + + container = item.xpath("//div[@class='item_register']/table//tr") + for row in container: + key = row.xpath("./td[1]/text()")[0] + contents = row.xpath("./td[2]//text()") + content = [item.strip() for item in contents if item.strip() != ''] + index = 0 + value = content[index] if content and index < len(content) else None + if key 
== "AV女優:": + movie.actress = content + if key == "AVメーカー:": + movie.producer = value + if key == "AVレーベル:": + video_type = value + if key == "シリーズ:": + movie.serial = value + if key == "監督:": + movie.director = value + if key == "発売日:" and value: + movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") + if key == "収録時間:" and value: + movie.duration = re.search(r'([\d.]+)分', value).group(1) + if key == "品番:": + dvd_id = value + elif key == "タグ:": + genre = value + + genres = '' + if video_type: + genres = [video_type] + if(genre != None): + genres.append(genre) + + movie.genre = genres + movie.url = item_url + movie.title = title + movie.plot = plot + movie.cover = f'https:{cover}' + movie.preview_pics = preview_pics + +if __name__ == "__main__": + + async def test_main(): + crawler = await ArzonCrawler.create() + movie = MovieInfo("CSCT-011") + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/arzon_iv.py b/javsp/crawlers/sites/arzon_iv.py new file mode 100644 index 000000000..a84c97aea --- /dev/null +++ b/javsp/crawlers/sites/arzon_iv.py @@ -0,0 +1,100 @@ +"""从arzon_iv抓取数据""" +import re + + +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from javsp.crawlers.exceptions import * +from javsp.datatype import MovieInfo +from lxml import html + +class ArzonIvCrawler(Crawler): + id = CrawlerID.arzon_iv + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, "https://www.arzon.jp") + self.base_url = str(url) + self.client = get_client(url) + # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F + skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1" + await self.client.get(skip_verify_url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + full_id = movie.dvdid + url = f'{self.base_url}/imagelist.html?q={full_id}' + # url = f'{base_url}/imagelist.html?q={full_id}' + + r = await self.client.get(url) + if r.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported + data = html.fromstring(r.content) + + urls = data.xpath("//h2/a/@href") + if len(urls) == 0: + raise MovieNotFoundError(__name__, movie.dvdid) + + item_url = self.base_url + urls[0] + e = await self.client.get(item_url) + item = html.fromstring(e.content) + + title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0] + cover = item.xpath("//td[@align='center']//a/img/@src")[0] + item_text = item.xpath("//div[@class='item_text']/text()") + plot = [item.strip() for item in item_text if item.strip() != ''][0] + + container = item.xpath("//div[@class='item_register']/table//tr") + for row in container: + key = row.xpath("./td[1]/text()")[0] + contents = row.xpath("./td[2]//text()") + content = [item.strip() for item in contents if item.strip() != ''] + index = 0 + value = content[index] if content and index < len(content) else None + if key == "タレント:": + movie.actress = content + if key == "イメージメーカー:": + movie.producer = value + if key == "イメージレーベル:": + video_type = value + if key == "監督:": + movie.director = value + if key == 
"発売日:" and value: + movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") + if key == "収録時間:" and value: + movie.duration = re.search(r'([\d.]+)分', value).group(1) + if key == "品番:": + dvd_id = value + elif key == "タグ:": + genre = value + + genres = '' + if video_type: + genres = [video_type] + if(genre != None): + genres.append(genre) + + movie.genre = genres + movie.url = item_url + movie.title = title + movie.plot = plot + movie.cover = f'https:{cover}' + +if __name__ == "__main__": + + async def test_main(): + crawler = await ArzonIvCrawler.create() + movie = MovieInfo("KIDM-1137B") + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/avsox.py b/javsp/crawlers/sites/avsox.py new file mode 100644 index 000000000..47b0ea32d --- /dev/null +++ b/javsp/crawlers/sites/avsox.py @@ -0,0 +1,88 @@ +"""从avsox抓取数据""" + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + +class AvsoxCrawler(Crawler): + id = CrawlerID.avsox + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, "https://avsox.click/") + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + full_id: str = movie.dvdid + if full_id.startswith('FC2-'): + full_id = full_id.replace('FC2-', 'FC2-PPV-') + resp = await self.client.get(f'{self.base_url}tw/search/{full_id}') + tree = html.fromstring(resp.text) + tree.make_links_absolute(str(resp.url), resolve_base_href=True) + ids = tree.xpath("//div[@class='photo-info']/span/date[1]/text()") + urls = tree.xpath("//a[contains(@class, 'movie-box')]/@href") + ids_lower = list(map(str.lower, ids)) + if full_id.lower() not in ids_lower: + raise MovieNotFoundError(__name__, movie.dvdid, ids) + + url = urls[ids_lower.index(full_id.lower())] + url = url.replace('/tw/', '/cn/', 1) + + # 提取影片信息 + resp = await self.client.get(url) + # with open('file.html', 'wb') as f: + # f.write(resp.content) + tree = html.fromstring(resp.text) + container = tree.xpath("/html/body/div[@class='container']")[0] + title = container.xpath("h3/text()")[0] + cover = container.xpath("//a[@class='bigImage']/@href")[0] + info = container.xpath("div/div[@class='col-md-3 info']")[0] + dvdid = info.xpath("p/span[@style]/text()")[0] + publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip() + duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip() + producer, serial = None, None + producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a") + if producer_tag: + producer = producer_tag[0].text_content() + serial_tag = info.xpath("p[text()='系列:']") + if serial_tag: + serial = serial_tag[0].getnext().xpath("a/text()")[0] + genre = info.xpath("p/span[@class='genre']/a/text()") + actress = container.xpath("//a[@class='avatar-box']/span/text()") + + movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-') + movie.url = url + movie.title = title.replace(dvdid, '').strip() + movie.cover = cover + movie.publish_date = publish_date + movie.duration = duration + movie.genre = genre + movie.actress = actress + if full_id.startswith('FC2-'): + # 
avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整 + movie.producer = serial + else: + movie.producer = producer + movie.serial = serial + + +if __name__ == "__main__": + + async def test_main(): + crawler = await AvsoxCrawler.create() + movie = MovieInfo("082713-417") + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/avwiki.py b/javsp/crawlers/sites/avwiki.py new file mode 100644 index 000000000..7bc2041e5 --- /dev/null +++ b/javsp/crawlers/sites/avwiki.py @@ -0,0 +1,82 @@ +"""从av-wiki抓取数据""" + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.crawlers.interface import Crawler +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.config import CrawlerID +from lxml import html + +class AvWikiCrawler(Crawler): + id = CrawlerID.avwiki + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://av-wiki.net') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + movie.url = url = f'{self.base_url}/{movie.dvdid}' + + resp = await self.client.get(url) + if resp.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + tree = html.fromstring(resp.content) + + cover_tag = tree.xpath("//header/div/a[@class='image-link-border']/img") + if cover_tag: + try: + srcset = cover_tag[0].get('srcset').split(', ') + src_set_urls = {} + for src in srcset: + url, width = src.split() + width = int(width.rstrip('w')) + src_set_urls[width] = url + max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True) + movie.cover = max_pic[0][1] + except: + movie.cover = cover_tag[0].get('src') + body = tree.xpath("//section[@class='article-body']")[0] + title = body.xpath("div/p/text()")[0] + title = title.replace(f"【{movie.dvdid}】", '') + cite_url = body.xpath("div/cite/a/@href")[0] + cite_url = cite_url.split('?aff=')[0] + info = body.xpath("dl[@class='dltable']")[0] + dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd") + data = {} + for dt_txt, dd in zip(dt_txt_ls, dd_tags): + dt_txt = dt_txt.strip() + a_tag = dd.xpath('a') + if len(a_tag) == 0: + dd_txt = dd.text.strip() + else: + dd_txt = [i.text.strip() for i in a_tag] + if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留 + dd_txt = dd_txt[0] + data[dt_txt] = dd_txt + + ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'} + for key, attr in ATTR_MAP.items(): + setattr(movie, attr, data.get(key)) + movie.title = title + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + + +if __name__ == "__main__": + + async def test_main(): + crawler = await AvWikiCrawler.create() + movie = MovieInfo("259LUXU-593") + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/dl_getchu.py b/javsp/crawlers/sites/dl_getchu.py new file mode 100644 index 000000000..c2ab0814f --- /dev/null +++ b/javsp/crawlers/sites/dl_getchu.py @@ -0,0 +1,131 @@ +"""从dl.getchu官网抓取数据""" +import re +import logging + +from javsp.config import CrawlerID +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.crawlers.interface import Crawler 
+from javsp.network.client import get_client +from javsp.network.utils import resolve_site_fallback +from javsp.crawlers.exceptions import * +from javsp.datatype import MovieInfo +from lxml import html +from lxml.html import HtmlElement + +def get_movie_title(tree: HtmlElement): + container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]") + if len(container) > 0: + container = container[0] + rows = container.xpath('.//tr') + title = '' + for row in rows: + for cell in row.xpath('.//td/div'): + # 获取单元格文本内容 + if cell.text: + title = str(cell.text).strip() + return title + + +def get_movie_img(tree: HtmlElement, getchu_id: str): + img_src = '' + container = tree.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]') + if len(container) > 0: + container = container[0] + img_src = container.get('src') + return img_src + + +def get_movie_preview(tree: HtmlElement, getchu_id: str): + preview_pics = [] + container = tree.xpath(f'//img[contains(@src, "{getchu_id}_")]') + if len(container) > 0: + for c in container: + preview_pics.append(c.get('src')) + return preview_pics + + +DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分') + + +class DlGetchuCrawler(Crawler): + id = CrawlerID.dl_getchu + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://dl.getchu.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # 去除番号中的'GETCHU'字样 + id_uc = movie.dvdid.upper() + if not id_uc.startswith('GETCHU-'): + raise ValueError('Invalid GETCHU number: ' + movie.dvdid) + getchu_id = id_uc.replace('GETCHU-', '') + # 抓取网页 + url = f'{self.base_url}/i/item{getchu_id}' + r = await self.client.get(url) + if r.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + tree = html.fromstring(r.text) + container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]") + if len(container) > 0: + container = container[0] + # 将表格提取为键值对 + rows = container.xpath('.//table/tr') + kv_rows = [i for i in rows if len(i) == 2] + data = {} + for row in kv_rows: + # 获取单元格文本内容 + key = row.xpath("td[@class='bluetext']/text()")[0] + # 是否包含a标签: 有的属性是用表示的,不是text + a_tags = row.xpath("td[2]/a") + if a_tags: + value = [i.text for i in a_tags] + else: + # 获取第2个td标签的内容(下标从1开始计数) + value = row.xpath("td[2]/text()") + data[key] = value + + for key, value in data.items(): + if key == 'サークル': + movie.producer = value[0] + elif key == '作者': + # 暂时没有在getchu找到多个actress的片子 + movie.actress = [i.strip() for i in value] + elif key == '画像数&ページ数': + match = DURATION_PATTERN.search(' '.join(value)) + if match: + movie.duration = match.group(1) + elif key == '配信開始日': + movie.publish_date = value[0].replace('/', '-') + elif key == '趣向': + movie.genre = value + elif key == '作品内容': + idx = -1 + for i, line in enumerate(value): + if line.lstrip().startswith('※'): + idx = i + break + movie.plot = ''.join(value[:idx]) + + movie.title = get_movie_title(tree) + movie.cover = get_movie_img(tree, getchu_id) + movie.preview_pics = get_movie_preview(tree, getchu_id) + movie.dvdid = id_uc + movie.url = url + + +if __name__ == "__main__": + + async def test_main(): + crawler = await DlGetchuCrawler.create() + movie = MovieInfo('getchu-4041026') + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/fanza.py b/javsp/crawlers/sites/fanza.py new file mode 100644 index 000000000..66b895df5 
--- /dev/null +++ b/javsp/crawlers/sites/fanza.py @@ -0,0 +1,246 @@ +"""从fanza抓取数据""" + +import re +import json +import logging +from typing import Dict, List, Tuple + +from httpx import Response + + +from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.config import Cfg +from javsp.datatype import MovieInfo + +from lxml import html +from lxml.html import HtmlElement + +logger = logging.getLogger(__name__) + + +_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1} +_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1} +def sort_search_result(result: List[Dict]): + """排序搜索结果""" + scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result} + sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True) + return sorted_result + + +def resp2html_wrapper(resp: Response) -> HtmlElement: + tree = html.fromstring(resp.text) + if 'not available in your region' in tree.text_content(): + raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置') + elif '/login/' in str(resp.url): + raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP') + return tree + + + + +def parse_anime_page(movie: MovieInfo, tree: HtmlElement): + """解析动画影片的页面布局""" + title = tree.xpath("//h1[@id='title']/text()")[0] + container = tree.xpath("//table[@class='mg-b12']/tr/td")[0] + cover = container.xpath("//img[@name='package-image']/@src")[0] + date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip() + publish_date = date_str.replace('/', '-') + duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()") + if duration_tag: + movie.duration = duration_tag[0].strip().replace('分', '') + serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") + if serial_tag: + movie.serial = serial_tag[0].strip() + producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") + if producer_tag: + movie.producer = producer_tag[0].strip() + genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]") + genre, genre_id = [], [] + for tag in genre_tags: + genre.append(tag.text.strip()) + genre_id.append(tag.get('href').split('=')[-1].strip('/')) + cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() + plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip() + preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy") + score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] + score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 
50 + + movie.cid = cid + movie.title = title + movie.cover = cover + movie.publish_date = publish_date + movie.genre = genre + movie.genre_id = genre_id + movie.plot = plot + movie.score = f'{score/5:.2f}' # 转换为10分制 + movie.preview_pics = preview_pics + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + + +# parse_dvd_page = parse_videoa_page # 118wtktabf067 +# parse_ppr_page = parse_videoa_page +# parse_nikkatsu_page = parse_videoa_page +# parse_doujin_page = parse_anime_page + +class FanzaCrawler(Crawler): + id = CrawlerID.fanza + + async def get_urls_of_cid(self, cid: str) -> Tuple[str, str]: + """搜索cid可能的影片URL""" + r = await self.client.get(f"{self.base_url}search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0") + if r.status_code == 404: + raise MovieNotFoundError(__name__, cid) + r.raise_for_status() + tree = resp2html_wrapper(r) + result = tree.xpath("//ul[@id='list']/li/div/p/a/@href") + parsed_result = {} + for url in result: + items = url.split('/') + type_, cid = None, None + for i, part in enumerate(items): + if part == '-': + product, type_ = items[i-2], items[i-1] + elif part.startswith('cid='): + cid = part[4:] + new_url = '/'.join(i for i in items if not i.startswith('?')) + '/' + parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url}) + break + if cid not in parsed_result: + if len(result) > 0: + logger.debug(f"Unknown URL in search result: " + ', '.join(result)) + raise MovieNotFoundError(__name__, cid) + sorted_result = sort_search_result(parsed_result[cid]) + return sorted_result + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.dmm.co.jp') + self.base_url = str(url) + self.client = get_client(url) + + # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) + self.client.cookies = {'age_check_done': '1'} + self.client.headers['Accept-Language'] = 'ja,en-US;q=0.9' + return self + + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + default_url = f'{self.base_url}digital/videoa/-/detail/=/cid={movie.cid}/' + r0 = await self.client.get(default_url) + if r0.status_code == 404: + urls = await self.get_urls_of_cid(movie.cid) + for d in urls: + func_name = f"parse_{d['type']}_page" + if func_name in globals(): + parse_func = globals()[func_name] + else: + logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}") + continue + r = await self.client.get(d['url']) + tree = resp2html_wrapper(r) + try: + parse_func(movie, tree) + movie.url = d['url'] + break + except: + logger.debug(f"Fail to parse {d['url']}", exc_info=True) + if d is urls[-1]: + logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败") + raise + else: + tree = resp2html_wrapper(r0) + await self.parse_videoa_page(movie, tree) + movie.url = default_url + + async def parse_videoa_page(self, movie: MovieInfo, tree: HtmlElement): + """解析AV影片的页面布局""" + title = tree.xpath("//div[@class='hreview']/h1/text()")[0] + # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来 + container = tree.xpath("//table[@class='mg-b12']/tr/td")[0] + cover = container.xpath("//div[@id='sample-video']/a/@href")[0] + # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083 + date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()") + if date_tag: + movie.publish_date = date_tag[0].strip().replace('/', '-') + duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip() + match = re.search(r'\d+', duration_str) 
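+        # 収録時間字段的文本形如'120分',这里只提取其中的数字作为时长(单位: 分钟)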
+ if match: + movie.duration = match.group(0) + # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况 + actress = container.xpath("//span[@id='performer']/a/text()") + director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()") + if director_tag: + movie.director = director_tag[0].strip() + serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") + if serial_tag: + movie.serial = serial_tag[0].strip() + producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") + if producer_tag: + movie.producer = producer_tag[0].strip() + # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 + # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()") + # if label_tag: + # label = label_tag[0].strip() + # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选 + genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]") + genre, genre_id = [], [] + for tag in genre_tags: + genre.append(tag.text.strip()) + genre_id.append(tag.get('href').split('=')[-1].strip('/')) + cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() + plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip() + preview_pics = container.xpath("//a[@name='sample-image']/img/@src") + score_tag = container.xpath("//p[@class='d-review__average']/strong/text()") + if score_tag: + match = re.search(r'\d+', score_tag[0].strip()) + if match: + score = float(match.group()) * 2 + movie.score = f'{score:.2f}' + else: + score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] + movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50 + + if Cfg().crawler.hardworking: + # 预览视频是动态加载的,不在静态网页中 + video_url = f'{self.base_url}service/digitalapi/-/html5_player/=/cid={movie.cid}' + resp = await self.client.get(video_url) + tree2 = html.fromstring(resp.text) + # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据 + script = tree2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip() + match = re.search(r'\{.*\}', script) + # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配 + try: + data = json.loads(match.group()) + video_url = data.get('src') + if video_url and video_url.startswith('//'): + video_url = 'https:' + video_url + movie.preview_video = video_url + except Exception as e: + logger.debug('解析视频地址时异常: ' + repr(e)) + + movie.cid = cid + movie.title = title + movie.cover = cover + movie.actress = actress + movie.genre = genre + movie.genre_id = genre_id + movie.plot = plot + movie.preview_pics = preview_pics + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + +if __name__ == "__main__": + + async def test_main(): + crawler = await FanzaCrawler.create() + movie = MovieInfo(cid="d_aisoft3356") + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/fc2.py b/javsp/crawlers/sites/fc2.py new file mode 100644 index 000000000..0ce072b90 --- /dev/null +++ b/javsp/crawlers/sites/fc2.py @@ -0,0 +1,120 @@ +"""从FC2官网抓取数据""" +import logging + +from lxml import html + + +from javsp.crawlers.exceptions import * +from javsp.config import Cfg +from javsp.lib import strftime_to_minutes +from javsp.datatype import MovieInfo +from javsp.crawlers.interface import Crawler +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.config 
import CrawlerID + + +logger = logging.getLogger(__name__) + +class Fc2Crawler(Crawler): + id = CrawlerID.fc2 + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://adult.contents.fc2.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def get_movie_score(self, fc2_id: str) -> float | None: + """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None""" + resp = await self.client.get(f'{self.base_url}/article/{fc2_id}/review') + tree = html.fromstring(resp.text) + review_tags = tree.xpath("//ul[@class='items_comment_headerReviewInArea']/li") + reviews = {} + for tag in review_tags: + score = int(tag.xpath("div/span/text()")[0]) + vote = int(tag.xpath("span")[0].text_content()) + reviews[score] = vote + total_votes = sum(reviews.values()) + if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧 + summary = sum([k*v for k, v in reviews.items()]) + final_score = summary / total_votes * 2 # 乘以2转换为10分制 + return final_score + + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # 去除番号中的'FC2'字样 + id_uc = movie.dvdid.upper() + if not id_uc.startswith('FC2-'): + raise ValueError('Invalid FC2 number: ' + movie.dvdid) + fc2_id = id_uc.replace('FC2-', '') + # 抓取网页 + url = f'{self.base_url}/article/{fc2_id}/' + resp = await self.client.get(url) + if '/id.fc2.com/' in str(resp.url): + raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP') + tree = html.fromstring(resp.text) + container = tree.xpath("//div[@class='items_article_left']") + if len(container) > 0: + container = container[0] + else: + raise MovieNotFoundError(__name__, movie.dvdid) + # FC2 标题增加反爬乱码,使用数组合并标题 + title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()") + title = ''.join(title_arr) + thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0] + thumb_pic = thumb_tag.xpath("span/img/@src")[0] + duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0] + # FC2没有制作商和发行商的区分,作为个人市场,影片页面的'by'更接近于制作商 + producer = container.xpath("//li[text()='by ']/a/text()")[0] + genre = container.xpath("//a[@class='tag tagTag']/text()") + date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0] + publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30' + preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href") + + if Cfg().crawler.hardworking: + # 通过评论数据来计算准确的评分 + score = await self.get_movie_score(fc2_id) + if score: + movie.score = f'{score:.2f}' + # 预览视频是动态加载的,不在静态网页中 + desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0] + key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa... 
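+            # 取最后一个'='后的值(即上面示例URL中的ac参数)作为key,用于请求sample预览视频接口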
+ api_url = f'{self.base_url}/api/v2/videos/{fc2_id}/sample?key={key}' + resp = await self.client.get(api_url) + j = resp.json() + movie.preview_video = j['path'] + else: + # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星 + score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0] + score = int(score_tag_attr[-1]) * 2 + movie.score = f'{score:.2f}' + + movie.dvdid = id_uc + movie.url = url + movie.title = title + movie.genre = genre + movie.producer = producer + movie.duration = str(strftime_to_minutes(duration_str)) + movie.publish_date = publish_date + movie.preview_pics = preview_pics + # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 + if movie.preview_pics: + movie.cover = preview_pics[0] + else: + movie.cover = thumb_pic + + +if __name__ == "__main__": + + async def test_main(): + crawler = await Fc2Crawler.create() + movie = MovieInfo("FC2-718323") + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/fc2ppvdb.py b/javsp/crawlers/sites/fc2ppvdb.py new file mode 100644 index 000000000..fbba590c2 --- /dev/null +++ b/javsp/crawlers/sites/fc2ppvdb.py @@ -0,0 +1,92 @@ +"""从FC2PPVDB抓取数据""" + +# BUG: This crawler doesn't work, seemed due to cloudflare + +from typing import List + + +from javsp.crawlers.exceptions import * +from javsp.lib import strftime_to_minutes +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + + +class Fc2PpvDbCrawler(Crawler): + id = CrawlerID.fc2ppvdb + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://fc2ppvdb.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + + def get_list_first(list: List): + return list[0] if list and len(list) > 0 else None + + # 去除番号中的'FC2'字样 + id_uc = movie.dvdid.upper() + if not id_uc.startswith('FC2-'): + raise ValueError('Invalid FC2 number: ' + movie.dvdid) + fc2_id = id_uc.replace('FC2-', '') + # 抓取网页 + url = f'{self.base_url}/articles/{fc2_id}' + resp = await self.client.get(url) + tree = html.fromstring(resp.content) + # html = get_html(url) + container = tree.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]") + if len(container) > 0: + container = container[0] + else: + raise MovieNotFoundError(__name__, movie.dvdid) + + title = container.xpath("//h2/a/text()") + thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src") + duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()") + actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()") + genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()") + publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()") + publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()") + uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()") + uncensored_str_f = get_list_first(uncensored_str); + uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None + preview_pics = None + preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href") + + movie.dvdid = id_uc + movie.url = url + movie.title = get_list_first(title) + 
movie.genre = genre + movie.actress = actress + movie.duration = str(strftime_to_minutes(get_list_first(duration_str))) + movie.publish_date = get_list_first(publish_date) + movie.publisher = get_list_first(publisher) + movie.uncensored = uncensored + movie.preview_pics = preview_pics + movie.preview_video = get_list_first(preview_video) + + # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 + if movie.preview_pics: + movie.cover = preview_pics[0] + else: + movie.cover = get_list_first(thumb_pic) + + +if __name__ == "__main__": + + async def test_main(): + crawler = await Fc2PpvDbCrawler.create() + movie = MovieInfo('FC2-4497837') + await crawler.crawl_and_fill(movie) + print(movie) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/gyutto.py b/javsp/crawlers/sites/gyutto.py new file mode 100644 index 000000000..b30200284 --- /dev/null +++ b/javsp/crawlers/sites/gyutto.py @@ -0,0 +1,106 @@ +"""从https://gyutto.com/官网抓取数据""" +import logging +import time + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html +from lxml.html import HtmlElement + +logger = logging.getLogger(__name__) + +def get_movie_title(tree: HtmlElement) -> str: + container = tree.xpath("//h1") + if len(container) > 0: + container = container[0] + title = container.text + + return title + +def get_movie_img(tree: HtmlElement, index = 1) -> list[str]: + images = [] + container = tree.xpath("//a[@class='highslide']/img") + if len(container) > 0: + if index == 0: + return container[0].get('src') + + for row in container: + images.append(row.get('src')) + + return images + +class GyuttoCrawler(Crawler): + id = CrawlerID.gyutto + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'http://gyutto.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # 去除番号中的'gyutto'字样 + id_uc = movie.dvdid.upper() + if not id_uc.startswith('GYUTTO-'): + raise ValueError('Invalid gyutto number: ' + movie.dvdid) + gyutto_id = id_uc.replace('GYUTTO-', '') + # 抓取网页 + url = f'{self.base_url}/i/item{gyutto_id}?select_uaflag=1' + r = await self.client.get(url) + if r.status_code == 404: + raise MovieNotFoundError(__name__, movie.dvdid) + tree = html.fromstring(r.text) + container = tree.xpath("//dl[@class='BasicInfo clearfix']") + + producer = None + genre = None + date = None + publish_date = None + + for row in container: + key = row.xpath(".//dt/text()") + if key[0] == "サークル": + producer = ''.join(row.xpath(".//dd/a/text()")) + elif key[0] == "ジャンル": + genre = row.xpath(".//dd/a/text()") + elif key[0] == "配信開始日": + date = row.xpath(".//dd/text()") + date_str = ''.join(date) + date_time = time.strptime(date_str, "%Y年%m月%d日") + publish_date = time.strftime("%Y-%m-%d", date_time) + + plot = tree.xpath("//div[@class='unit_DetailLead']/p/text()")[0] + + movie.title = get_movie_title(tree) + movie.cover = get_movie_img(tree, 0) + movie.preview_pics = get_movie_img(tree) + movie.dvdid = id_uc + movie.url = url + movie.producer = producer + # movie.actress = actress + # movie.duration = duration + movie.publish_date = publish_date + movie.genre = genre + movie.plot = plot + + +if __name__ == "__main__": + + async def 
test_main(): + crawler = await GyuttoCrawler.create() + movie = MovieInfo('gyutto-266923') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/jav321.py b/javsp/crawlers/sites/jav321.py new file mode 100644 index 000000000..6a20a98ec --- /dev/null +++ b/javsp/crawlers/sites/jav321.py @@ -0,0 +1,117 @@ +"""从jav321抓取数据""" +import re +import logging + + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + + +logger = logging.getLogger(__name__) + +class Jav321Crawler(Crawler): + id = CrawlerID.jav321 + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.jav321.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + + """解析指定番号的影片数据""" + resp = await self.client.post(f'{self.base_url}/search', data={'sn': movie.dvdid}) + tree = html.fromstring(resp.text) + page_url = tree.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0] + #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542 + cid = page_url.split('/')[-1] # /video/ipx00177 + # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片 + if cid == 'search': + raise MovieNotFoundError(__name__, movie.dvdid) + title = tree.xpath("//div[@class='panel-heading']/h3/text()")[0] + info = tree.xpath("//div[@class='col-md-9']")[0] + # jav321的不同信息字段间没有明显分隔,只能通过url来匹配目标标签 + company_tags = info.xpath("a[contains(@href,'/company/')]/text()") + if company_tags: + movie.producer = company_tags[0] + # actress, actress_pics + # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白 + actress, actress_pics = [], {} + actress_tags = tree.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img") + for tag in actress_tags: + name = tag.tail.strip() + pic_url = tag.get('src') + actress.append(name) + # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url, + # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据 + actress_pics[name] = pic_url + # genre, genre_id + genre_tags = info.xpath("a[contains(@href,'/genre/')]") + genre, genre_id = [], [] + for tag in genre_tags: + genre.append(tag.text) + genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1 + dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper() + publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '') + duration_str = info.xpath("b[text()='収録時間']")[0].tail + match = re.search(r'\d+', duration_str) + if match: + movie.duration = match.group(0) + # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星 + score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original") + if score_tag: + score = int(score_tag[0][5:7])/5 # /10*2 + movie.score = str(score) + serial_tag = info.xpath("a[contains(@href,'/series/')]/text()") + if serial_tag: + movie.serial = serial_tag[0] + preview_video_tag = info.xpath("//video/source/@src") + if preview_video_tag: + movie.preview_video = preview_video_tag[0] + plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()") + if plot_tag: + movie.plot = plot_tag[0] + preview_pics = tree.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src") + if 
len(preview_pics) == 0: + # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL + preview_pics = tree.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src") + # 有的图片链接里有多个//,网站质量堪忧…… + preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics] + # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析 + + movie.url = page_url + movie.cid = cid + movie.dvdid = dvdid + movie.title = title + movie.actress = actress + movie.actress_pics = actress_pics + movie.genre = genre + movie.genre_id = genre_id + movie.publish_date = publish_date + # preview_pics的第一张图始终是封面,剩下的才是预览图 + if len(preview_pics) > 0: + movie.cover = preview_pics[0] + movie.preview_pics = preview_pics[1:] + + +if __name__ == "__main__": + + async def test_main(): + crawler = await Jav321Crawler.create() + movie = MovieInfo('SCUTE-1177') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/javbus.py b/javsp/crawlers/sites/javbus.py new file mode 100644 index 000000000..b3efaa8dd --- /dev/null +++ b/javsp/crawlers/sites/javbus.py @@ -0,0 +1,129 @@ +"""从JavBus抓取数据""" +import logging + + +from javsp.crawlers.exceptions import * +from javsp.func import * +from javsp.config import CrawlerID +from javsp.datatype import MovieInfo, GenreMap + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client + +from javsp.crawlers.interface import Crawler +from lxml import html + + +logger = logging.getLogger(__name__) + +class JavbusCrawler(Crawler): + id = CrawlerID.javbus + genre_map: GenreMap + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.javbus.com') + self.base_url = str(url) + self.client = get_client(url) + self.client.cookies = {'age': 'verified', 'dv': '1'} + self.genre_map = GenreMap('data/genre_javbus.csv') + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + url = f'{self.base_url}/{movie.dvdid}' + resp = await self.client.get(url) + + tree = html.fromstring(resp.content) + # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息 + # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404 + page_title = tree.xpath('/html/head/title/text()') + if page_title and page_title[0].startswith('404 Page Not Found!'): + raise MovieNotFoundError(__name__, movie.dvdid) + + container = tree.xpath("//div[@class='container']")[0] + title = container.xpath("h3/text()")[0] + cover = container.xpath("//a[@class='bigImage']/img/@src")[0] + preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href") + info = container.xpath("//div[@class='col-md-3 info']")[0] + dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text + publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip() + duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip() + director_tag = info.xpath("p/span[text()='導演:']") + if director_tag: # xpath没有匹配时将得到空列表 + movie.director = director_tag[0].getnext().text.strip() + producer_tag = info.xpath("p/span[text()='製作商:']") + if producer_tag: + text = producer_tag[0].getnext().text + if text: + movie.producer = text.strip() + publisher_tag = info.xpath("p/span[text()='發行商:']") + if publisher_tag: + movie.publisher = publisher_tag[0].getnext().text.strip() + serial_tag = 
info.xpath("p/span[text()='系列:']") + if serial_tag: + movie.serial = serial_tag[0].getnext().text + # genre, genre_id + genre_tags = info.xpath("//span[@class='genre']/label/a") + genre, genre_id = [], [] + for tag in genre_tags: + tag_url = tag.get('href') + pre_id = tag_url.split('/')[-1] + genre.append(tag.text) + if 'uncensored' in tag_url: + movie.uncensored = True + genre_id.append('uncensored-' + pre_id) + else: + movie.uncensored = False + genre_id.append(pre_id) + # JavBus的磁力链接是依赖js脚本加载的,无法通过静态网页来解析 + # actress, actress_pics + actress, actress_pics = [], {} + actress_tags = tree.xpath("//a[@class='avatar-box']/div/img") + for tag in actress_tags: + name = tag.get('title') + pic_url = tag.get('src') + actress.append(name) + if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像 + actress_pics[name] = pic_url + # 整理数据并更新movie的相应属性 + movie.url = f'{self.base_url}/{movie.dvdid}' + movie.dvdid = dvdid + movie.title = title.replace(dvdid, '').strip() + movie.cover = cover + movie.preview_pics = preview_pics + if publish_date != '0000-00-00': # 丢弃无效的发布日期 + movie.publish_date = publish_date + movie.duration = duration if int(duration) else None + movie.genre = genre + movie.genre_id = genre_id + movie.actress = actress + movie.actress_pics = actress_pics + + async def crawl_and_fill_cleaned(self, movie: MovieInfo): + """解析指定番号的影片数据并进行清洗""" + await self.crawl_and_fill(movie) + movie.genre_norm = self.genre_map.map(movie.genre_id) + movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换) + + +if __name__ == "__main__": + + async def test_main(): + crawler = await JavbusCrawler.create() + print(crawler.client.headers) + movie = MovieInfo('NANP-030') + # try: + await crawler.crawl_and_fill_cleaned(movie) + print(movie) + # except Exception as e: + # print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/javdb.py b/javsp/crawlers/sites/javdb.py new file mode 100644 index 000000000..ab23e18bd --- /dev/null +++ b/javsp/crawlers/sites/javdb.py @@ -0,0 +1,350 @@ +"""从JavDB抓取数据""" +import os +import re +import logging + +from httpx import Cookies + +from javsp.func import * +from javsp.avid import guess_av_type +from javsp.config import CrawlerID +from javsp.datatype import MovieInfo, GenreMap +from javsp.chromium import get_browsers_cookies + +from javsp.crawlers.exceptions import CredentialError, MovieDuplicateError, MovieNotFoundError, SiteBlocked, SitePermissionError, WebsiteError +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client + +from javsp.crawlers.interface import Crawler +from lxml import html + +logger = logging.getLogger(__name__) + +class JavDbCrawler(Crawler): + id = CrawlerID.javdb + genre_map: GenreMap + cookies_pool: list[Cookies] + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.javdb.com') + self.base_url = str(url) + self.client = get_client(url) + self.client.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5' + self.genre_map = GenreMap('data/genre_javdb.csv') + self.cookies_pool = [] + return self + + async def get_html_wrapper(self, url: str): + """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题""" + r = await self.client.get(url) + if r.status_code == 200: + # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页 + if r.history and '/login' in str(r.url): + # 仅在需要时去读取Cookies + if len(self.cookies_pool) == 0: + try: + self.cookies_pool = get_browsers_cookies() + except (PermissionError, OSError) as e: 
+                    logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True)
+                    self.cookies_pool = []
+                except Exception as e:
+                    logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True)
+                    self.cookies_pool = []
+            if len(self.cookies_pool) > 0:
+                item = self.cookies_pool.pop()
+                # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies
+                self.client.cookies = item['cookies']
+                cookies_source = (item['profile'], item['site'])
+                logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}')
+                return await self.get_html_wrapper(url)
+            else:
+                raise CredentialError('JavDB: 所有浏览器Cookies均已过期')
+        elif r.history and 'pay' in str(r.url).split('/')[-1]:
+            raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'")
+        else:
+            return html.fromstring(r.text)
+    elif r.status_code in (403, 503):
+        tree = html.fromstring(r.text)
+        code_tag = tree.xpath("//span[@class='code-label']/span")
+        error_code = code_tag[0].text if code_tag else None
+        if error_code:
+            if error_code == '1020':
+                block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器'
+            else:
+                block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})'
+        else:
+            block_msg = f'JavDB: {r.status_code} 禁止访问: {url}'
+        raise SiteBlocked(block_msg)
+    else:
+        raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}')
+
+
+    async def get_user_info(self, site: str, cookies: Cookies):
+        """获取cookies对应的JavDB用户信息"""
+        try:
+            self.client.cookies = cookies
+            resp = await self.client.get(f'https://{site}/users/profile')
+            html_str = resp.text
+            tree = html.fromstring(html_str)
+        except Exception as e:
+            logger.info('JavDB: 获取用户信息时出错')
+            logger.debug(e, exc_info=True)
+            return
+        # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点
+        if 'JavDB' in html_str:
+            email = tree.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip()
+            username = tree.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip()
+            return email, username
+        else:
+            logger.debug('JavDB: 域名已过期: ' + site)
+
+
+    async def get_valid_cookies(self):
+        """扫描浏览器,获取一个可用的Cookies"""
+        # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用
+        for d in self.cookies_pool:
+            info = await self.get_user_info(d['site'], d['cookies'])
+            if info:
+                return d['cookies']
+            else:
+                logger.debug(f"{d['profile']}, {d['site']}: Cookies无效")
+
+
+    async def crawl_and_fill(self, movie: MovieInfo) -> None:
+        """从网页抓取并解析指定番号的数据
+        Args:
+            movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+        """
+        # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个
+        tree = await self.get_html_wrapper(f'{self.base_url}/search?q={movie.dvdid}')
+        ids = list(map(str.lower, tree.xpath("//div[@class='video-title']/strong/text()")))
+        movie_urls = tree.xpath("//a[@class='box']/@href")
+        match_count = len([i for i in ids if i == movie.dvdid.lower()])
+        if match_count == 0:
+            raise MovieNotFoundError(__name__, movie.dvdid, ids)
+        elif match_count == 1:
+            index = ids.index(movie.dvdid.lower())
+            new_url = movie_urls[index]
+            try:
+                html2 = await self.get_html_wrapper(new_url)
+            except (SitePermissionError, CredentialError):
+                # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面
+                box = tree.xpath("//a[@class='box']")[index]
+                movie.url = new_url
+                movie.title = box.get('title')
+                movie.cover = box.xpath("div/img/@src")[0]
+                score_str = box.xpath("div[@class='score']/span/span")[0].tail
+                score = re.search(r'([\d.]+)分', score_str).group(1)
+                movie.score = "{:.2f}".format(float(score)*2)
+                movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip()
+                return
+        else:
+            raise MovieDuplicateError(__name__, movie.dvdid, match_count)
+
+        container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0]
+        info = container.xpath("//nav[@class='panel movie-panel-info']")[0]
+        title = container.xpath("h2/strong[@class='current-title']/text()")[0]
+        show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]")
+        if show_orig_title:
+            movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0]
+        cover = container.xpath("//img[@class='video-cover']/@src")[0]
+        preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href")
+        preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src")
+        if preview_video_tag:
+            preview_video = preview_video_tag[0]
+            if preview_video.startswith('//'):
+                preview_video = 'https:' + preview_video
+            movie.preview_video = preview_video
+        dvdid = info.xpath("div/span")[0].text_content()
+        publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
+        duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip()
+        director_tag = info.xpath("div/strong[text()='導演:']")
+        if director_tag:
+            movie.director = director_tag[0].getnext().text_content().strip()
+        av_type = guess_av_type(movie.dvdid)
+        if av_type != 'fc2':
+            producer_tag = info.xpath("div/strong[text()='片商:']")
+        else:
+            producer_tag = info.xpath("div/strong[text()='賣家:']")
+        if producer_tag:
+            movie.producer = producer_tag[0].getnext().text_content().strip()
+        publisher_tag = info.xpath("div/strong[text()='發行:']")
+        if publisher_tag:
+            movie.publisher = publisher_tag[0].getnext().text_content().strip()
+        serial_tag = info.xpath("div/strong[text()='系列:']")
+        if serial_tag:
+            movie.serial = serial_tag[0].getnext().text_content().strip()
+        score_tag = info.xpath("//span[@class='score-stars']")
+        if score_tag:
+            score_str = score_tag[0].tail
+            score = re.search(r'([\d.]+)分', score_str).group(1)
+            movie.score = "{:.2f}".format(float(score)*2)
+        genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
+        genre, genre_id = [], []
+        for tag in genre_tags:
+            pre_id = tag.get('href').split('/')[-1]
+            genre.append(tag.text)
+            genre_id.append(pre_id)
+            # 判定影片有码/无码
+            subsite = pre_id.split('?')[0]
+            movie.uncensored = {'uncensored': True, 'tags': False}.get(subsite)
+        # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优
+        actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
+        all_actors = actors_tag.xpath("a/text()")
+        genders = actors_tag.xpath("strong/text()")
+        actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀']
+        magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href")
+
+        movie.dvdid = dvdid
+        movie.url = new_url
+        movie.title = title.replace(dvdid, '').strip()
+        movie.cover = cover
+        movie.preview_pics = preview_pics
+        movie.publish_date = publish_date
+        movie.duration = duration
+        movie.genre = genre
+        movie.genre_id = genre_id
+        movie.actress = actress
+        movie.magnet = [i.replace('[javdb.com]','') for i in magnet]
+
+
+    async def crawl_and_fill_cleaned(self, movie: MovieInfo):
+        """解析指定番号的影片数据并进行清洗"""
+        try:
+            await self.crawl_and_fill(movie)
+            # 检查封面URL是否真的存在对应图片
+            if movie.cover is not None:
+                r = await self.client.head(movie.cover)
+                if r.status_code != 200:
+                    movie.cover = None
+        except SiteBlocked:
+            logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试')
+            raise
+        if movie.genre_id and (not 
movie.genre_id[0].startswith('fc2?')): + movie.genre_norm = self.genre_map.map(movie.genre_id) + movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换) + + + async def collect_actress_alias(self, type=0, use_original=True): + """ + 收集女优的别名 + type: 0-有码, 1-无码, 2-欧美 + use_original: 是否使用原名而非译名,True-田中レモン,False-田中檸檬 + """ + import json + import time + import random + + actressAliasMap = {} + + actressAliasFilePath = "data/actress_alias.json" + # 检查文件是否存在 + if not os.path.exists(actressAliasFilePath): + # 如果文件不存在,创建文件并写入空字典 + with open(actressAliasFilePath, "w", encoding="utf-8") as file: + json.dump({}, file) + + typeList = ["censored", "uncensored", "western"] + page_url = f"{self.base_url}/actors/{typeList[type]}" + while True: + try: + tree = await self.get_html_wrapper(page_url) + actors = tree.xpath("//div[@class='box actor-box']/a") + + count = 0 + for actor in actors: + count += 1 + actor_name = actor.xpath("strong/text()")[0].strip() + actor_url = actor.xpath("@href")[0] + # actor_url = f"https://javdb.com{actor_url}" # 构造演员主页的完整URL + + # 进入演员主页,获取更多信息 + actor_html = await self.get_html_wrapper(actor_url) + # 解析演员所有名字信息 + names_span = actor_html.xpath("//span[@class='actor-section-name']")[0] + aliases_span_list = actor_html.xpath("//span[@class='section-meta']") + aliases_span = aliases_span_list[0] + + names_list = [name.strip() for name in names_span.text.split(",")] + if len(aliases_span_list) > 1: + aliases_list = [ + alias.strip() for alias in aliases_span.text.split(",") + ] + else: + aliases_list = [] + + # 将信息添加到actressAliasMap中 + actressAliasMap[names_list[-1 if use_original else 0]] = ( + names_list + aliases_list + ) + print( + f"{count} --- {names_list[-1 if use_original else 0]}: {names_list + aliases_list}" + ) + + if count == 10: + # 将数据写回文件 + with open(actressAliasFilePath, "r", encoding="utf-8") as file: + existing_data = json.load(file) + + # 合并现有数据和新爬取的数据 + existing_data.update(actressAliasMap) + + # 将合并后的数据写回文件 + with open(actressAliasFilePath, "w", encoding="utf-8") as file: + json.dump(existing_data, file, ensure_ascii=False, indent=2) + + actressAliasMap = {} # 重置actressAliasMap + + print( + f"已爬取 {count} 个女优,数据已更新并写回文件:", + actressAliasFilePath, + ) + + # 重置计数器 + count = 0 + + time.sleep(max(1, 10 * random.random())) # 随机等待 1-10 秒 + + # 判断是否有下一页按钮 + next_page_link = tree.xpath( + "//a[@rel='next' and @class='pagination-next']/@href" + ) + if not next_page_link: + break # 没有下一页,结束循环 + else: + next_page_url = f"{next_page_link[0]}" + page_url = next_page_url + + except SiteBlocked: + raise + + with open(actressAliasFilePath, "r", encoding="utf-8") as file: + existing_data = json.load(file) + + # 合并现有数据和新爬取的数据 + existing_data.update(actressAliasMap) + + # 将合并后的数据写回文件 + with open(actressAliasFilePath, "w", encoding="utf-8") as file: + json.dump(existing_data, file, ensure_ascii=False, indent=2) + + print(f"已爬取 {count} 个女优,数据已更新并写回文件:", actressAliasFilePath) + + +if __name__ == "__main__": + + async def test_main(): + crawler = await JavDbCrawler.create() + movie = MovieInfo('FC2-2735981') + try: + await crawler.crawl_and_fill_cleaned(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/javlib.py b/javsp/crawlers/sites/javlib.py new file mode 100644 index 000000000..c71a5f336 --- /dev/null +++ b/javsp/crawlers/sites/javlib.py @@ -0,0 +1,115 @@ +"""从JavLibrary抓取数据""" +import logging +from urllib.parse import urlsplit + +from httpx._transports import base + +from 
javsp.crawlers.exceptions import MovieDuplicateError, MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+logger = logging.getLogger(__name__)
+
+class JavLibCrawler(Crawler):
+    id = CrawlerID.javlib
+
+    @classmethod
+    async def create(cls):
+        self = cls()
+        url = await resolve_site_fallback(self.id, 'https://www.javlibrary.com')
+        self.base_url = str(url)
+        self.client = get_client(url)
+        return self
+
+    # TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换
+    async def crawl_and_fill(self, movie: MovieInfo) -> None:
+        """解析指定番号的影片数据"""
+        url = new_url = f'{self.base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
+        resp = await self.client.get(url)
+        tree = html.fromstring(resp.text)
+        if resp.history and urlsplit(str(resp.url)).netloc == urlsplit(self.base_url).netloc:
+            # 出现301重定向且新老地址netloc相同时,说明搜索到了影片且只有一个结果
+            new_url = resp.url
+        else:   # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果
+            video_tags = tree.xpath("//div[@class='video'][@id]/a")
+            # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果
+            pre_choose = []
+            for tag in video_tags:
+                tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
+                if tag_dvdid.upper() == movie.dvdid.upper():
+                    pre_choose.append(tag)
+            pre_choose_urls = [i.get('href') for i in pre_choose]
+            match_count = len(pre_choose)
+            if match_count == 0:
+                raise MovieNotFoundError(__name__, movie.dvdid)
+            elif match_count == 1:
+                new_url = pre_choose_urls[0]
+            elif match_count == 2:
+                no_blueray = []
+                for tag in pre_choose:
+                    if 'ブルーレイディスク' not in tag.get('title'):    # Blu-ray Disc
+                        no_blueray.append(tag)
+                no_blueray_count = len(no_blueray)
+                if no_blueray_count == 1:
+                    new_url = no_blueray[0].get('href')
+                    logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
+                else:
+                    # 两个结果中没有谁是蓝光影片,说明影片番号重复了
+                    raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
+            else:
+                # 存在不同影片但是番号相同的情况,如MIDV-010
+                raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
+        # 重新抓取网页
+        resp = await self.client.get(new_url)
+        tree = html.fromstring(resp.text)
+        container = tree.xpath("/html/body/div/div[@id='rightcolumn']")[0]
+        title_tag = container.xpath("div/h3/a/text()")
+        title = title_tag[0]
+        cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
+        info = container.xpath("//div[@id='video_info']")[0]
+        dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
+        publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
+        duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
+        director_tag = info.xpath("//span[@class='director']/a/text()")
+        if director_tag:
+            movie.director = director_tag[0]
+        producer = info.xpath("//span[@class='maker']/a/text()")[0]
+        publisher_tag = info.xpath("//span[@class='label']/a/text()")
+        if publisher_tag:
+            movie.publisher = publisher_tag[0]
+        score_tag = info.xpath("//span[@class='score']/text()")
+        if score_tag:
+            movie.score = score_tag[0].strip('()')
+        genre = info.xpath("//span[@class='genre']/a/text()")
+        actress = info.xpath("//span[@class='star']/a/text()")
+
+        movie.dvdid = dvdid
+        movie.url = new_url
+        movie.title = title.replace(dvdid, '').strip()
+        if cover.startswith('//'):  # 补全URL中缺少的协议段
+            cover = 'https:' + cover
+        movie.cover = cover
+        movie.publish_date = publish_date
+        movie.duration = duration
+        
movie.producer = producer + movie.genre = genre + movie.actress = actress + + +if __name__ == "__main__": + + async def test_main(): + crawler = await JavLibCrawler.create() + movie = MovieInfo('IPX-177') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/javmenu.py b/javsp/crawlers/sites/javmenu.py new file mode 100644 index 000000000..6553d86a1 --- /dev/null +++ b/javsp/crawlers/sites/javmenu.py @@ -0,0 +1,100 @@ +"""从JavMenu抓取数据""" +import logging + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + +logger = logging.getLogger(__name__) + +class JavMenuCrawler(Crawler): + id = CrawlerID.javmenu + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.javmenu.com') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + # JavMenu网页做得很不走心,将就了 + url = f'{self.base_url}zh/{movie.dvdid}' + r = await self.client.get(url) + if r.history: + # 被重定向到主页说明找不到影片资源 + raise MovieNotFoundError(__name__, movie.dvdid) + + tree = html.fromstring(r.text) + container = tree.xpath("//div[@class='col-md-9 px-0']")[0] + title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0] + # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站 + title = title.replace(' | JAV目錄大全 | 每日更新', '') + title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '') + cover_tag = container.xpath("//div[@class='single-video']") + if len(cover_tag) > 0: + video_tag = cover_tag[0].find('video') + # URL首尾竟然也有空格…… + movie.cover = video_tag.get('data-poster').strip() + # 预览影片改为blob了,无法获取 + # movie.preview_video = video_tag.find('source').get('src').strip() + else: + cover_img_tag = container.xpath("//img[@class='lazy rounded']/@data-src") + if cover_img_tag: + movie.cover = cover_img_tag[0].strip() + info = container.xpath("//div[@class='card-body']")[0] + publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text + duration = info.xpath("div/span[contains(text(), '时长:')]")[0].getnext().text.replace('分钟', '') + producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()") + if producer: + movie.producer = producer[0] + genre_tags = info.xpath("//a[@class='genre']") + genre, genre_id = [], [] + for tag in genre_tags: + items = tag.get('href').split('/') + pre_id = items[-3] + '/' + items[-1] + genre.append(tag.text.strip()) + genre_id.append(pre_id) + # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠…… + actress = info.xpath("div/span[contains(text(), '女优:')]/following-sibling::*/a/text()") or None + magnet_table = container.xpath("//table[contains(@class, 'magnet-table')]/tbody") + if magnet_table: + magnet_links = magnet_table[0].xpath("tr/td/a/@href") + # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以 + movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links] + preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href") + + if (not movie.cover) and preview_pics: + movie.cover = preview_pics[0] + movie.url = url + movie.title = title.replace(movie.dvdid, '').strip() + 
movie.preview_pics = preview_pics + movie.publish_date = publish_date + movie.duration = duration + movie.genre = genre + movie.genre_id = genre_id + movie.actress = actress + + +if __name__ == "__main__": + + async def test_main(): + crawler = await JavMenuCrawler.create() + movie = MovieInfo('FC2-718323') + # try: + await crawler.crawl_and_fill(movie) + print(movie) + # except Exception as e: + # print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/mgstage.py b/javsp/crawlers/sites/mgstage.py new file mode 100644 index 000000000..bd9d76840 --- /dev/null +++ b/javsp/crawlers/sites/mgstage.py @@ -0,0 +1,127 @@ +"""从蚊香社-mgstage抓取数据""" +import re +import logging + + +from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import Cfg, CrawlerID +from lxml import html + + +logger = logging.getLogger(__name__) + +class MgstageCrawler(Crawler): + id = CrawlerID.mgstage + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.mgstage.com') + self.base_url = str(url) + self.client = get_client(url) + # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) + self.client.cookies = {'adc': '1'} + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + url = f'{self.base_url}/product/product_detail/{movie.dvdid}/' + resp = await self.client.get(url) + if resp.status_code == 403: + raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理') + # url不存在时会被重定向至主页。history非空时说明发生了重定向 + elif resp.history: + raise MovieNotFoundError(__name__, movie.dvdid) + + tree = html.fromstring(resp.text) + # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除 + title = tree.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip() + container = tree.xpath("//div[@class='detail_left']")[0] + cover = container.xpath("//a[@id='EnlargeImage']/@href")[0] + # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表 + actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()") + actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()") + actress = [i.strip() for i in actress_text + actress_link] + actress = [i for i in actress if i] # 移除空字符串 + producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip() + duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0] + match = re.search(r'\d+', duration_str) + if match: + movie.duration = match.group(0) + dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0] + date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0] + publish_date = date_str.replace('/', '-') + serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()") + if serial_tag: + movie.serial = serial_tag[0].strip() + # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 + # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip() + genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a") + genre = [i.text.strip() for i in genre_tags] + score_str = container.xpath("//td[@class='review']/span")[0].tail.strip() + match = re.search(r'^[\.\d]+', score_str) + if match: + score = float(match.group()) * 2 + movie.score = f'{score:.2f}' + # 
plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签 + plots = [] + plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]") + for p in plot_p_tags: + children = p.getchildren() + # 没有children时表明plot不含有格式,此时简单地提取文本就可以 + if not children: + plots.append(p.text_content()) + continue + for child in children: + if child.tag == 'br' and plots[-1] != '\n': + plots.append('\n') + else: + if child.text: + plots.append(child.text) + if child.tail: + plots.append(child.tail) + plot = ''.join(plots).strip() + preview_pics = container.xpath("//a[@class='sample_image']/@href") + + if Cfg().crawler.hardworking: + # 预览视频是点击按钮后再加载的,不在静态网页中 + btn_url = container.xpath("//a[@class='button_sample']/@href")[0] + video_pid = btn_url.split('/')[-1] + req_url = f'{self.base_url}/sampleplayer/sampleRespons.php?pid={video_pid}' + resp = await self.client.get(req_url) + j = resp.json() + video_url = j.get('url') + if video_url: + # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX + preview_video = video_url.split('.ism/')[0] + '.mp4' + movie.preview_video = preview_video + + movie.dvdid = dvdid + movie.url = url + movie.title = title + movie.cover = cover + movie.actress = actress + movie.producer = producer + movie.publish_date = publish_date + movie.genre = genre + movie.plot = plot + movie.preview_pics = preview_pics + movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 + + +if __name__ == "__main__": + async def test_main(): + crawler = await MgstageCrawler.create() + movie = MovieInfo('ABF-153') + # try: + await crawler.crawl_and_fill(movie) + print(movie) + # except Exception as e: + # print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/njav.py b/javsp/crawlers/sites/njav.py new file mode 100644 index 000000000..5787397c9 --- /dev/null +++ b/javsp/crawlers/sites/njav.py @@ -0,0 +1,150 @@ +"""从NJAV抓取数据""" +import re +import logging +from typing import List + +from javsp.crawlers.exceptions import MovieNotFoundError +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from javsp.lib import strftime_to_minutes +from lxml import html + + +logger = logging.getLogger(__name__) + +def get_list_first(list: List): + return list[0] if list and len(list) > 0 else None + +class NjavCrawler(Crawler): + id = CrawlerID.njav + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.njav.tv/') + self.base_url = str(url) + self.client = get_client(url) + return self + + async def search_video(self, movie: MovieInfo) -> str: + id_uc = movie.dvdid + # 抓取网页 + url = f'{self.base_url}ja/search?keyword={id_uc}' + resp = await self.client.get(url) + tree = html.fromstring(resp.text) + list = tree.xpath("//div[@class='box-item']/div[@class='detail']/a") + video_url = None + for item in list: + search_title = item.xpath("text()")[0] + if id_uc in search_title: + video_url = item.xpath("@href") + break + if id_uc.startswith("FC2-"): + fc2id = id_uc.replace('FC2-', '') + if "FC2" in search_title and fc2id in search_title: + video_url = item.xpath("@href") + break + + return get_list_first(video_url) + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """解析指定番号的影片数据""" + # 抓取网页 + url = await self.search_video(movie) + url = self.base_url + "ja/" + url + if not url: + raise MovieNotFoundError(__name__, movie.dvdid) + resp = 
await self.client.get(url) + tree = html.fromstring(resp.text) + container = tree.xpath("//div[@class='container']/div/div[@class='col']") + if len(container) > 0: + container = container[0] + else: + raise MovieNotFoundError(__name__, movie.dvdid) + + title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0] + thumb_pic = container.xpath("//div[@id='player']/@data-poster") + plot = " ".join(container.xpath("//div[@class='description']/p/text()")) + magnet = container.xpath("//div[@class='magnet']/a/@href") + real_id = None + publish_date = None + duration_str = None + uncensored = None + preview_pics = None + preview_video = None + serial = None + publisher = None + producer = None + genre = [] + actress = [] + + for item in container.xpath("//div[@class='detail-item']/div"): + item_title = item.xpath('span/text()')[0] + if "タグ:" in item_title: + genre += item.xpath("span")[1].xpath("a/text()") + elif "ジャンル:" in item_title: + genre += item.xpath("span")[1].xpath("a/text()") + elif "レーベル:" in item_title: + genre += item.xpath("span")[1].xpath("a/text()") + elif "女優:" in item_title: + actress = item.xpath("span")[1].xpath("a/text()") + elif "シリーズ:" in item_title: + serial = get_list_first(item.xpath("span")[1].xpath("a/text()")) + elif "メーカー:" in item_title: + producer = get_list_first(item.xpath("span")[1].xpath("a/text()")) + elif "コード:" in item_title: + real_id = get_list_first(item.xpath("span")[1].xpath("text()")) + elif "公開日:" in item_title: + publish_date = get_list_first(item.xpath("span")[1].xpath("text()")) + elif "再生時間:" in item_title: + duration_str = get_list_first(item.xpath("span")[1].xpath("text()")) + + # 清除标题里的番号字符 + keywords = [real_id, " "] + if movie.dvdid.startswith("FC2"): + keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]] + for keyword in keywords: + title = re.sub(re.escape(keyword), "", title, flags=re.I) + + # 判断是否无码 + uncensored_arr = magnet + [title] + for uncensored_str in uncensored_arr: + if 'uncensored' in uncensored_str.lower(): + uncensored = True + + movie.url = url + movie.title = title + movie.genre = genre + movie.actress = actress + movie.duration = str(strftime_to_minutes(duration_str)) + movie.publish_date = publish_date + movie.publisher = publisher + movie.producer = producer + movie.uncensored = uncensored + movie.preview_pics = preview_pics + movie.preview_video = preview_video + movie.plot = plot + movie.serial = serial + movie.magnet = magnet + + # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 + if movie.preview_pics: + movie.cover = preview_pics[0] + else: + movie.cover = get_list_first(thumb_pic) + +if __name__ == "__main__": + + async def test_main(): + crawler = await NjavCrawler.create() + movie = MovieInfo('012023_002') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/crawlers/sites/prestige.py b/javsp/crawlers/sites/prestige.py new file mode 100644 index 000000000..bc0734554 --- /dev/null +++ b/javsp/crawlers/sites/prestige.py @@ -0,0 +1,101 @@ +"""从蚊香社-prestige抓取数据""" +import re +import logging + + + +from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked +from javsp.datatype import MovieInfo +from javsp.network.utils import resolve_site_fallback +from javsp.network.client import get_client +from javsp.crawlers.interface import Crawler +from javsp.config import CrawlerID +from lxml import html + + +logger = logging.getLogger(__name__) + 
+ +class PrestigeCrawler(Crawler): + id = CrawlerID.prestige + + @classmethod + async def create(cls): + self = cls() + url = await resolve_site_fallback(self.id, 'https://www.prestige-av.com') + self.base_url = str(url) + self.client = get_client(url) + # prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面 + # (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取) + self.client.cookies = {'__age_auth__': 'true'} + return self + + async def crawl_and_fill(self, movie: MovieInfo) -> None: + """从网页抓取并解析指定番号的数据 + Args: + movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 + """ + url = f'{self.base_url}/goods/goods_detail.php?sku={movie.dvdid}' + resp = await self.client.get(url) + if resp.status_code == 500: + # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试 + raise MovieNotFoundError(__name__, movie.dvdid) + elif resp.status_code == 403: + raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理') + resp.raise_for_status() + tree = html.fromstring(resp.text) + container_tags = tree.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']") + if not container_tags: + raise MovieNotFoundError(__name__, movie.dvdid) + + container = container_tags[0] + title = container.xpath("h1/span")[0].tail.strip() + cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0] + cover = cover.split('?')[0] + actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()") + # 移除女优名中的空格,使女优名与其他网站保持一致 + actress = [i.strip().replace(' ', '') for i in actress] + duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content() + match = re.search(r'\d+', duration_str) + if match: + movie.duration = match.group(0) + date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0] + publish_date = date_url.split('?date=')[-1] + producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip() + dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0] + genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a") + genre = [tag.text.strip() for tag in genre_tags] + serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip() + plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip() + preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src") + preview_pics = [i.split('?')[0] for i in preview_pics] + + # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效 + movie.url = url + movie.dvdid = dvdid + movie.title = title + movie.cover = cover + movie.actress = actress + movie.publish_date = publish_date + movie.producer = producer + movie.genre = genre + movie.serial = serial + movie.plot = plot + movie.preview_pics = preview_pics + movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片 + + + +if __name__ == "__main__": + + async def test_main(): + crawler = await PrestigeCrawler.create() + movie = MovieInfo('ABP-647') + try: + await crawler.crawl_and_fill(movie) + print(movie) + except Exception as e: + print(repr(e)) + + import asyncio + asyncio.run(test_main()) diff --git a/javsp/func.py b/javsp/func.py index 042afea5c..6232747fd 100644 --- a/javsp/func.py +++ b/javsp/func.py @@ -16,6 +16,8 @@ from pathlib import Path import importlib.metadata as meta +from pydantic_core import Url + # 判断系统是否可以使用tk USE_GUI = True try: @@ -23,7 +25,7 @@ except ImportError: USE_GUI = False -from javsp.web.base import * +from javsp.network.utils import get_client, url_download from javsp.lib import re_escape, resource_path @@ 
-150,7 +152,7 @@ def split_by_punc(s): return ls -def check_update(allow_check=True, auto_update=True): +async def check_update(allow_check=True, auto_update=True): """检查版本更新""" def print_header(title, info=[]): @@ -181,7 +183,9 @@ def print_header(title, info=[]): release_url = 'https://github.com/Yuukiy/JavSP/releases/latest' print('正在检查更新...', end='') try: - data = request_get(api_url, timeout=3).json() + client = get_client(Url(api_url)) + resp = await client.get(api_url) + data = resp.json() latest_version = data['tag_name'] release_time = utc2local(data['published_at']) release_date = release_time.isoformat().split('T')[0] @@ -233,7 +237,7 @@ def print_header(title, info=[]): if auto_update: try: logger.info('尝试自动更新到新版本: ' + latest_version + " (按'Ctrl+C'取消)") - download_update(data) + await download_update(data) except KeyboardInterrupt: logger.info('用户取消更新') except Exception as e: @@ -243,7 +247,7 @@ def print_header(title, info=[]): print() # 输出空行,作为新旧程序的分隔 -def download_update(rel_info): +async def download_update(rel_info): """下载版本更新 Args: @@ -253,7 +257,8 @@ def download_update(rel_info): down_url = rel_info['assets'][0]['browser_download_url'] asset_name = rel_info['assets'][0]['name'] desc = '下载更新' if shutil.get_terminal_size().columns < 120 else '下载更新: '+asset_name - download(down_url, asset_name, desc=desc) + await url_download(Url(down_url), asset_name, desc=desc) + # download(down_url, asset_name, desc=desc) if os.path.exists(asset_name): # 备份原有的程序 basepath, ext = os.path.splitext(sys.executable) @@ -270,8 +275,3 @@ def download_update(rel_info): p.wait() p.terminate() sys.exit(0) - - -if __name__ == "__main__": - setattr(sys, 'javsp_version', 'v0') - check_update() diff --git a/javsp/network/client.py b/javsp/network/client.py new file mode 100644 index 000000000..33232b677 --- /dev/null +++ b/javsp/network/client.py @@ -0,0 +1,45 @@ +"""网络请求的统一接口""" + +from typing import Dict +from pydantic_core import Url + +from httpx import AsyncClient, AsyncHTTPTransport + +from javsp.config import Cfg + +default_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36' +} + +def get_proxy(unproxied: bool): + if Cfg().network.proxy_server is None or unproxied: + return None + else: + return str(Cfg().network.proxy_server) + +client_dictionary: Dict[str, AsyncClient] = {} +def get_client(url: Url) -> AsyncClient: + if url.host is None: + raise Exception(f"Unknown url {url}") + else: + index = url.host + if index in client_dictionary: + return client_dictionary[index] + else: + unproxied = url.host in Cfg().network.unproxied + + transport = AsyncHTTPTransport( + proxy=get_proxy(unproxied), + retries=Cfg().network.retries) + + client = AsyncClient( + transport=transport, + # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效 + headers=default_headers.copy(), + timeout=Cfg().network.timeout.total_seconds(), + follow_redirects=True, + ) + + client_dictionary[index] = client + + return client diff --git a/javsp/network/utils.py b/javsp/network/utils.py new file mode 100644 index 000000000..34caf68da --- /dev/null +++ b/javsp/network/utils.py @@ -0,0 +1,105 @@ +from datetime import timedelta +import logging +import time +from tqdm.asyncio import tqdm +from typing import Any, Coroutine, NamedTuple +import aiofiles +from pretty_errors import os +from pydantic.types import ByteSize +from pydantic_core import Url + +from pydantic_extra_types.pendulum_dt import Duration + +from javsp.config 
import Cfg, CrawlerID
+from javsp.network.client import get_client
+
+import asyncio
+
+logger = logging.getLogger(__name__)
+
+class DownloadInfo(NamedTuple):
+    size: ByteSize
+    elapsed: timedelta
+
+    def get_rate(self) -> float:
+        """get rate of this download, unit: Mbps"""
+        return self.size.to("mbit") / self.elapsed.total_seconds()
+
+async def url_download(url: Url, target_path: str, desc: str | None = None) -> DownloadInfo:
+    url_str = str(url)
+
+    if url.scheme == 'file':
+        path: str = url.path
+        start_time: float = time.time()
+        async with aiofiles.open(path, "rb") as src:
+            async with aiofiles.open(target_path, "wb") as dest:
+                await dest.write(await src.read())
+        filesize = os.path.getsize(path)
+        elapsed = time.time() - start_time
+        return DownloadInfo(ByteSize(filesize), Duration(seconds=elapsed))
+
+    if not desc:
+        desc = url_str.split('/')[-1]
+
+    client = get_client(url)
+
+    # REF: https://www.python-httpx.org/advanced/clients/#monitoring-download-progress
+    async with aiofiles.open(target_path, 'wb') as download_file:
+        # NOTE: Create a client for each request for now, need further refactor
+        async with client.stream("GET", url_str) as response:
+            total = int(response.headers["Content-Length"])
+
+            with tqdm(total=total, desc=desc, unit_scale=True, unit_divisor=1024, unit="B") as progress:
+                num_bytes_downloaded = response.num_bytes_downloaded
+                # AsyncClient的流式响应需要使用aiter_bytes异步迭代
+                async for chunk in response.aiter_bytes():
+                    await download_file.write(chunk)
+                    progress.update(response.num_bytes_downloaded - num_bytes_downloaded)
+                    num_bytes_downloaded = response.num_bytes_downloaded
+
+    return DownloadInfo(ByteSize(response.num_bytes_downloaded), response.elapsed)
+
+async def test_connect(url_str: str, timeout: Duration) -> bool:
+    """测试与指定url的连接,不使用映射,但使用代理"""
+    try:
+        client = get_client(Url(url_str))
+        response = \
+            await client.get(
+                url_str,
+                timeout=timeout.total_seconds(),
+            )
+        return response.status_code == 200
+    except Exception as e:
+        logger.debug(f"Not connectable: {url_str}\n" + repr(e))
+        return False
+
+async def choose_one_connectable(urls: list[str]) -> str | None:
+    co_connectables: list[Coroutine[Any, Any, bool]] = []
+    for url in urls:
+        co_connectables.append(test_connect(url, Duration(seconds=3)))
+
+    connectables = await asyncio.gather(*co_connectables)
+    for i, connectable in enumerate(connectables):
+        if connectable:
+            return urls[i]
+    return None
+
+async def resolve_site_fallback(cr_id: CrawlerID, default: str) -> Url:
+    if cr_id not in Cfg().network.fallback:
+        return Url(default)
+
+    fallbacks = Cfg().network.fallback[cr_id]
+    chosen = await choose_one_connectable(fallbacks)
+    if chosen is None:
+        return Url(default)
+    else:
+        return Url(chosen)
+
+
+if __name__ == '__main__':
+    async def aentry():
+        print(await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com']))
+
+    # async def aentry():
+    #     print(await test_connect("https://www.y78k.com/", Duration(seconds=3)))
+
+    asyncio.run(aentry())
diff --git a/javsp/web/translate.py b/javsp/translate.py
similarity index 94%
rename from javsp/web/translate.py
rename to javsp/translate.py
index 2e762cb15..1f202209a 100644
--- a/javsp/web/translate.py
+++ b/javsp/translate.py
@@ -6,7 +6,7 @@
 import random
 import logging
 from pydantic_core import Url
-import requests
+import httpx
 from hashlib import md5
@@ -15,7 +15,7 @@
 from javsp.config import BaiduTranslateEngine, BingTranslateEngine, Cfg, ClaudeTranslateEngine, GoogleTranslateEngine, OpenAITranslateEngine, TranslateEngine
 from javsp.datatype import MovieInfo
-from javsp.web.base 
import read_proxy +from javsp.network.client import get_proxy logger = logging.getLogger(__name__) @@ -49,13 +49,7 @@ def translate_movie_info(info: MovieInfo): return False return True -def translate(texts, engine: Union[ - BaiduTranslateEngine, - BingTranslateEngine, - ClaudeTranslateEngine, - OpenAITranslateEngine, - None - ], actress=[]): +def translate(texts, engine: TranslateEngine, actress=[]): """ 翻译入口:对错误进行处理并且统一返回格式 @@ -146,7 +140,7 @@ def baidu_translate(texts, app_id, api_key, to='zh'): wait = 1.0 - (now - last_access) if wait > 0: time.sleep(wait) - r = requests.post(api_url, params=payload, headers=headers) + r = httpx.post(api_url, params=payload, headers=headers) result = r.json() baidu_translate._last_access = time.perf_counter() return result @@ -163,7 +157,7 @@ def bing_translate(texts, api_key, to='zh-Hans'): 'X-ClientTraceId': str(uuid.uuid4()) } body = [{'text': texts}] - r = requests.post(api_url, params=params, headers=headers, json=body) + r = httpx.post(api_url, params=params, headers=headers, json=body) result = r.json() return result @@ -175,12 +169,12 @@ def google_trans(texts, to='zh_CN'): # client参数的选择: https://github.com/lmk123/crx-selection-translate/issues/223#issue-184432017 global _google_trans_wait url = f"https://translate.google.com.hk/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={to}&q={texts}" - proxies = read_proxy() - r = requests.get(url, proxies=proxies) + proxies = get_proxy(False) + r = httpx.get(url, proxies=proxies) while r.status_code == 429: logger.warning(f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试") time.sleep(_google_trans_wait) - r = requests.get(url, proxies=proxies) + r = httpx.get(url, proxies=proxies) if r.status_code == 429: _google_trans_wait += random.randint(60, 90) if r.status_code == 200: @@ -204,7 +198,7 @@ def claude_translate(texts, api_key, to="zh_CN"): "max_tokens": 1024, "messages": [{"role": "user", "content": texts}], } - r = requests.post(api_url, headers=headers, json=data) + r = httpx.post(api_url, headers=headers, json=data) if r.status_code == 200: result = r.json().get("content", [{}])[0].get("text", "").strip() else: @@ -236,7 +230,7 @@ def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"): "temperature": 0, "max_tokens": 1024, } - r = requests.post(api_url, headers=headers, json=data) + r = httpx.post(api_url, headers=headers, json=data) if r.status_code == 200: if 'error' in r.json(): result = { diff --git a/javsp/web/airav.py b/javsp/web/airav.py deleted file mode 100644 index 22e9fdbf7..000000000 --- a/javsp/web/airav.py +++ /dev/null @@ -1,118 +0,0 @@ -"""从airav抓取数据""" -import re -import logging -from html import unescape - - -from javsp.web.base import Request -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.datatype import MovieInfo - -# 初始化Request实例 -request = Request(use_scraper=True) -request.headers['Accept-Language'] = 'zh-TW,zh;q=0.9' -# 近期airav服务器似乎不稳定,时好时坏,单次查询平均在17秒左右,timeout时间增加到20秒 -request.timeout = 20 - - -logger = logging.getLogger(__name__) -base_url = 'https://www.airav.wiki' - - -def search_movie(dvdid): - """通过搜索番号获取指定的影片在网站上的ID""" - # 部分影片的ID并不直接等于番号(如012717-360),此时需要尝试通过搜索来寻找影片 - page = 0 - count = 1 - result = [] - while len(result) < count: - url = f'{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}' - r = request.get(url).json() - # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"} - if r['result']: - result.extend(r['result']) - count = 
r['count'] - page += 1 - else: # 结果为空,结束循环 - break - # 如果什么都没搜索到,直接返回 - if not result: - raise MovieNotFoundError(__name__, dvdid) - # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472') - result.sort(key=lambda x:x['barcode']) - # 从所有搜索结果中选择最可能的番号,返回它的URL - target = dvdid.replace('-', '_') - for item in result: - # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''} - barcode = item['barcode'].replace('-', '_') - if target in barcode: - return item['barcode'] - raise MovieNotFoundError(__name__, dvdid, result) - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据 - url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW' - resp = request.get(url).json() - # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息 - if resp['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid): - barcode = search_movie(movie.dvdid) - if barcode: - url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW' - resp = request.get(url).json() - if resp['count'] == 0: - raise MovieNotFoundError(__name__, movie.dvdid, resp) - - # 从API返回的数据中提取需要的字段 - # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展 - data = resp['result'] - dvdid = data['barcode'] - movie.dvdid = dvdid - movie.url = base_url + '/video/' + dvdid - # plot和title中可能含有HTML的转义字符,需要进行解转义处理 - movie.plot = unescape(data['description']) or None - movie.cover = data['img_url'] - # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id - movie.genre = [i['name'] for i in data['tags']] - movie.title = unescape(data['name']) - movie.actress = [i['name'] for i in data['actors']] - movie.publish_date = data['publish_date'] - movie.preview_pics = data['images'] or [] - if data['factories']: - movie.producer = data['factories'][0]['name'] - - if Cfg().crawler.hardworking: - # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472') - video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}" - resp = request.get(video_url).json() - # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'} - if 'data' in resp: - # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址 - # TODO: 发现部分影片(如080719-976)的传统格式预览片错误 - movie.preview_video = resp['data'].get('url') - - # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确 - for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'): - if movie.title and keyword in movie.title: - movie.title = None - movie.genre = [] - if movie.plot and keyword in movie.plot: - movie.plot = None - movie.genre = [] - if not any([movie.title, movie.plot, movie.genre]): - break - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('DSAD-938') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/arzon.py b/javsp/web/arzon.py deleted file mode 100644 index 433949018..000000000 --- a/javsp/web/arzon.py +++ /dev/null @@ -1,100 +0,0 @@ -"""从arzon抓取数据""" -import os -import sys -import logging -import re - -from javsp.web.base import request_get -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo -import requests -from lxml import html - -logger = logging.getLogger(__name__) -base_url = "https://www.arzon.jp" - -def get_cookie(): - # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F - skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1" - session = 
requests.Session() - session.get(skip_verify_url, timeout=(12, 7)) - return session.cookies.get_dict() - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - full_id = movie.dvdid - cookies = get_cookie() - url = f'{base_url}/itemlist.html?t=&m=all&s=&q={full_id}' - # url = f'{base_url}/imagelist.html?q={full_id}' - r = request_get(url, cookies, delay_raise=True) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported - data = html.fromstring(r.content) - - urls = data.xpath("//h2/a/@href") - if len(urls) == 0: - raise MovieNotFoundError(__name__, movie.dvdid) - - item_url = base_url + urls[0] - e = request_get(item_url, cookies, delay_raise=True) - item = html.fromstring(e.content) - - title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0] - cover = item.xpath("//td[@align='center']//a/img/@src")[0] - item_text = item.xpath("//div[@class='item_text']/text()") - plot = [item.strip() for item in item_text if item.strip() != ''][0] - preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src") - # 使用列表推导式添加 "http:" 并去除 "m_" - preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr] - - container = item.xpath("//div[@class='item_register']/table//tr") - for row in container: - key = row.xpath("./td[1]/text()")[0] - contents = row.xpath("./td[2]//text()") - content = [item.strip() for item in contents if item.strip() != ''] - index = 0 - value = content[index] if content and index < len(content) else None - if key == "AV女優:": - movie.actress = content - if key == "AVメーカー:": - movie.producer = value - if key == "AVレーベル:": - video_type = value - if key == "シリーズ:": - movie.serial = value - if key == "監督:": - movie.director = value - if key == "発売日:" and value: - movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") - if key == "収録時間:" and value: - movie.duration = re.search(r'([\d.]+)分', value).group(1) - if key == "品番:": - dvd_id = value - elif key == "タグ:": - genre = value - - genres = '' - if video_type: - genres = [video_type] - if(genre != None): - genres.append(genre) - - movie.genre = genres - movie.url = item_url - movie.title = title - movie.plot = plot - movie.cover = f'https:{cover}' - movie.preview_pics = preview_pics - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('csct-011') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/arzon_iv.py b/javsp/web/arzon_iv.py deleted file mode 100644 index 3ea7a322f..000000000 --- a/javsp/web/arzon_iv.py +++ /dev/null @@ -1,93 +0,0 @@ -"""从arzon抓取数据""" -import os -import sys -import logging -import re - -from javsp.web.base import request_get -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo -import requests -from lxml import html - -logger = logging.getLogger(__name__) -base_url = "https://www.arzon.jp" - -def get_cookie(): - # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F - skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1" - session = requests.Session() - session.get(skip_verify_url, timeout=(12, 7)) - return session.cookies.get_dict() - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - full_id = movie.dvdid 
- cookies = get_cookie() - url = f'{base_url}/imagelist.html?q={full_id}' - r = request_get(url, cookies, delay_raise=True) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported - data = html.fromstring(r.content) - - urls = data.xpath("//h2/a/@href") - if len(urls) == 0: - raise MovieNotFoundError(__name__, movie.dvdid) - - item_url = base_url + urls[0] - e = request_get(item_url, cookies, delay_raise=True) - item = html.fromstring(e.content) - - title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0] - cover = item.xpath("//td[@align='center']//a/img/@src")[0] - item_text = item.xpath("//div[@class='item_text']/text()") - plot = [item.strip() for item in item_text if item.strip() != ''][0] - - container = item.xpath("//div[@class='item_register']/table//tr") - for row in container: - key = row.xpath("./td[1]/text()")[0] - contents = row.xpath("./td[2]//text()") - content = [item.strip() for item in contents if item.strip() != ''] - index = 0 - value = content[index] if content and index < len(content) else None - if key == "タレント:": - movie.actress = content - if key == "イメージメーカー:": - movie.producer = value - if key == "イメージレーベル:": - video_type = value - if key == "監督:": - movie.director = value - if key == "発売日:" and value: - movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-") - if key == "収録時間:" and value: - movie.duration = re.search(r'([\d.]+)分', value).group(1) - if key == "品番:": - dvd_id = value - elif key == "タグ:": - genre = value - - genres = '' - if video_type: - genres = [video_type] - if(genre != None): - genres.append(genre) - - movie.genre = genres - movie.url = item_url - movie.title = title - movie.plot = plot - movie.cover = f'https:{cover}' - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('KIDM-1137B') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/avsox.py b/javsp/web/avsox.py deleted file mode 100644 index ea96d6cc3..000000000 --- a/javsp/web/avsox.py +++ /dev/null @@ -1,75 +0,0 @@ -"""从avsox抓取数据""" -import logging - -from javsp.web.base import get_html -from javsp.web.exceptions import * -from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = str(Cfg().network.proxy_free[CrawlerID.avsox]) - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # avsox无法直接跳转到影片的网页,因此先搜索再从搜索结果中寻找目标网页 - full_id = movie.dvdid - if full_id.startswith('FC2-'): - full_id = full_id.replace('FC2-', 'FC2-PPV-') - html = get_html(f'{base_url}tw/search/{full_id}') - ids = html.xpath("//div[@class='photo-info']/span/date[1]/text()") - urls = html.xpath("//a[contains(@class, 'movie-box')]/@href") - ids_lower = list(map(str.lower, ids)) - if full_id.lower() in ids_lower: - url = urls[ids_lower.index(full_id.lower())] - url = url.replace('/tw/', '/cn/', 1) - else: - raise MovieNotFoundError(__name__, movie.dvdid, ids) - - # 提取影片信息 - html = get_html(url) - container = html.xpath("/html/body/div[@class='container']")[0] - title = container.xpath("h3/text()")[0] - cover = container.xpath("//a[@class='bigImage']/@href")[0] - info = container.xpath("div/div[@class='col-md-3 info']")[0] - dvdid = info.xpath("p/span[@style]/text()")[0] - publish_date = 
info.xpath("p/span[text()='发行时间:']")[0].tail.strip() - duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip() - producer, serial = None, None - producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a") - if producer_tag: - producer = producer_tag[0].text_content() - serial_tag = info.xpath("p[text()='系列:']") - if serial_tag: - serial = serial_tag[0].getnext().xpath("a/text()")[0] - genre = info.xpath("p/span[@class='genre']/a/text()") - actress = container.xpath("//a[@class='avatar-box']/span/text()") - - movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-') - movie.url = url - movie.title = title.replace(dvdid, '').strip() - movie.cover = cover - movie.publish_date = publish_date - movie.duration = duration - movie.genre = genre - movie.actress = actress - if full_id.startswith('FC2-'): - # avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整 - movie.producer = serial - else: - movie.producer = producer - movie.serial = serial - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('082713-417') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/avwiki.py b/javsp/web/avwiki.py deleted file mode 100644 index fbd4ecbb3..000000000 --- a/javsp/web/avwiki.py +++ /dev/null @@ -1,72 +0,0 @@ -"""从av-wiki抓取数据""" -import logging - - -from javsp.web.base import * -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - -logger = logging.getLogger(__name__) -base_url = 'https://av-wiki.net' - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - movie.url = url = f'{base_url}/{movie.dvdid}' - resp = request_get(url, delay_raise=True) - if resp.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - html = resp2html(resp) - - cover_tag = html.xpath("//header/div/a[@class='image-link-border']/img") - if cover_tag: - try: - srcset = cover_tag[0].get('srcset').split(', ') - src_set_urls = {} - for src in srcset: - url, width = src.split() - width = int(width.rstrip('w')) - src_set_urls[width] = url - max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True) - movie.cover = max_pic[0][1] - except: - movie.cover = cover_tag[0].get('src') - body = html.xpath("//section[@class='article-body']")[0] - title = body.xpath("div/p/text()")[0] - title = title.replace(f"【{movie.dvdid}】", '') - cite_url = body.xpath("div/cite/a/@href")[0] - cite_url = cite_url.split('?aff=')[0] - info = body.xpath("dl[@class='dltable']")[0] - dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd") - data = {} - for dt_txt, dd in zip(dt_txt_ls, dd_tags): - dt_txt = dt_txt.strip() - a_tag = dd.xpath('a') - if len(a_tag) == 0: - dd_txt = dd.text.strip() - else: - dd_txt = [i.text.strip() for i in a_tag] - if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留 - dd_txt = dd_txt[0] - data[dt_txt] = dd_txt - - ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'} - for key, attr in ATTR_MAP.items(): - setattr(movie, attr, data.get(key)) - movie.title = title - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - - movie = MovieInfo('259LUXU-593') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - 
logger.error(e, exc_info=1) diff --git a/javsp/web/base.py b/javsp/web/base.py deleted file mode 100644 index 717b5168a..000000000 --- a/javsp/web/base.py +++ /dev/null @@ -1,270 +0,0 @@ -"""网络请求的统一接口""" -import os -import sys -import time -import shutil -import logging -import requests -import contextlib -import cloudscraper -import lxml.html -from tqdm import tqdm -from lxml import etree -from lxml.html.clean import Cleaner -from requests.models import Response - - -from javsp.config import Cfg -from javsp.web.exceptions import * - - -__all__ = ['Request', 'get_html', 'post_html', 'request_get', 'resp2html', 'is_connectable', 'download', 'get_resp_text', 'read_proxy'] - - -headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'} - -logger = logging.getLogger(__name__) -# 删除js脚本相关的tag,避免网页检测到没有js运行环境时强行跳转,影响调试 -cleaner = Cleaner(kill_tags=['script', 'noscript']) - -def read_proxy(): - if Cfg().network.proxy_server is None: - return {} - else: - proxy = str(Cfg().network.proxy_server) - return {'http': proxy, 'https': proxy} - -# 与网络请求相关的功能汇总到一个模块中以方便处理,但是不同站点的抓取器又有自己的需求(针对不同网站 -# 需要使用不同的UA、语言等)。每次都传递参数很麻烦,而且会面临函数参数越加越多的问题。因此添加这个 -# 处理网络请求的类,它带有默认的属性,但是也可以在各个抓取器模块里进行进行定制 -class Request(): - """作为网络请求出口并支持各个模块定制功能""" - def __init__(self, use_scraper=False) -> None: - # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效 - self.headers = headers.copy() - self.cookies = {} - - self.proxies = read_proxy() - self.timeout = Cfg().network.timeout.total_seconds() - if not use_scraper: - self.scraper = None - self.__get = requests.get - self.__post = requests.post - self.__head = requests.head - else: - self.scraper = cloudscraper.create_scraper() - self.__get = self._scraper_monitor(self.scraper.get) - self.__post = self._scraper_monitor(self.scraper.post) - self.__head = self._scraper_monitor(self.scraper.head) - - def _scraper_monitor(self, func): - """监控cloudscraper的工作状态,遇到不支持的Challenge时尝试退回常规的requests请求""" - def wrapper(*args, **kw): - try: - return func(*args, **kw) - except Exception as e: - logger.debug(f"无法通过CloudFlare检测: '{e}', 尝试退回常规的requests请求") - if func == self.scraper.get: - return requests.get(*args, **kw) - else: - return requests.post(*args, **kw) - return wrapper - - def get(self, url, delay_raise=False): - r = self.__get(url, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) - if not delay_raise: - r.raise_for_status() - return r - - def post(self, url, data, delay_raise=False): - r = self.__post(url, - data=data, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) - if not delay_raise: - r.raise_for_status() - return r - - def head(self, url, delay_raise=True): - r = self.__head(url, - headers=self.headers, - proxies=self.proxies, - cookies=self.cookies, - timeout=self.timeout) - if not delay_raise: - r.raise_for_status() - return r - - def get_html(self, url): - r = self.get(url) - html = resp2html(r) - return html - - -class DownloadProgressBar(tqdm): - def update_to(self, b=1, bsize=1, tsize=None): - if tsize is not None: - self.total = tsize - self.update(b * bsize - self.n) - - -def request_get(url, cookies={}, timeout=None, delay_raise=False): - """获取指定url的原始请求""" - if timeout is None: - timeout = Cfg().network.timeout.seconds - - r = requests.get(url, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout) - if not delay_raise: - if r.status_code == 403 and b'>Just 
a moment...<' in r.content: - raise SiteBlocked(f"403 Forbidden: 无法通过CloudFlare检测: {url}") - else: - r.raise_for_status() - return r - - -def request_post(url, data, cookies={}, timeout=None, delay_raise=False): - """向指定url发送post请求""" - if timeout is None: - timeout = Cfg().network.timeout.seconds - r = requests.post(url, data=data, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout) - if not delay_raise: - r.raise_for_status() - return r - - -def get_resp_text(resp: Response, encoding=None): - """提取Response的文本""" - if encoding: - resp.encoding = encoding - else: - resp.encoding = resp.apparent_encoding - return resp.text - - -def get_html(url, encoding='utf-8'): - """使用get方法访问指定网页并返回经lxml解析后的document""" - resp = request_get(url) - text = get_resp_text(resp, encoding=encoding) - html = lxml.html.fromstring(text) - html.make_links_absolute(url, resolve_base_href=True) - # 清理功能仅应在需要的时候用来调试网页(如prestige),否则可能反过来影响调试(如JavBus) - # html = cleaner.clean_html(html) - if hasattr(sys, 'javsp_debug_mode'): - lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug - return html - - -def resp2html(resp, encoding='utf-8') -> lxml.html.HtmlComment: - """将request返回的response转换为经lxml解析后的document""" - text = get_resp_text(resp, encoding=encoding) - html = lxml.html.fromstring(text) - html.make_links_absolute(resp.url, resolve_base_href=True) - # html = cleaner.clean_html(html) - if hasattr(sys, 'javsp_debug_mode'): - lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug - return html - - -def post_html(url, data, encoding='utf-8', cookies={}): - """使用post方法访问指定网页并返回经lxml解析后的document""" - resp = request_post(url, data, cookies=cookies) - text = get_resp_text(resp, encoding=encoding) - html = lxml.html.fromstring(text) - # jav321提供ed2k形式的资源链接,其中的非ASCII字符可能导致转换失败,因此要先进行处理 - ed2k_tags = html.xpath("//a[starts-with(@href,'ed2k://')]") - for tag in ed2k_tags: - tag.attrib['ed2k'], tag.attrib['href'] = tag.attrib['href'], '' - html.make_links_absolute(url, resolve_base_href=True) - for tag in ed2k_tags: - tag.attrib['href'] = tag.attrib['ed2k'] - tag.attrib.pop('ed2k') - # html = cleaner.clean_html(html) - # lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug - return html - - -def dump_xpath_node(node, filename=None): - """将xpath节点dump到文件""" - if not filename: - filename = node.tag + '.html' - with open(filename, 'wt', encoding='utf-8') as f: - content = etree.tostring(node, pretty_print=True).decode('utf-8') - f.write(content) - - -def is_connectable(url, timeout=3): - """测试与指定url的连接""" - try: - r = requests.get(url, headers=headers, timeout=timeout) - return True - except requests.exceptions.RequestException as e: - logger.debug(f"Not connectable: {url}\n" + repr(e)) - return False - - -def urlretrieve(url, filename=None, reporthook=None, headers=None): - if "arzon" in url: - headers["Referer"] = "https://www.arzon.jp/" - """使用requests实现urlretrieve""" - # https://blog.csdn.net/qq_38282706/article/details/80253447 - with contextlib.closing(requests.get(url, headers=headers, - proxies=read_proxy(), stream=True)) as r: - header = r.headers - with open(filename, 'wb+') as fp: - bs = 1024 - size = -1 - blocknum = 0 - if "content-length" in header: - size = int(header["Content-Length"]) # 文件总大小(理论值) - if reporthook: # 写入前运行一次回调函数 - reporthook(blocknum, bs, size) - for chunk in r.iter_content(chunk_size=1024): - if chunk: - fp.write(chunk) - fp.flush() - blocknum += 1 - if reporthook: - reporthook(blocknum, bs, size) # 每写入一次运行一次回调函数 - - 
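The synchronous urlretrieve() above and the download() wrapper that follows are removed by this PR along with the rest of javsp/web/base.py. As a rough illustration only, below is a minimal sketch of what an asyncio-based, progress-reporting download helper could look like, assuming httpx and tqdm; the name url_download_sketch, the Referer heuristic, and the returned stats dict mirror the deleted code but are not the project's actual replacement implementation, which may differ.

import time
from typing import Optional

import httpx
from tqdm import tqdm

async def url_download_sketch(url: str, output_path: str, desc: Optional[str] = None) -> dict:
    """Stream `url` to `output_path` with a tqdm progress bar; return transfer stats."""
    start = time.time()
    # Mirror the deleted download(): send the site root as Referer for hosts that require it
    headers = {'Referer': url[:url.find('/', 8) + 1]}
    async with httpx.AsyncClient(follow_redirects=True, headers=headers) as client:
        async with client.stream('GET', url) as resp:
            resp.raise_for_status()
            total = int(resp.headers.get('Content-Length', 0)) or None
            with open(output_path, 'wb') as f, tqdm(
                    total=total, unit='B', unit_scale=True,
                    desc=desc or url.split('/')[-1], leave=False) as bar:
                async for chunk in resp.aiter_bytes(chunk_size=1024):
                    f.write(chunk)           # write each streamed chunk to disk
                    bar.update(len(chunk))   # advance the progress bar by bytes received
    elapsed = time.time() - start
    size = total if total is not None else bar.n
    # Same shape as the info dict returned by the deleted download()
    return {'total': size, 'elapsed': elapsed, 'rate': size / elapsed if elapsed else 0.0}

# Usage (illustrative): asyncio.run(url_download_sketch('https://www.javbus.com/pics/cover/6n54_b.jpg', 'cover.jpg'))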
-def download(url, output_path, desc=None): - """下载指定url的资源""" - # 支持“下载”本地资源,以供fc2fan的本地镜像所使用 - if not url.startswith('http'): - start_time = time.time() - shutil.copyfile(url, output_path) - filesize = os.path.getsize(url) - elapsed = time.time() - start_time - info = {'total': filesize, 'elapsed': elapsed, 'rate': filesize/elapsed} - return info - if not desc: - desc = url.split('/')[-1] - referrer = headers.copy() - referrer['referer'] = url[:url.find('/', 8)+1] # 提取base_url部分 - with DownloadProgressBar(unit='B', unit_scale=True, - miniters=1, desc=desc, leave=False) as t: - urlretrieve(url, filename=output_path, reporthook=t.update_to, headers=referrer) - info = {k: t.format_dict[k] for k in ('total', 'elapsed', 'rate')} - return info - - -def open_in_chrome(url, new=0, autoraise=True): - """使用指定的Chrome Profile打开url,便于调试""" - import subprocess - chrome = R'C:\Program Files\Google\Chrome\Application\chrome.exe' - subprocess.run(f'"{chrome}" --profile-directory="Profile 2" {url}', shell=True) - -import webbrowser -webbrowser.open = open_in_chrome - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - download('https://www.javbus.com/pics/cover/6n54_b.jpg', 'cover.jpg') diff --git a/javsp/web/dl_getchu.py b/javsp/web/dl_getchu.py deleted file mode 100644 index 15267f1f7..000000000 --- a/javsp/web/dl_getchu.py +++ /dev/null @@ -1,122 +0,0 @@ -"""从dl.getchu官网抓取数据""" -import re -import logging - -from javsp.web.base import resp2html, request_get -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - -logger = logging.getLogger(__name__) - -# https://dl.getchu.com/i/item4045373 -base_url = 'https://dl.getchu.com' -# dl.getchu用utf-8会乱码 -base_encode = 'euc-jp' - - -def get_movie_title(html): - container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]") - if len(container) > 0: - container = container[0] - rows = container.xpath('.//tr') - title = '' - for row in rows: - for cell in row.xpath('.//td/div'): - # 获取单元格文本内容 - if cell.text: - title = str(cell.text).strip() - return title - - -def get_movie_img(html, getchu_id): - img_src = '' - container = html.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]') - if len(container) > 0: - container = container[0] - img_src = container.get('src') - return img_src - - -def get_movie_preview(html, getchu_id): - preview_pics = [] - container = html.xpath(f'//img[contains(@src, "{getchu_id}_")]') - if len(container) > 0: - for c in container: - preview_pics.append(c.get('src')) - return preview_pics - - -DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分') -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 去除番号中的'GETCHU'字样 - id_uc = movie.dvdid.upper() - if not id_uc.startswith('GETCHU-'): - raise ValueError('Invalid GETCHU number: ' + movie.dvdid) - getchu_id = id_uc.replace('GETCHU-', '') - # 抓取网页 - url = f'{base_url}/i/item{getchu_id}' - r = request_get(url, delay_raise=True) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - html = resp2html(r, base_encode) - container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]") - if len(container) > 0: - container = container[0] - # 将表格提取为键值对 - rows = container.xpath('.//table/tr') - kv_rows = [i for i in rows if len(i) == 2] - data = {} - for row in kv_rows: - # 获取单元格文本内容 - key = row.xpath("td[@class='bluetext']/text()")[0] - # 是否包含a标签: 有的属性是用表示的,不是text - a_tags = row.xpath("td[2]/a") - if a_tags: - value = [i.text for i in a_tags] - else: - # 
获取第2个td标签的内容(下标从1开始计数) - value = row.xpath("td[2]/text()") - data[key] = value - - for key, value in data.items(): - if key == 'サークル': - movie.producer = value[0] - elif key == '作者': - # 暂时没有在getchu找到多个actress的片子 - movie.actress = [i.strip() for i in value] - elif key == '画像数&ページ数': - match = DURATION_PATTERN.search(' '.join(value)) - if match: - movie.duration = match.group(1) - elif key == '配信開始日': - movie.publish_date = value[0].replace('/', '-') - elif key == '趣向': - movie.genre = value - elif key == '作品内容': - idx = -1 - for i, line in enumerate(value): - if line.lstrip().startswith('※'): - idx = i - break - movie.plot = ''.join(value[:idx]) - - movie.title = get_movie_title(html) - movie.cover = get_movie_img(html, getchu_id) - movie.preview_pics = get_movie_preview(html, getchu_id) - movie.dvdid = id_uc - movie.url = url - - -if __name__ == "__main__": - import pretty_errors - - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('getchu-4041026') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/fanza.py b/javsp/web/fanza.py deleted file mode 100644 index e975c4c8f..000000000 --- a/javsp/web/fanza.py +++ /dev/null @@ -1,231 +0,0 @@ -"""从fanza抓取数据""" -import os -import re -import sys -import json -import logging -from typing import Dict, List, Tuple - - -from javsp.web.base import Request, resp2html -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://www.dmm.co.jp' -# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) -request = Request() -request.cookies = {'age_check_done': '1'} -request.headers['Accept-Language'] = 'ja,en-US;q=0.9' - - -_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1} -_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1} -def sort_search_result(result: List[Dict]): - """排序搜索结果""" - scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result} - sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True) - return sorted_result - - -def get_urls_of_cid(cid: str) -> Tuple[str, str]: - """搜索cid可能的影片URL""" - r = request.get(f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0") - if r.status_code == 404: - raise MovieNotFoundError(__name__, cid) - r.raise_for_status() - html = resp2html_wrapper(r) - result = html.xpath("//ul[@id='list']/li/div/p/a/@href") - parsed_result = {} - for url in result: - items = url.split('/') - type_, cid = None, None - for i, part in enumerate(items): - if part == '-': - product, type_ = items[i-2], items[i-1] - elif part.startswith('cid='): - cid = part[4:] - new_url = '/'.join(i for i in items if not i.startswith('?')) + '/' - parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url}) - break - if cid not in parsed_result: - if len(result) > 0: - logger.debug(f"Unknown URL in search result: " + ', '.join(result)) - raise MovieNotFoundError(__name__, cid) - sorted_result = sort_search_result(parsed_result[cid]) - return sorted_result - - -def resp2html_wrapper(resp): - html = resp2html(resp) - if 'not available in your region' in html.text_content(): - raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置') - elif '/login/' in resp.url: - raise 
SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP') - return html - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - default_url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/' - r0 = request.get(default_url, delay_raise=True) - if r0.status_code == 404: - urls = get_urls_of_cid(movie.cid) - for d in urls: - func_name = f"parse_{d['type']}_page" - if func_name in globals(): - parse_func = globals()[func_name] - else: - logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}") - continue - r = request.get(d['url']) - html = resp2html_wrapper(r) - try: - parse_func(movie, html) - movie.url = d['url'] - break - except: - logger.debug(f"Fail to parse {d['url']}", exc_info=True) - if d is urls[-1]: - logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败") - raise - else: - html = resp2html_wrapper(r0) - parse_videoa_page(movie, html) - movie.url = default_url - - -def parse_videoa_page(movie: MovieInfo, html): - """解析AV影片的页面布局""" - title = html.xpath("//div[@class='hreview']/h1/text()")[0] - # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来 - container = html.xpath("//table[@class='mg-b12']/tr/td")[0] - cover = container.xpath("//div[@id='sample-video']/a/@href")[0] - # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083 - date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()") - if date_tag: - movie.publish_date = date_tag[0].strip().replace('/', '-') - duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip() - match = re.search(r'\d+', duration_str) - if match: - movie.duration = match.group(0) - # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况 - actress = container.xpath("//span[@id='performer']/a/text()") - director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()") - if director_tag: - movie.director = director_tag[0].strip() - serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") - if serial_tag: - movie.serial = serial_tag[0].strip() - producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") - if producer_tag: - movie.producer = producer_tag[0].strip() - # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 - # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()") - # if label_tag: - # label = label_tag[0].strip() - # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选 - genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]") - genre, genre_id = [], [] - for tag in genre_tags: - genre.append(tag.text.strip()) - genre_id.append(tag.get('href').split('=')[-1].strip('/')) - cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() - plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip() - preview_pics = container.xpath("//a[@name='sample-image']/img/@src") - score_tag = container.xpath("//p[@class='d-review__average']/strong/text()") - if score_tag: - match = re.search(r'\d+', score_tag[0].strip()) - if match: - score = float(match.group()) * 2 - movie.score = f'{score:.2f}' - else: - score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] - movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 
50 - - if Cfg().crawler.hardworking: - # 预览视频是动态加载的,不在静态网页中 - video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}' - html2 = request.get_html(video_url) - # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据 - script = html2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip() - match = re.search(r'\{.*\}', script) - # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配 - try: - data = json.loads(match.group()) - video_url = data.get('src') - if video_url and video_url.startswith('//'): - video_url = 'https:' + video_url - movie.preview_video = video_url - except Exception as e: - logger.debug('解析视频地址时异常: ' + repr(e)) - - movie.cid = cid - movie.title = title - movie.cover = cover - movie.actress = actress - movie.genre = genre - movie.genre_id = genre_id - movie.plot = plot - movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 - - -def parse_anime_page(movie: MovieInfo, html): - """解析动画影片的页面布局""" - title = html.xpath("//h1[@id='title']/text()")[0] - container = html.xpath("//table[@class='mg-b12']/tr/td")[0] - cover = container.xpath("//img[@name='package-image']/@src")[0] - date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip() - publish_date = date_str.replace('/', '-') - duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()") - if duration_tag: - movie.duration = duration_tag[0].strip().replace('分', '') - serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()") - if serial_tag: - movie.serial = serial_tag[0].strip() - producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()") - if producer_tag: - movie.producer = producer_tag[0].strip() - genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]") - genre, genre_id = [], [] - for tag in genre_tags: - genre.append(tag.text.strip()) - genre_id.append(tag.get('href').split('=')[-1].strip('/')) - cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip() - plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip() - preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy") - score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0] - score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 
50 - - movie.cid = cid - movie.title = title - movie.cover = cover - movie.publish_date = publish_date - movie.genre = genre - movie.genre_id = genre_id - movie.plot = plot - movie.score = f'{score/5:.2f}' # 转换为10分制 - movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 - - -# parse_dvd_page = parse_videoa_page # 118wtktabf067 -parse_ppr_page = parse_videoa_page -parse_nikkatsu_page = parse_videoa_page -parse_doujin_page = parse_anime_page - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo(cid='d_aisoft3356') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/fc2.py b/javsp/web/fc2.py deleted file mode 100644 index 66be7ae4e..000000000 --- a/javsp/web/fc2.py +++ /dev/null @@ -1,105 +0,0 @@ -"""从FC2官网抓取数据""" -import logging - - -from javsp.web.base import get_html, request_get, resp2html -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.lib import strftime_to_minutes -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://adult.contents.fc2.com' - - -def get_movie_score(fc2_id): - """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None""" - html = get_html(f'{base_url}/article/{fc2_id}/review') - review_tags = html.xpath("//ul[@class='items_comment_headerReviewInArea']/li") - reviews = {} - for tag in review_tags: - score = int(tag.xpath("div/span/text()")[0]) - vote = int(tag.xpath("span")[0].text_content()) - reviews[score] = vote - total_votes = sum(reviews.values()) - if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧 - summary = sum([k*v for k, v in reviews.items()]) - final_score = summary / total_votes * 2 # 乘以2转换为10分制 - return final_score - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 去除番号中的'FC2'字样 - id_uc = movie.dvdid.upper() - if not id_uc.startswith('FC2-'): - raise ValueError('Invalid FC2 number: ' + movie.dvdid) - fc2_id = id_uc.replace('FC2-', '') - # 抓取网页 - url = f'{base_url}/article/{fc2_id}/' - resp = request_get(url) - if '/id.fc2.com/' in resp.url: - raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP') - html = resp2html(resp) - container = html.xpath("//div[@class='items_article_left']") - if len(container) > 0: - container = container[0] - else: - raise MovieNotFoundError(__name__, movie.dvdid) - # FC2 标题增加反爬乱码,使用数组合并标题 - title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()") - title = ''.join(title_arr) - thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0] - thumb_pic = thumb_tag.xpath("span/img/@src")[0] - duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0] - # FC2没有制作商和发行商的区分,作为个人市场,影片页面的'by'更接近于制作商 - producer = container.xpath("//li[text()='by ']/a/text()")[0] - genre = container.xpath("//a[@class='tag tagTag']/text()") - date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0] - publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30' - preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href") - - if Cfg().crawler.hardworking: - # 通过评论数据来计算准确的评分 - score = get_movie_score(fc2_id) - if score: - movie.score = f'{score:.2f}' - # 预览视频是动态加载的,不在静态网页中 - desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0] - key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa... 
- api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}' - r = request_get(api_url).json() - movie.preview_video = r['path'] - else: - # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星 - score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0] - score = int(score_tag_attr[-1]) * 2 - movie.score = f'{score:.2f}' - - movie.dvdid = id_uc - movie.url = url - movie.title = title - movie.genre = genre - movie.producer = producer - movie.duration = str(strftime_to_minutes(duration_str)) - movie.publish_date = publish_date - movie.preview_pics = preview_pics - # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 - if movie.preview_pics: - movie.cover = preview_pics[0] - else: - movie.cover = thumb_pic - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('FC2-718323') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/fc2fan.py b/javsp/web/fc2fan.py deleted file mode 100644 index 229b3e3df..000000000 --- a/javsp/web/fc2fan.py +++ /dev/null @@ -1,80 +0,0 @@ -"""解析fc2fan本地镜像的数据""" -# FC2官网的影片下架就无法再抓取数据,如果用户有fc2fan的镜像,那可以尝试从镜像中解析影片数据 -import os -import re -import logging -import lxml.html -import requests - - -from javsp.web.base import resp2html -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_path = str(Cfg().crawler.fc2fan_local_path) -use_local_mirror = os.path.exists(base_path) - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - if use_local_mirror: - html_file = f'{base_path}/{movie.dvdid}.html' - if not os.path.exists(html_file): - raise MovieNotFoundError(__name__, movie.dvdid, html_file) - html = lxml.html.parse(html_file) - else: - url = f"https://fc2club.top/html/{movie.dvdid}.html" - r = requests.get(url) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - elif r.text == '': - raise WebsiteError(f'fc2fan: 站点不可用 (HTTP {r.status_code}): {url}') - html = resp2html(r) - try: - container = html.xpath("//div[@class='col-sm-8']")[0] - except IndexError: - raise WebsiteError(f'fc2fan: 站点不可用') - title = container.xpath("h3/text()")[0] - score_str = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip() - match = re.search(r'\d+', score_str) - if match: - score = int(match.group()) / 10 # fc2fan站长是按100分来打分的 - movie.score = f'{score:.1f}' - resource_info = container.xpath("h5/strong[text()='资源参数']")[0].tail - if '无码' in resource_info: - movie.uncensored = True - elif '有码' in resource_info: - movie.uncensored = False - # FC2没有制作商和发行商的区分,作为个人市场,卖家更接近于制作商 - producer = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text - if producer: - movie.producer = producer.strip() - genre = container.xpath("h5/strong[text()='影片标签']/../a/text()") - actress = container.xpath("h5/strong[text()='女优名字']/../a/text()") - preview_pics = container.xpath("//ul[@class='slides']/li/img/@src") - if use_local_mirror: - preview_pics = [os.path.normpath(os.path.join(base_path, i)) for i in preview_pics] - # big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0] # 影片真实截图,目前暂时用不到 - - movie.title = title - movie.genre = genre - movie.actress = actress - if preview_pics: - movie.preview_pics = preview_pics - movie.cover = preview_pics[0] - - -if __name__ == "__main__": - import pretty_errors - 
pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('FC2-1879420') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/fc2ppvdb.py b/javsp/web/fc2ppvdb.py deleted file mode 100644 index b0ad60892..000000000 --- a/javsp/web/fc2ppvdb.py +++ /dev/null @@ -1,76 +0,0 @@ -"""从FC2PPVDB抓取数据""" -import logging -from typing import List - - -from javsp.web.base import get_html -from javsp.web.exceptions import * -from javsp.lib import strftime_to_minutes -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://fc2ppvdb.com' - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 去除番号中的'FC2'字样 - id_uc = movie.dvdid.upper() - if not id_uc.startswith('FC2-'): - raise ValueError('Invalid FC2 number: ' + movie.dvdid) - fc2_id = id_uc.replace('FC2-', '') - # 抓取网页 - url = f'{base_url}/articles/{fc2_id}' - html = get_html(url) - container = html.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]") - if len(container) > 0: - container = container[0] - else: - raise MovieNotFoundError(__name__, movie.dvdid) - - title = container.xpath("//h2/a/text()") - thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src") - duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()") - actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()") - genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()") - publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()") - publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()") - uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()") - uncensored_str_f = get_list_first(uncensored_str); - uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None - preview_pics = None - preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href") - - movie.dvdid = id_uc - movie.url = url - movie.title = get_list_first(title) - movie.genre = genre - movie.actress = actress - movie.duration = str(strftime_to_minutes(get_list_first(duration_str))) - movie.publish_date = get_list_first(publish_date) - movie.publisher = get_list_first(publisher) - movie.uncensored = uncensored - movie.preview_pics = preview_pics - movie.preview_video = get_list_first(preview_video) - - # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 - if movie.preview_pics: - movie.cover = preview_pics[0] - else: - movie.cover = get_list_first(thumb_pic) - -def get_list_first(list:List): - return list[0] if list and len(list) > 0 else None - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('FC2-4497837') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/gyutto.py b/javsp/web/gyutto.py deleted file mode 100644 index db7d6c795..000000000 --- a/javsp/web/gyutto.py +++ /dev/null @@ -1,87 +0,0 @@ -"""从https://gyutto.com/官网抓取数据""" -import logging -import time - -from javsp.web.base import resp2html, request_get -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - -logger = logging.getLogger(__name__) - -# https://dl.gyutto.com/i/item266923 -base_url = 'http://gyutto.com' -base_encode = 'euc-jp' - -def get_movie_title(html): - container = html.xpath("//h1") - 
if len(container) > 0: - container = container[0] - title = container.text - - return title - -def get_movie_img(html, index = 1): - images = [] - container = html.xpath("//a[@class='highslide']/img") - if len(container) > 0: - if index == 0: - return container[0].get('src') - - for row in container: - images.append(row.get('src')) - - return images - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 去除番号中的'gyutto'字样 - id_uc = movie.dvdid.upper() - if not id_uc.startswith('GYUTTO-'): - raise ValueError('Invalid gyutto number: ' + movie.dvdid) - gyutto_id = id_uc.replace('GYUTTO-', '') - # 抓取网页 - url = f'{base_url}/i/item{gyutto_id}?select_uaflag=1' - r = request_get(url, delay_raise=True) - if r.status_code == 404: - raise MovieNotFoundError(__name__, movie.dvdid) - html = resp2html(r, base_encode) - container = html.xpath("//dl[@class='BasicInfo clearfix']") - - for row in container: - key = row.xpath(".//dt/text()") - if key[0] == "サークル": - producer = ''.join(row.xpath(".//dd/a/text()")) - elif key[0] == "ジャンル": - genre = row.xpath(".//dd/a/text()") - elif key[0] == "配信開始日": - date = row.xpath(".//dd/text()") - date_str = ''.join(date) - date_time = time.strptime(date_str, "%Y年%m月%d日") - publish_date = time.strftime("%Y-%m-%d", date_time) - - plot = html.xpath("//div[@class='unit_DetailLead']/p/text()")[0] - - movie.title = get_movie_title(html) - movie.cover = get_movie_img(html, 0) - movie.preview_pics = get_movie_img(html) - movie.dvdid = id_uc - movie.url = url - movie.producer = producer - # movie.actress = actress - # movie.duration = duration - movie.publish_date = publish_date - movie.genre = genre - movie.plot = plot - -if __name__ == "__main__": - import pretty_errors - - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - movie = MovieInfo('gyutto-266923') - - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/jav321.py b/javsp/web/jav321.py deleted file mode 100644 index 4e42617a5..000000000 --- a/javsp/web/jav321.py +++ /dev/null @@ -1,100 +0,0 @@ -"""从jav321抓取数据""" -import re -import logging - - -from javsp.web.base import post_html -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://www.jav321.com' - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - html = post_html(f'{base_url}/search', data={'sn': movie.dvdid}) - page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0] - #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542 - cid = page_url.split('/')[-1] # /video/ipx00177 - # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片 - if cid == 'search': - raise MovieNotFoundError(__name__, movie.dvdid) - title = html.xpath("//div[@class='panel-heading']/h3/text()")[0] - info = html.xpath("//div[@class='col-md-9']")[0] - # jav321的不同信息字段间没有明显分隔,只能通过url来匹配目标标签 - company_tags = info.xpath("a[contains(@href,'/company/')]/text()") - if company_tags: - movie.producer = company_tags[0] - # actress, actress_pics - # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白 - actress, actress_pics = [], {} - actress_tags = html.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img") - for tag in actress_tags: - name = tag.tail.strip() - pic_url = tag.get('src') - actress.append(name) - # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url, - # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据 - actress_pics[name] = pic_url - # genre, genre_id - genre_tags = 
info.xpath("a[contains(@href,'/genre/')]") - genre, genre_id = [], [] - for tag in genre_tags: - genre.append(tag.text) - genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1 - dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper() - publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '') - duration_str = info.xpath("b[text()='収録時間']")[0].tail - match = re.search(r'\d+', duration_str) - if match: - movie.duration = match.group(0) - # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星 - score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original") - if score_tag: - score = int(score_tag[0][5:7])/5 # /10*2 - movie.score = str(score) - serial_tag = info.xpath("a[contains(@href,'/series/')]/text()") - if serial_tag: - movie.serial = serial_tag[0] - preview_video_tag = info.xpath("//video/source/@src") - if preview_video_tag: - movie.preview_video = preview_video_tag[0] - plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()") - if plot_tag: - movie.plot = plot_tag[0] - preview_pics = html.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src") - if len(preview_pics) == 0: - # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL - preview_pics = html.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src") - # 有的图片链接里有多个//,网站质量堪忧…… - preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics] - # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析 - - movie.url = page_url - movie.cid = cid - movie.dvdid = dvdid - movie.title = title - movie.actress = actress - movie.actress_pics = actress_pics - movie.genre = genre - movie.genre_id = genre_id - movie.publish_date = publish_date - # preview_pics的第一张图始终是封面,剩下的才是预览图 - if len(preview_pics) > 0: - movie.cover = preview_pics[0] - movie.preview_pics = preview_pics[1:] - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('SCUTE-1177') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/javbus.py b/javsp/web/javbus.py deleted file mode 100644 index a98cd9974..000000000 --- a/javsp/web/javbus.py +++ /dev/null @@ -1,115 +0,0 @@ -"""从JavBus抓取数据""" -import logging - - -from javsp.web.base import * -from javsp.web.exceptions import * -from javsp.func import * -from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo, GenreMap - - -logger = logging.getLogger(__name__) -genre_map = GenreMap('data/genre_javbus.csv') -permanent_url = 'https://www.javbus.com' -if Cfg().network.proxy_server is not None: - base_url = permanent_url -else: - base_url = str(Cfg().network.proxy_free[CrawlerID.javbus]) - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - url = f'{base_url}/{movie.dvdid}' - resp = request_get(url, delay_raise=True) - # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息 - if resp.history and resp.history[0].status_code == 302: - html = resp2html(resp.history[0]) - else: - html = resp2html(resp) - # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404 - page_title = html.xpath('/html/head/title/text()') - if page_title and page_title[0].startswith('404 Page Not Found!'): - raise MovieNotFoundError(__name__, movie.dvdid) - - container = html.xpath("//div[@class='container']")[0] - title = container.xpath("h3/text()")[0] - cover = 
container.xpath("//a[@class='bigImage']/img/@src")[0] - preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href") - info = container.xpath("//div[@class='col-md-3 info']")[0] - dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text - publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip() - duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip() - director_tag = info.xpath("p/span[text()='導演:']") - if director_tag: # xpath没有匹配时将得到空列表 - movie.director = director_tag[0].getnext().text.strip() - producer_tag = info.xpath("p/span[text()='製作商:']") - if producer_tag: - text = producer_tag[0].getnext().text - if text: - movie.producer = text.strip() - publisher_tag = info.xpath("p/span[text()='發行商:']") - if publisher_tag: - movie.publisher = publisher_tag[0].getnext().text.strip() - serial_tag = info.xpath("p/span[text()='系列:']") - if serial_tag: - movie.serial = serial_tag[0].getnext().text - # genre, genre_id - genre_tags = info.xpath("//span[@class='genre']/label/a") - genre, genre_id = [], [] - for tag in genre_tags: - tag_url = tag.get('href') - pre_id = tag_url.split('/')[-1] - genre.append(tag.text) - if 'uncensored' in tag_url: - movie.uncensored = True - genre_id.append('uncensored-' + pre_id) - else: - movie.uncensored = False - genre_id.append(pre_id) - # JavBus的磁力链接是依赖js脚本加载的,无法通过静态网页来解析 - # actress, actress_pics - actress, actress_pics = [], {} - actress_tags = html.xpath("//a[@class='avatar-box']/div/img") - for tag in actress_tags: - name = tag.get('title') - pic_url = tag.get('src') - actress.append(name) - if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像 - actress_pics[name] = pic_url - # 整理数据并更新movie的相应属性 - movie.url = f'{permanent_url}/{movie.dvdid}' - movie.dvdid = dvdid - movie.title = title.replace(dvdid, '').strip() - movie.cover = cover - movie.preview_pics = preview_pics - if publish_date != '0000-00-00': # 丢弃无效的发布日期 - movie.publish_date = publish_date - movie.duration = duration if int(duration) else None - movie.genre = genre - movie.genre_id = genre_id - movie.actress = actress - movie.actress_pics = actress_pics - - -def parse_clean_data(movie: MovieInfo): - """解析指定番号的影片数据并进行清洗""" - parse_data(movie) - movie.genre_norm = genre_map.map(movie.genre_id) - movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换) - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('NANP-030') - try: - parse_clean_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/javdb.py b/javsp/web/javdb.py deleted file mode 100644 index 5120aae76..000000000 --- a/javsp/web/javdb.py +++ /dev/null @@ -1,333 +0,0 @@ -"""从JavDB抓取数据""" -import os -import re -import logging - -from javsp.web.base import Request, resp2html -from javsp.web.exceptions import * -from javsp.func import * -from javsp.avid import guess_av_type -from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo, GenreMap -from javsp.chromium import get_browsers_cookies - - -# 初始化Request实例。使用scraper绕过CloudFlare后,需要指定网页语言,否则可能会返回其他语言网页,影响解析 -request = Request(use_scraper=True) -request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5' - -logger = logging.getLogger(__name__) -genre_map = GenreMap('data/genre_javdb.csv') -permanent_url = 'https://javdb.com' -if Cfg().network.proxy_server is not None: - base_url = permanent_url -else: - base_url = 
str(Cfg().network.proxy_free[CrawlerID.javdb]) - - -def get_html_wrapper(url): - """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题""" - global request, cookies_pool - r = request.get(url, delay_raise=True) - if r.status_code == 200: - # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页 - if r.history and '/login' in r.url: - # 仅在需要时去读取Cookies - if 'cookies_pool' not in globals(): - try: - cookies_pool = get_browsers_cookies() - except (PermissionError, OSError) as e: - logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True) - cookies_pool = [] - except Exception as e: - logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True) - cookies_pool = [] - if len(cookies_pool) > 0: - item = cookies_pool.pop() - # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies - request = Request(use_scraper=True) - request.cookies = item['cookies'] - cookies_source = (item['profile'], item['site']) - logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}') - return get_html_wrapper(url) - else: - raise CredentialError('JavDB: 所有浏览器Cookies均已过期') - elif r.history and 'pay' in r.url.split('/')[-1]: - raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'") - else: - html = resp2html(r) - return html - elif r.status_code in (403, 503): - html = resp2html(r) - code_tag = html.xpath("//span[@class='code-label']/span") - error_code = code_tag[0].text if code_tag else None - if error_code: - if error_code == '1020': - block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器' - else: - block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})' - else: - block_msg = f'JavDB: {r.status_code} 禁止访问: {url}' - raise SiteBlocked(block_msg) - else: - raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}') - - -def get_user_info(site, cookies): - """获取cookies对应的JavDB用户信息""" - try: - request.cookies = cookies - html = request.get_html(f'https://{site}/users/profile') - except Exception as e: - logger.info('JavDB: 获取用户信息时出错') - logger.debug(e, exc_info=1) - return - # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点 - if 'JavDB' in html.text: - email = html.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip() - username = html.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip() - return email, username - else: - logger.debug('JavDB: 域名已过期: ' + site) - - -def get_valid_cookies(): - """扫描浏览器,获取一个可用的Cookies""" - # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用 - for d in cookies_pool: - info = get_user_info(d['site'], d['cookies']) - if info: - return d['cookies'] - else: - logger.debug(f"{d['profile']}, {d['site']}: Cookies无效") - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个 - html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}') - ids = list(map(str.lower, html.xpath("//div[@class='video-title']/strong/text()"))) - movie_urls = html.xpath("//a[@class='box']/@href") - match_count = len([i for i in ids if i == movie.dvdid.lower()]) - if match_count == 0: - raise MovieNotFoundError(__name__, movie.dvdid, ids) - elif match_count == 1: - index = ids.index(movie.dvdid.lower()) - new_url = movie_urls[index] - try: - html2 = get_html_wrapper(new_url) - except (SitePermissionError, CredentialError): - # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面 - box = html.xpath("//a[@class='box']")[index] - movie.url = new_url - 
movie.title = box.get('title') - movie.cover = box.xpath("div/img/@src")[0] - score_str = box.xpath("div[@class='score']/span/span")[0].tail - score = re.search(r'([\d.]+)分', score_str).group(1) - movie.score = "{:.2f}".format(float(score)*2) - movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip() - return - else: - raise MovieDuplicateError(__name__, movie.dvdid, match_count) - - container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0] - info = container.xpath("//nav[@class='panel movie-panel-info']")[0] - title = container.xpath("h2/strong[@class='current-title']/text()")[0] - show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]") - if show_orig_title: - movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0] - cover = container.xpath("//img[@class='video-cover']/@src")[0] - preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href") - preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src") - if preview_video_tag: - preview_video = preview_video_tag[0] - if preview_video.startswith('//'): - preview_video = 'https:' + preview_video - movie.preview_video = preview_video - dvdid = info.xpath("div/span")[0].text_content() - publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text - duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip() - director_tag = info.xpath("div/strong[text()='導演:']") - if director_tag: - movie.director = director_tag[0].getnext().text_content().strip() - av_type = guess_av_type(movie.dvdid) - if av_type != 'fc2': - producer_tag = info.xpath("div/strong[text()='片商:']") - else: - producer_tag = info.xpath("div/strong[text()='賣家:']") - if producer_tag: - movie.producer = producer_tag[0].getnext().text_content().strip() - publisher_tag = info.xpath("div/strong[text()='發行:']") - if publisher_tag: - movie.publisher = publisher_tag[0].getnext().text_content().strip() - serial_tag = info.xpath("div/strong[text()='系列:']") - if serial_tag: - movie.serial = serial_tag[0].getnext().text_content().strip() - score_tag = info.xpath("//span[@class='score-stars']") - if score_tag: - score_str = score_tag[0].tail - score = re.search(r'([\d.]+)分', score_str).group(1) - movie.score = "{:.2f}".format(float(score)*2) - genre_tags = info.xpath("//strong[text()='類別:']/../span/a") - genre, genre_id = [], [] - for tag in genre_tags: - pre_id = tag.get('href').split('/')[-1] - genre.append(tag.text) - genre_id.append(pre_id) - # 判定影片有码/无码 - subsite = pre_id.split('?')[0] - movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite) - # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优 - actors_tag = info.xpath("//strong[text()='演員:']/../span")[0] - all_actors = actors_tag.xpath("a/text()") - genders = actors_tag.xpath("strong/text()") - actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀'] - magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href") - - movie.dvdid = dvdid - movie.url = new_url.replace(base_url, permanent_url) - movie.title = title.replace(dvdid, '').strip() - movie.cover = cover - movie.preview_pics = preview_pics - movie.publish_date = publish_date - movie.duration = duration - movie.genre = genre - movie.genre_id = genre_id - movie.actress = actress - movie.magnet = [i.replace('[javdb.com]','') for i in magnet] - - -def parse_clean_data(movie: MovieInfo): - """解析指定番号的影片数据并进行清洗""" - try: - parse_data(movie) - # 
检查封面URL是否真的存在对应图片 - if movie.cover is not None: - r = request.head(movie.cover) - if r.status_code != 200: - movie.cover = None - except SiteBlocked: - raise - logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试') - if movie.genre_id and (not movie.genre_id[0].startswith('fc2?')): - movie.genre_norm = genre_map.map(movie.genre_id) - movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换) - - -def collect_actress_alias(type=0, use_original=True): - """ - 收集女优的别名 - type: 0-有码, 1-无码, 2-欧美 - use_original: 是否使用原名而非译名,True-田中レモン,False-田中檸檬 - """ - import json - import time - import random - - actressAliasMap = {} - - actressAliasFilePath = "data/actress_alias.json" - # 检查文件是否存在 - if not os.path.exists(actressAliasFilePath): - # 如果文件不存在,创建文件并写入空字典 - with open(actressAliasFilePath, "w", encoding="utf-8") as file: - json.dump({}, file) - - typeList = ["censored", "uncensored", "western"] - page_url = f"{base_url}/actors/{typeList[type]}" - while True: - try: - html = get_html_wrapper(page_url) - actors = html.xpath("//div[@class='box actor-box']/a") - - count = 0 - for actor in actors: - count += 1 - actor_name = actor.xpath("strong/text()")[0].strip() - actor_url = actor.xpath("@href")[0] - # actor_url = f"https://javdb.com{actor_url}" # 构造演员主页的完整URL - - # 进入演员主页,获取更多信息 - actor_html = get_html_wrapper(actor_url) - # 解析演员所有名字信息 - names_span = actor_html.xpath("//span[@class='actor-section-name']")[0] - aliases_span_list = actor_html.xpath("//span[@class='section-meta']") - aliases_span = aliases_span_list[0] - - names_list = [name.strip() for name in names_span.text.split(",")] - if len(aliases_span_list) > 1: - aliases_list = [ - alias.strip() for alias in aliases_span.text.split(",") - ] - else: - aliases_list = [] - - # 将信息添加到actressAliasMap中 - actressAliasMap[names_list[-1 if use_original else 0]] = ( - names_list + aliases_list - ) - print( - f"{count} --- {names_list[-1 if use_original else 0]}: {names_list + aliases_list}" - ) - - if count == 10: - # 将数据写回文件 - with open(actressAliasFilePath, "r", encoding="utf-8") as file: - existing_data = json.load(file) - - # 合并现有数据和新爬取的数据 - existing_data.update(actressAliasMap) - - # 将合并后的数据写回文件 - with open(actressAliasFilePath, "w", encoding="utf-8") as file: - json.dump(existing_data, file, ensure_ascii=False, indent=2) - - actressAliasMap = {} # 重置actressAliasMap - - print( - f"已爬取 {count} 个女优,数据已更新并写回文件:", - actressAliasFilePath, - ) - - # 重置计数器 - count = 0 - - time.sleep(max(1, 10 * random.random())) # 随机等待 1-10 秒 - - # 判断是否有下一页按钮 - next_page_link = html.xpath( - "//a[@rel='next' and @class='pagination-next']/@href" - ) - if not next_page_link: - break # 没有下一页,结束循环 - else: - next_page_url = f"{next_page_link[0]}" - page_url = next_page_url - - except SiteBlocked: - raise - - with open(actressAliasFilePath, "r", encoding="utf-8") as file: - existing_data = json.load(file) - - # 合并现有数据和新爬取的数据 - existing_data.update(actressAliasMap) - - # 将合并后的数据写回文件 - with open(actressAliasFilePath, "w", encoding="utf-8") as file: - json.dump(existing_data, file, ensure_ascii=False, indent=2) - - print(f"已爬取 {count} 个女优,数据已更新并写回文件:", actressAliasFilePath) - - -if __name__ == "__main__": - # collect_actress_alias() - movie = MovieInfo('FC2-2735981') - try: - parse_clean_data(movie) - print(movie) - except CrawlerError as e: - print(repr(e)) diff --git a/javsp/web/javlib.py b/javsp/web/javlib.py deleted file mode 100644 index 85f77b75f..000000000 --- a/javsp/web/javlib.py +++ /dev/null @@ -1,141 +0,0 @@ -"""从JavLibrary抓取数据""" -import logging -from urllib.parse import urlsplit - - 
-from javsp.web.base import Request, read_proxy, resp2html -from javsp.web.exceptions import * -from javsp.web.proxyfree import get_proxy_free_url -from javsp.config import Cfg, CrawlerID -from javsp.datatype import MovieInfo - - -# 初始化Request实例 -request = Request(use_scraper=True) - -logger = logging.getLogger(__name__) -permanent_url = 'https://www.javlibrary.com' -base_url = '' - - -def init_network_cfg(): - """设置合适的代理模式和base_url""" - request.timeout = 5 - proxy_free_url = get_proxy_free_url('javlib') - urls = [str(Cfg().network.proxy_free[CrawlerID.javlib]), permanent_url] - if proxy_free_url and proxy_free_url not in urls: - urls.insert(1, proxy_free_url) - # 使用代理容易触发IUAM保护,先尝试不使用代理访问 - proxy_cfgs = [{}, read_proxy()] if Cfg().network.proxy_server else [{}] - for proxies in proxy_cfgs: - request.proxies = proxies - for url in urls: - if proxies == {} and url == permanent_url: - continue - try: - resp = request.get(url, delay_raise=True) - if resp.status_code == 200: - request.timeout = Cfg().network.timeout.seconds - return url - except Exception as e: - logger.debug(f"Fail to connect to '{url}': {e}") - logger.warning('无法绕开JavLib的反爬机制') - request.timeout = Cfg().network.timeout.seconds - return permanent_url - - -# TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换 -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - global base_url - if not base_url: - base_url = init_network_cfg() - logger.debug(f"JavLib网络配置: {base_url}, proxy={request.proxies}") - url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}' - resp = request.get(url) - html = resp2html(resp) - if resp.history: - if urlsplit(resp.url).netloc == urlsplit(base_url).netloc: - # 出现301重定向通常且新老地址netloc相同时,说明搜索到了影片且只有一个结果 - new_url = resp.url - else: - # 重定向到了不同的netloc时,新地址并不是影片地址。这种情况下新地址中丢失了path字段, - # 为无效地址(应该是JavBus重定向配置有问题),需要使用新的base_url抓取数据 - base_url = 'https://' + urlsplit(resp.url).netloc - logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}") - return parse_data(movie) - else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果 - video_tags = html.xpath("//div[@class='video'][@id]/a") - # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果 - pre_choose = [] - for tag in video_tags: - tag_dvdid = tag.xpath("div[@class='id']/text()")[0] - if tag_dvdid.upper() == movie.dvdid.upper(): - pre_choose.append(tag) - pre_choose_urls = [i.get('href') for i in pre_choose] - match_count = len(pre_choose) - if match_count == 0: - raise MovieNotFoundError(__name__, movie.dvdid) - elif match_count == 1: - new_url = pre_choose_urls[0] - elif match_count == 2: - no_blueray = [] - for tag in pre_choose: - if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc - no_blueray.append(tag) - no_blueray_count = len(no_blueray) - if no_blueray_count == 1: - new_url = no_blueray[0].get('href') - logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}") - else: - # 两个结果中没有谁是蓝光影片,说明影片番号重复了 - raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) - else: - # 存在不同影片但是番号相同的情况,如MIDV-010 - raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls) - # 重新抓取网页 - html = request.get_html(new_url) - container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0] - title_tag = container.xpath("div/h3/a/text()") - title = title_tag[0] - cover = container.xpath("//img[@id='video_jacket_img']/@src")[0] - info = container.xpath("//div[@id='video_info']")[0] - dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0] - publish_date = 
info.xpath("div[@id='video_date']//td[@class='text']/text()")[0] - duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0] - director_tag = info.xpath("//span[@class='director']/a/text()") - if director_tag: - movie.director = director_tag[0] - producer = info.xpath("//span[@class='maker']/a/text()")[0] - publisher_tag = info.xpath("//span[@class='label']/a/text()") - if publisher_tag: - movie.publisher = publisher_tag[0] - score_tag = info.xpath("//span[@class='score']/text()") - if score_tag: - movie.score = score_tag[0].strip('()') - genre = info.xpath("//span[@class='genre']/a/text()") - actress = info.xpath("//span[@class='star']/a/text()") - - movie.dvdid = dvdid - movie.url = new_url.replace(base_url, permanent_url) - movie.title = title.replace(dvdid, '').strip() - if cover.startswith('//'): # 补全URL中缺少的协议段 - cover = 'https:' + cover - movie.cover = cover - movie.publish_date = publish_date - movie.duration = duration - movie.producer = producer - movie.genre = genre - movie.actress = actress - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - base_url = permanent_url - movie = MovieInfo('IPX-177') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - print(e) diff --git a/javsp/web/javmenu.py b/javsp/web/javmenu.py deleted file mode 100644 index 5296a69cd..000000000 --- a/javsp/web/javmenu.py +++ /dev/null @@ -1,88 +0,0 @@ -"""从JavMenu抓取数据""" -import logging - -from javsp.web.base import Request, resp2html -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - - -request = Request() - -logger = logging.getLogger(__name__) -base_url = 'https://mrzyx.xyz' - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - # JavMenu网页做得很不走心,将就了 - url = f'{base_url}/{movie.dvdid}' - r = request.get(url) - if r.history: - # 被重定向到主页说明找不到影片资源 - raise MovieNotFoundError(__name__, movie.dvdid) - - html = resp2html(r) - container = html.xpath("//div[@class='col-md-9 px-0']")[0] - title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0] - # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站 - title = title.replace(' | JAV目錄大全 | 每日更新', '') - title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '') - cover_tag = container.xpath("//div[@class='single-video']") - if len(cover_tag) > 0: - video_tag = cover_tag[0].find('video') - # URL首尾竟然也有空格…… - movie.cover = video_tag.get('data-poster').strip() - # 预览影片改为blob了,无法获取 - # movie.preview_video = video_tag.find('source').get('src').strip() - else: - cover_img_tag = container.xpath("//img[@class='lazy rounded']/@data-src") - if cover_img_tag: - movie.cover = cover_img_tag[0].strip() - info = container.xpath("//div[@class='card-body']")[0] - publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text - duration = info.xpath("div/span[contains(text(), '時長:')]")[0].getnext().text.replace('分鐘', '') - producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()") - if producer: - movie.producer = producer[0] - genre_tags = info.xpath("//a[@class='genre']") - genre, genre_id = [], [] - for tag in genre_tags: - items = tag.get('href').split('/') - pre_id = items[-3] + '/' + items[-1] - genre.append(tag.text.strip()) - genre_id.append(pre_id) - # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠…… - actress = info.xpath("div/span[contains(text(), '女優:')]/following-sibling::*/a/text()") or None - magnet_table = 
container.xpath("//table[contains(@class, 'magnet-table')]/tbody") - if magnet_table: - magnet_links = magnet_table[0].xpath("tr/td/a/@href") - # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以 - movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links] - preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href") - - if (not movie.cover) and preview_pics: - movie.cover = preview_pics[0] - movie.url = url - movie.title = title.replace(movie.dvdid, '').strip() - movie.preview_pics = preview_pics - movie.publish_date = publish_date - movie.duration = duration - movie.genre = genre - movie.genre_id = genre_id - movie.actress = actress - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('FC2-718323') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/mgstage.py b/javsp/web/mgstage.py deleted file mode 100644 index 4904e51db..000000000 --- a/javsp/web/mgstage.py +++ /dev/null @@ -1,114 +0,0 @@ -"""从蚊香社-mgstage抓取数据""" -import re -import logging - - -from javsp.web.base import Request, resp2html -from javsp.web.exceptions import * -from javsp.config import Cfg -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://www.mgstage.com' -# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面) -request = Request() -request.cookies = {'adc': '1'} - - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - url = f'{base_url}/product/product_detail/{movie.dvdid}/' - resp = request.get(url, delay_raise=True) - if resp.status_code == 403: - raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理') - # url不存在时会被重定向至主页。history非空时说明发生了重定向 - elif resp.history: - raise MovieNotFoundError(__name__, movie.dvdid) - - html = resp2html(resp) - # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除 - title = html.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip() - container = html.xpath("//div[@class='detail_left']")[0] - cover = container.xpath("//a[@id='EnlargeImage']/@href")[0] - # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表 - actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()") - actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()") - actress = [i.strip() for i in actress_text + actress_link] - actress = [i for i in actress if i] # 移除空字符串 - producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip() - duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0] - match = re.search(r'\d+', duration_str) - if match: - movie.duration = match.group(0) - dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0] - date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0] - publish_date = date_str.replace('/', '-') - serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()") - if serial_tag: - movie.serial = serial_tag[0].strip() - # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到 - # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip() - genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a") - genre = [i.text.strip() for i in genre_tags] - score_str = container.xpath("//td[@class='review']/span")[0].tail.strip() - match = re.search(r'^[\.\d]+', score_str) - if match: - score = float(match.group()) * 2 - movie.score = 
f'{score:.2f}' - # plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签 - plots = [] - plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]") - for p in plot_p_tags: - children = p.getchildren() - # 没有children时表明plot不含有格式,此时简单地提取文本就可以 - if not children: - plots.append(p.text_content()) - continue - for child in children: - if child.tag == 'br' and plots[-1] != '\n': - plots.append('\n') - else: - if child.text: - plots.append(child.text) - if child.tail: - plots.append(child.tail) - plot = ''.join(plots).strip() - preview_pics = container.xpath("//a[@class='sample_image']/@href") - - if Cfg().crawler.hardworking: - # 预览视频是点击按钮后再加载的,不在静态网页中 - btn_url = container.xpath("//a[@class='button_sample']/@href")[0] - video_pid = btn_url.split('/')[-1] - req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}' - resp = request.get(req_url).json() - video_url = resp.get('url') - if video_url: - # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX - preview_video = video_url.split('.ism/')[0] + '.mp4' - movie.preview_video = preview_video - - movie.dvdid = dvdid - movie.url = url - movie.title = title - movie.cover = cover - movie.actress = actress - movie.producer = producer - movie.publish_date = publish_date - movie.genre = genre - movie.plot = plot - movie.preview_pics = preview_pics - movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片 - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('HRV-045') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/njav.py b/javsp/web/njav.py deleted file mode 100644 index f94e943f3..000000000 --- a/javsp/web/njav.py +++ /dev/null @@ -1,134 +0,0 @@ -"""从NJAV抓取数据""" -import re -import logging -from typing import List - - -from javsp.web.base import get_html -from javsp.web.exceptions import * -from javsp.lib import strftime_to_minutes -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://njav.tv/ja' - -def search_video(movie: MovieInfo): - id_uc = movie.dvdid - # 抓取网页 - url = f'{base_url}/search?keyword={id_uc}' - html = get_html(url) - list = html.xpath("//div[@class='box-item']/div[@class='detail']/a") - video_url = None - for item in list: - search_title = item.xpath("text()")[0] - if id_uc in search_title: - video_url = item.xpath("@href") - break - if id_uc.startswith("FC2-"): - fc2id = id_uc.replace('FC2-', '') - if "FC2" in search_title and fc2id in search_title: - video_url = item.xpath("@href") - break - - return get_list_first(video_url) - -def parse_data(movie: MovieInfo): - """解析指定番号的影片数据""" - # 抓取网页 - url = search_video(movie) - if not url: - raise MovieNotFoundError(__name__, movie.dvdid) - html = get_html(url) - container = html.xpath("//div[@class='container']/div/div[@class='col']") - if len(container) > 0: - container = container[0] - else: - raise MovieNotFoundError(__name__, movie.dvdid) - - title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0] - thumb_pic = container.xpath("//div[@id='player']/@data-poster") - plot = " ".join(container.xpath("//div[@class='description']/p/text()")) - magnet = container.xpath("//div[@class='magnet']/a/@href") - real_id = None - publish_date = None - duration_str = None - uncensored = None - preview_pics = None - preview_video = None - serial = None - publisher = None - 
producer = None - genre = [] - actress = [] - - detail_dic = {} - for item in container.xpath("//div[@class='detail-item']/div"): - item_title = item.xpath('span/text()')[0] - if "タグ:" in item_title: - genre += item.xpath("span")[1].xpath("a/text()") - elif "ジャンル:" in item_title: - genre += item.xpath("span")[1].xpath("a/text()") - elif "レーベル:" in item_title: - genre += item.xpath("span")[1].xpath("a/text()") - elif "女優:" in item_title: - actress = item.xpath("span")[1].xpath("a/text()") - elif "シリーズ:" in item_title: - serial = get_list_first(item.xpath("span")[1].xpath("a/text()")) - elif "メーカー:" in item_title: - producer = get_list_first(item.xpath("span")[1].xpath("a/text()")) - elif "コード:" in item_title: - real_id = get_list_first(item.xpath("span")[1].xpath("text()")) - elif "公開日:" in item_title: - publish_date = get_list_first(item.xpath("span")[1].xpath("text()")) - elif "再生時間:" in item_title: - duration_str = get_list_first(item.xpath("span")[1].xpath("text()")) - - # 清除标题里的番号字符 - keywords = [real_id, " "] - if movie.dvdid.startswith("FC2"): - keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]] - for keyword in keywords: - title = re.sub(re.escape(keyword), "", title, flags=re.I) - - # 判断是否无码 - uncensored_arr = magnet + [title] - for uncensored_str in uncensored_arr: - if 'uncensored' in uncensored_str.lower(): - uncensored = True - - movie.url = url - movie.title = title - movie.genre = genre - movie.actress = actress - movie.duration = str(strftime_to_minutes(duration_str)) - movie.publish_date = publish_date - movie.publisher = publisher - movie.producer = producer - movie.uncensored = uncensored - movie.preview_pics = preview_pics - movie.preview_video = preview_video - movie.plot = plot - movie.serial = serial - movie.magnet = magnet - - # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面 - if movie.preview_pics: - movie.cover = preview_pics[0] - else: - movie.cover = get_list_first(thumb_pic) - -def get_list_first(list:List): - return list[0] if list and len(list) > 0 else None - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('012023_002') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/prestige.py b/javsp/web/prestige.py deleted file mode 100644 index f6884c658..000000000 --- a/javsp/web/prestige.py +++ /dev/null @@ -1,83 +0,0 @@ -"""从蚊香社-prestige抓取数据""" -import re -import logging - - -from javsp.web.base import * -from javsp.web.exceptions import * -from javsp.datatype import MovieInfo - - -logger = logging.getLogger(__name__) -base_url = 'https://www.prestige-av.com' -# prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面 -# (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取) -cookies = {'__age_auth__': 'true'} - - -def parse_data(movie: MovieInfo): - """从网页抓取并解析指定番号的数据 - Args: - movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内 - """ - url = f'{base_url}/goods/goods_detail.php?sku={movie.dvdid}' - resp = request_get(url, cookies=cookies, delay_raise=True) - if resp.status_code == 500: - # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试 - raise MovieNotFoundError(__name__, movie.dvdid) - elif resp.status_code == 403: - raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理') - resp.raise_for_status() - html = resp2html(resp) - container_tags = html.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']") - if not container_tags: - raise MovieNotFoundError(__name__, movie.dvdid) - - container = 
container_tags[0] - title = container.xpath("h1/span")[0].tail.strip() - cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0] - cover = cover.split('?')[0] - actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()") - # 移除女优名中的空格,使女优名与其他网站保持一致 - actress = [i.strip().replace(' ', '') for i in actress] - duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content() - match = re.search(r'\d+', duration_str) - if match: - movie.duration = match.group(0) - date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0] - publish_date = date_url.split('?date=')[-1] - producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip() - dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0] - genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a") - genre = [tag.text.strip() for tag in genre_tags] - serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip() - plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip() - preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src") - preview_pics = [i.split('?')[0] for i in preview_pics] - - # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效 - movie.url = url - movie.dvdid = dvdid - movie.title = title - movie.cover = cover - movie.actress = actress - movie.publish_date = publish_date - movie.producer = producer - movie.genre = genre - movie.serial = serial - movie.plot = plot - movie.preview_pics = preview_pics - movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片 - - -if __name__ == "__main__": - import pretty_errors - pretty_errors.configure(display_link=True) - logger.root.handlers[1].level = logging.DEBUG - - movie = MovieInfo('ABP-647') - try: - parse_data(movie) - print(movie) - except CrawlerError as e: - logger.error(e, exc_info=1) diff --git a/javsp/web/proxyfree.py b/javsp/web/proxyfree.py deleted file mode 100644 index 89c1e63a4..000000000 --- a/javsp/web/proxyfree.py +++ /dev/null @@ -1,75 +0,0 @@ -"""获取各个网站的免代理地址""" -import re -import sys - -from javsp.web.base import is_connectable, get_html, get_resp_text, request_get - - -def get_proxy_free_url(site_name: str, prefer_url=None) -> str: - """获取指定网站的免代理地址 - Args: - site_name (str): 站点名称 - prefer_url (str, optional): 优先测试此url是否可用 - Returns: - str: 指定站点的免代理地址(失败时为空字符串) - """ - if prefer_url and is_connectable(prefer_url, timeout=5): - return prefer_url - # 当prefer_url不可用时,尝试自动获取指定网站的免代理地址 - site_name = site_name.lower() - func_name = f'_get_{site_name}_urls' - get_funcs = [i for i in dir(sys.modules[__name__]) if i.startswith('_get_')] - if func_name in get_funcs: - get_urls = getattr(sys.modules[__name__], func_name) - try: - urls = get_urls() - return _choose_one(urls) - except: - return '' - else: - raise Exception("Dont't know how to get proxy-free url for " + site_name) - - -def _choose_one(urls) -> str: - for url in urls: - if is_connectable(url, timeout=5): - return url - return '' - - -def _get_avsox_urls() -> list: - html = get_html('https://tellme.pw/avsox') - urls = html.xpath('//h4/strong/a/@href') - return urls - - -def _get_javbus_urls() -> list: - html = get_html('https://www.javbus.one/') - text = html.text_content() - urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A) - return urls - - -def _get_javlib_urls() -> list: - html = 
get_html('https://github.com/javlibcom') - text = html.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content() - match = re.search(r'[\w\.]+', text, re.A) - if match: - domain = f'https://www.{match.group(0)}.com' - return [domain] - - -def _get_javdb_urls() -> list: - html = get_html('https://jav524.app') - js_links = html.xpath("//script[@src]/@src") - for link in js_links: - if '/js/index' in link: - text = get_resp_text(request_get(link)) - match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A) - if match: - return [match.group(1)] - - -if __name__ == "__main__": - print('javdb:\t', _get_javdb_urls()) - print('javlib:\t', _get_javlib_urls()) diff --git a/poetry.lock b/poetry.lock index 1c92293a3..f9b1b8d77 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,21 @@ # This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +[[package]] +name = "aiofiles" +version = "24.1.0" +description = "File support for asyncio." +optional = false +python-versions = ">=3.8" +files = [ + {file = "aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5"}, + {file = "aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "annotated-types" version = "0.7.0" @@ -16,6 +32,33 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "anyio" +version = "4.6.0" +description = "High level compatibility layer for multiple asynchronous event loop implementations" +optional = false +python-versions = ">=3.9" +files = [ + {file = "anyio-4.6.0-py3-none-any.whl", hash = "sha256:c7d2e9d63e31599eeb636c8c5c03a7e108d73b345f064f1c19fdc87b79036a9a"}, + {file = "anyio-4.6.0.tar.gz", hash = "sha256:137b4559cbb034c477165047febb6ff83f390fc3b20bf181c1fc0a728cb8beeb"}, +] + +[package.dependencies] +exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +idna = ">=2.8" +sniffio = ">=1.1" +typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""} + +[package.extras] +doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"] +test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.21.0b1)"] +trio = ["trio (>=0.26.1)"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "certifi" version = "2024.8.30" @@ -116,131 +159,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "charset-normalizer" -version = "3.3.2" -description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 
-optional = false -python-versions = ">=3.7.0" -files = [ - {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"}, - {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"}, - {file = 
"charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"}, - {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"}, - {file = 
"charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"}, - {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"}, - {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"}, - {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = 
"sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"}, - {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"}, - {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, -] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - -[[package]] -name = "cloudscraper" -version = "1.2.71" -description = "A Python module to bypass Cloudflare's anti-bot page." -optional = false -python-versions = "*" -files = [ - {file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"}, - {file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"}, -] - -[package.dependencies] -pyparsing = ">=2.4.7" -requests = ">=2.9.2" -requests-toolbelt = ">=0.9.1" - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "colorama" version = "0.4.4" @@ -570,6 +488,79 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "h11" +version = "0.14.0" +description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1" +optional = false +python-versions = ">=3.7" +files = [ + {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"}, + {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "httpcore" +version = "1.0.5" +description = "A minimal low-level HTTP client." +optional = false +python-versions = ">=3.8" +files = [ + {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"}, + {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"}, +] + +[package.dependencies] +certifi = "*" +h11 = ">=0.13,<0.15" + +[package.extras] +asyncio = ["anyio (>=4.0,<5.0)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +trio = ["trio (>=0.22.0,<0.26.0)"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "httpx" +version = "0.27.2" +description = "The next generation HTTP client." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"}, + {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"}, +] + +[package.dependencies] +anyio = "*" +certifi = "*" +httpcore = "==1.*" +idna = "*" +sniffio = "*" +socksio = {version = "==1.*", optional = true, markers = "extra == \"socks\""} + +[package.extras] +brotli = ["brotli", "brotlicffi"] +cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"] +http2 = ["h2 (>=3,<5)"] +socks = ["socksio (==1.*)"] +zstd = ["zstandard (>=0.18.0)"] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "idna" version = "3.10" @@ -1428,25 +1419,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "pyparsing" -version = "3.1.4" -description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false -python-versions = ">=3.6.8" -files = [ - {file = "pyparsing-3.1.4-py3-none-any.whl", hash = "sha256:a6a7ee4235a3f944aa1fa2249307708f893fe5717dc603503c6c7969c070fb7c"}, - {file = "pyparsing-3.1.4.tar.gz", hash = "sha256:f86ec8d1a83f11977c9a6ea7598e8c27fc5cddfa5b07ea2241edbbde1d7bc032"}, -] - -[package.extras] -diagrams = ["jinja2", "railroad-diagrams"] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "pytest" version = "8.3.3" @@ -1623,51 +1595,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "requests" -version = "2.31.0" -description = "Python HTTP for Humans." 
-optional = false -python-versions = ">=3.7" -files = [ - {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, - {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, -] - -[package.dependencies] -certifi = ">=2017.4.17" -charset-normalizer = ">=2,<4" -idna = ">=2.5,<4" -urllib3 = ">=1.21.1,<3" - -[package.extras] -socks = ["PySocks (>=1.5.6,!=1.5.7)"] -use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - -[[package]] -name = "requests-toolbelt" -version = "1.0.0" -description = "A utility belt for advanced users of python-requests" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" -files = [ - {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"}, - {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"}, -] - -[package.dependencies] -requests = ">=2.0.1,<3.0.0" - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "setuptools" version = "75.1.0" @@ -1748,6 +1675,38 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" +[[package]] +name = "sniffio" +version = "1.3.1" +description = "Sniff out which async library your code is running under" +optional = false +python-versions = ">=3.7" +files = [ + {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"}, + {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + +[[package]] +name = "socksio" +version = "1.0.0" +description = "Sans-I/O implementation of SOCKS4, SOCKS4A, and SOCKS5." +optional = false +python-versions = ">=3.6" +files = [ + {file = "socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3"}, + {file = "socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac"}, +] + +[package.source] +type = "legacy" +url = "https://pypi.tuna.tsinghua.edu.cn/simple" +reference = "mirrors" + [[package]] name = "time-machine" version = "2.15.0" @@ -1992,28 +1951,6 @@ type = "legacy" url = "https://pypi.tuna.tsinghua.edu.cn/simple" reference = "mirrors" -[[package]] -name = "urllib3" -version = "2.2.3" -description = "HTTP library with thread-safe connection pooling, file post, and more." 
-optional = false -python-versions = ">=3.8" -files = [ - {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, - {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, -] - -[package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] -h2 = ["h2 (>=4,<5)"] -socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] -zstd = ["zstandard (>=0.18.0)"] - -[package.source] -type = "legacy" -url = "https://pypi.tuna.tsinghua.edu.cn/simple" -reference = "mirrors" - [[package]] name = "zipp" version = "3.20.2" @@ -2041,4 +1978,4 @@ reference = "mirrors" [metadata] lock-version = "2.0" python-versions = "<3.13,>=3.10" -content-hash = "056b2f7a21b0286a04a5ecadb809f6472c636348fe07976ac42c9c47c620f04c" +content-hash = "3c98b4c2562b1cc5d88474d6962ab34e60be1be488d840c691c0d0e1095d7285" diff --git a/pyproject.toml b/pyproject.toml index a5e1b4d10..a74d2bc1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,11 +13,9 @@ format = "v{base}.{distance}" [tool.poetry.dependencies] python = "<3.13,>=3.10" -cloudscraper = "1.2.71" colorama = "0.4.4" pillow = "10.2.0" pretty-errors = "1.2.19" -requests = "2.31.0" tqdm = "4.59.0" # https://stackoverflow.com/questions/446209/possible-values-from-sys-platform pywin32 = {version = "^306", markers = "sys_platform == 'win32'"} @@ -29,6 +27,8 @@ confz = "^2.0.1" pydantic-extra-types = "^2.9.0" pendulum = "^3.0.0" slimeface = "^2024.9.27" +httpx = {extras = ["socks"], version = "^0.27.2"} +aiofiles = "^24.1.0" [tool.poetry.scripts] javsp = "javsp.__main__:entry" diff --git a/tools/config_migration.py b/tools/config_migration.py index 95adc45d6..f08f9ed67 100644 --- a/tools/config_migration.py +++ b/tools/config_migration.py @@ -76,13 +76,16 @@ def fix_pat(p): # 设置代理服务器地址,支持 http, socks5/socks5h 代理,比如'http://127.0.0.1:1080' # null表示禁用代理 proxy_server: {'null' if proxy_disabled else f"'{cfg['Network']['proxy']}'"} - # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置 - proxy_free: -{'\n'.join([f" {id}: '{url}'" for id, url in dict(cfg['ProxyFree']).items()])} # 网络问题导致抓取数据失败时的重试次数,通常3次就差不多了 - retry: {cfg['Network']['retry']} + retries: {cfg['Network']['retry']} # https://en.wikipedia.org/wiki/ISO_8601#Durations timeout: PT{cfg['Network']['timeout']}S + # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置 + unproxied: [{ + ', '.join(dict(cfg['ProxyFree']).values()) +}] + fallback: +{'\n'.join([f" {id}: ['{url}']" for id, url in dict(cfg['ProxyFree']).items()])} ################################ crawler: @@ -100,8 +103,6 @@ def fix_pat(p): hardworking: {yes_to_true(cfg['Crawler']['hardworking_mode'])} # 使用网页番号作为最终番号(启用时会对番号大小写等进行更正) respect_site_avid: {yes_to_true(cfg['Crawler']['respect_site_avid'])} - # fc2fan已关站。如果你有镜像,请设置本地镜像文件夹的路径,此文件夹内要有类似'FC2-12345.html'的网页文件 - fc2fan_local_path: '{cfg['Crawler']['fc2fan_local_path']}' # 刮削一部电影后的等待时间(设置为0禁用此功能) # https://en.wikipedia.org/wiki/ISO_8601#Durations sleep_after_scraping: PT{cfg['Crawler']['sleep_after_scraping']}S diff --git a/unittest/test_proxyfree.py b/unittest/test_proxyfree.py index 1537d93ad..65151a9d4 100644 --- a/unittest/test_proxyfree.py +++ b/unittest/test_proxyfree.py @@ -1,18 +1,25 @@ -import os -import sys - -sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) -from javsp.web.proxyfree import * +import asyncio +import tracemalloc +from javsp.crawlers.proxyfree import get_proxy_free_url +from javsp.config import CrawlerID def test_get_url(): - assert get_proxy_free_url('javlib') 
!= '' - assert get_proxy_free_url('javdb') != '' + async def wrap(): + assert await get_proxy_free_url(CrawlerID.javlib) is not None + assert await get_proxy_free_url(CrawlerID.javdb) is not None + asyncio.run(wrap()) def test_get_url_with_prefer(): - prefer_url = 'https://www.baidu.com' - assert prefer_url == get_proxy_free_url('javlib', prefer_url) + async def wrap(): + prefer_url = 'https://www.baidu.com' + assert prefer_url == await get_proxy_free_url(CrawlerID.javlib, prefer_url) + asyncio.run(wrap()) if __name__ == "__main__": - print(get_proxy_free_url('javlib')) + async def aentry(): + print(await get_proxy_free_url(CrawlerID.javlib)) + + tracemalloc.start() + asyncio.run(aentry(), debug=True)