diff --git a/config.yml b/config.yml
index 53fac4863..7d8790195 100644
--- a/config.yml
+++ b/config.yml
@@ -25,16 +25,24 @@ network:
# 设置代理服务器地址,支持 http, socks5/socks5h 代理,比如'http://127.0.0.1:1080'
# null表示禁用代理
proxy_server: null
- # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置
- proxy_free:
- avsox: 'https://avsox.click'
- javbus: 'https://www.seedmm.help'
- javdb: 'https://javdb368.com'
- javlib: 'https://www.y78k.com'
# 网络问题导致抓取数据失败时的重试次数,通常3次就差不多了
- retry: 3
+ retries: 3
# https://en.wikipedia.org/wiki/ISO_8601#Durations
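+  # 例如: PT10S 表示10秒, PT1M30S 表示1分30秒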
timeout: PT10S
+  # 对列表中的地址不使用代理服务器(如果启用了代理的话)
+ unproxied: [
+ 'https://www.seedmm.help',
+ 'https://javdb368.com',
+ 'https://www.y78k.com',
+ 'https://www.javbus.one',
+ 'https://www.tellme.pw',
+ ]
+  # 各个站点的备用地址。
+ # JavSP会按顺序尝试列表里的每一个服务器,如果都不行会使用默认的主站点地址
+ fallback:
+ javbus: ['https://www.seedmm.help']
+ javdb: ['https://javdb368.com']
+ javlib: ['https://www.y78k.com']
################################
crawler:
@@ -52,8 +60,6 @@ crawler:
hardworking: true
# 使用网页番号作为最终番号(启用时会对番号大小写等进行更正)
respect_site_avid: true
- # fc2fan已关站。如果你有镜像,请设置本地镜像文件夹的路径,此文件夹内要有类似'FC2-12345.html'的网页文件
- fc2fan_local_path: null
# 刮削一部电影后的等待时间(设置为0禁用此功能)
# https://en.wikipedia.org/wiki/ISO_8601#Durations
sleep_after_scraping: PT1S
diff --git a/javsp/__main__.py b/javsp/__main__.py
index 7771170e7..456bbebf8 100644
--- a/javsp/__main__.py
+++ b/javsp/__main__.py
@@ -3,13 +3,14 @@
import sys
import json
import time
+import asyncio
import logging
from PIL import Image
from pydantic import ValidationError
+from pydantic_core import Url
from pydantic_extra_types.pendulum_dt import Duration
-import requests
-import threading
-from typing import Dict, List
+from typing import Any, Coroutine, Dict, List
+from javsp.crawlers.all import crawlers
sys.stdout.reconfigure(encoding='utf-8')
@@ -23,7 +24,7 @@
from javsp.print import TqdmOut
-from javsp.cropper import Cropper, get_cropper
+from javsp.cropper import get_cropper
# 将StreamHandler的stream修改为TqdmOut,以与Tqdm协同工作
@@ -41,11 +42,11 @@
from javsp.func import *
from javsp.image import *
from javsp.datatype import Movie, MovieInfo
-from javsp.web.base import download
-from javsp.web.exceptions import *
-from javsp.web.translate import translate_movie_info
+from javsp.network.utils import url_download
+from javsp.crawlers.exceptions import *
+from javsp.translate import translate_movie_info
-from javsp.config import Cfg, CrawlerID
+from javsp.config import Cfg, CrawlerID, UseJavDBCover
actressAliasMap = {}
@@ -57,86 +58,49 @@ def resolve_alias(name):
return name # 如果找不到别名对应的固定名字,则返回原名
-def import_crawlers():
- """按配置文件的抓取器顺序将该字段转换为抓取器的函数列表"""
- unknown_mods = []
- for _, mods in Cfg().crawler.selection.items():
- valid_mods = []
- for name in mods:
- try:
- # 导入fc2fan抓取器的前提: 配置了fc2fan的本地路径
- # if name == 'fc2fan' and (not os.path.isdir(Cfg().Crawler.fc2fan_local_path)):
- # logger.debug('由于未配置有效的fc2fan路径,已跳过该抓取器')
- # continue
- import_name = 'javsp.web.' + name
- __import__(import_name)
- valid_mods.append(import_name) # 抓取器有效: 使用完整模块路径,便于程序实际使用
- except ModuleNotFoundError:
- unknown_mods.append(name) # 抓取器无效: 仅使用模块名,便于显示
- if unknown_mods:
- logger.warning('配置的抓取器无效: ' + ', '.join(unknown_mods))
-
-
-# 爬虫是IO密集型任务,可以通过多线程提升效率
+# 爬虫是IO密集型任务,可以通过协程并发提升效率
-def parallel_crawler(movie: Movie, tqdm_bar=None):
+async def parallel_crawler(movie: Movie, tqdm_bar=None) -> dict[CrawlerID, MovieInfo]:
"""使用多线程抓取不同网站的数据"""
- def wrapper(parser, info: MovieInfo, retry):
+
+    async def wrapper(id: CrawlerID, info: MovieInfo) -> None:
"""对抓取器函数进行包装,便于更新提示信息和自动重试"""
- crawler_name = threading.current_thread().name
- task_info = f'Crawler: {crawler_name}: {info.dvdid}'
- for cnt in range(retry):
- try:
- parser(info)
- movie_id = info.dvdid or info.cid
- logger.debug(f"{crawler_name}: 抓取成功: '{movie_id}': '{info.url}'")
- setattr(info, 'success', True)
- if isinstance(tqdm_bar, tqdm):
- tqdm_bar.set_description(f'{crawler_name}: 抓取完成')
- break
- except MovieNotFoundError as e:
- logger.debug(e)
- break
- except MovieDuplicateError as e:
- logger.exception(e)
- break
- except (SiteBlocked, SitePermissionError, CredentialError) as e:
- logger.error(e)
- break
- except requests.exceptions.RequestException as e:
- logger.debug(f'{crawler_name}: 网络错误,正在重试 ({cnt+1}/{retry}): \n{repr(e)}')
- if isinstance(tqdm_bar, tqdm):
- tqdm_bar.set_description(f'{crawler_name}: 网络错误,正在重试')
- except Exception as e:
- logger.exception(e)
+ try:
+ crawler = await crawlers[id].create()
+            await crawler.crawl_and_fill(info)
+ movie_id = info.dvdid or info.cid
+ logger.debug(f"{crawler.id.value}: 抓取成功: '{movie_id}': '{info.url}'")
+ setattr(info, 'success', True)
+ if isinstance(tqdm_bar, tqdm):
+ tqdm_bar.set_description(f'{crawler.id.value}: 抓取完成')
+ except MovieNotFoundError as e:
+ logger.debug(e)
+ except MovieDuplicateError as e:
+ logger.exception(e)
+ except (SiteBlocked, SitePermissionError, CredentialError) as e:
+ logger.error(e)
+ except Exception as e:
+ logger.exception(e)
# 根据影片的数据源获取对应的抓取器
- crawler_mods: List[CrawlerID] = Cfg().crawler.selection[movie.data_src]
+ crawler_to_use: List[CrawlerID] = Cfg().crawler.selection[movie.data_src]
+
-    all_info = {i.value: MovieInfo(movie) for i in crawler_mods}
+    all_info: Dict[CrawlerID, MovieInfo] = {i: MovieInfo(movie) for i in crawler_to_use}
# 番号为cid但同时也有有效的dvdid时,也尝试使用普通模式进行抓取
if movie.data_src == 'cid' and movie.dvdid:
- crawler_mods = crawler_mods + Cfg().crawler.selection.normal
+ crawler_to_use += Cfg().crawler.selection.normal
for i in all_info.values():
i.dvdid = None
for i in Cfg().crawler.selection.normal:
all_info[i] = MovieInfo(movie.dvdid)
- thread_pool = []
- for mod_partial, info in all_info.items():
- mod = f"javsp.web.{mod_partial}"
- parser = getattr(sys.modules[mod], 'parse_data')
- # 将all_info中的info实例传递给parser,parser抓取完成后,info实例的值已经完成更新
- # TODO: 抓取器如果带有parse_data_raw,说明它已经自行进行了重试处理,此时将重试次数设置为1
- if hasattr(sys.modules[mod], 'parse_data_raw'):
- th = threading.Thread(target=wrapper, name=mod, args=(parser, info, 1))
- else:
- th = threading.Thread(target=wrapper, name=mod, args=(parser, info, Cfg().network.retry))
- th.start()
- thread_pool.append(th)
- # 等待所有线程结束
- timeout = Cfg().network.retry * Cfg().network.timeout.total_seconds()
- for th in thread_pool:
- th: threading.Thread
- th.join(timeout=timeout)
+
+ co_pool: list[Coroutine[Any, Any, None]] = []
+ for crawler_id, info in all_info.items():
+ co_pool.append(wrapper(crawler_id, info))
+
+ # 等待所有协程结束
+ await asyncio.gather(*co_pool)
+
# 根据抓取结果更新影片类型判定
if movie.data_src == 'cid' and movie.dvdid:
titles = [all_info[i].title for i in Cfg().crawler.selection[movie.data_src]]
@@ -148,22 +112,22 @@ def wrapper(parser, info: MovieInfo, retry):
movie.data_src = 'normal'
movie.cid = None
all_info = {k: v for k, v in all_info.items() if k not in Cfg().crawler.selection['cid']}
+
# 删除抓取失败的站点对应的数据
all_info = {k:v for k,v in all_info.items() if hasattr(v, 'success')}
for info in all_info.values():
del info.success
- # 删除all_info中键名中的'web.'
- all_info = {k[4:]:v for k,v in all_info.items()}
+
return all_info
-def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
+def info_summary(movie: Movie, all_info: Dict[CrawlerID, MovieInfo]):
"""汇总多个来源的在线数据生成最终数据"""
final_info = MovieInfo(movie)
########## 部分字段配置了专门的选取逻辑,先处理这些字段 ##########
# genre
- if 'javdb' in all_info and all_info['javdb'].genre:
- final_info.genre = all_info['javdb'].genre
+    if CrawlerID.javdb in all_info and all_info[CrawlerID.javdb].genre:
+ final_info.genre = all_info[CrawlerID.javdb].genre
########## 移除所有抓取器数据中,标题尾部的女优名 ##########
if Cfg().summarizer.title.remove_trailing_actor_name:
@@ -197,7 +161,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
setattr(final_info, attr, incoming)
absorbed.append(attr)
if absorbed:
- logger.debug(f"从'{name}'中获取了字段: " + ' '.join(absorbed))
+ logger.debug(f"从'{name.value}'中获取了字段: " + ' '.join(absorbed))
# 使用网站的番号作为番号
if Cfg().crawler.respect_site_avid:
id_weight = {}
@@ -216,7 +180,7 @@ def info_summary(movie: Movie, all_info: Dict[str, MovieInfo]):
else:
final_info.cid = final_id
# javdb封面有水印,优先采用其他站点的封面
- javdb_cover = getattr(all_info.get('javdb'), 'cover', None)
+ javdb_cover = getattr(all_info.get(CrawlerID.javdb), 'cover', None)
if javdb_cover is not None:
match Cfg().crawler.use_javdb_cover:
case UseJavDBCover.fallback:
@@ -402,7 +366,7 @@ def should_use_ai_crop_match(label):
fanart_cropped = add_label_to_poster(fanart_cropped, UNCENSORED_MARK_FILE, LabelPostion.BOTTOM_LEFT)
fanart_cropped.save(movie.poster_file)
-def RunNormalMode(all_movies):
+async def RunNormalMode(all_movies):
"""普通整理模式"""
def check_step(result, msg='步骤错误'):
"""检查一个整理步骤的结果,并负责更新tqdm的进度"""
@@ -427,7 +391,7 @@ def check_step(result, msg='步骤错误'):
inner_bar = tqdm(total=total_step, desc='步骤', ascii=True, leave=False)
# 依次执行各个步骤
inner_bar.set_description(f'启动并发任务')
- all_info = parallel_crawler(movie, inner_bar)
+ all_info = await parallel_crawler(movie, inner_bar)
msg = f'为其配置的{len(Cfg().crawler.selection[movie.data_src])}个抓取器均未获取到影片信息'
check_step(all_info, msg)
@@ -447,9 +411,9 @@ def check_step(result, msg='步骤错误'):
inner_bar.set_description('下载封面图片')
if Cfg().summarizer.cover.highres:
- cover_dl = download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers)
+ cover_dl = await download_cover(movie.info.covers, movie.fanart_file, movie.info.big_covers)
else:
- cover_dl = download_cover(movie.info.covers, movie.fanart_file)
+ cover_dl = await download_cover(movie.info.covers, movie.fanart_file)
check_step(cover_dl, '下载封面图片失败')
cover, pic_path = cover_dl
# 确保实际下载的封面的url与即将写入到movie.info中的一致
@@ -476,12 +440,12 @@ def check_step(result, msg='步骤错误'):
fanart_destination = f"{extrafanartdir}/{id}.png"
try:
- info = download(pic_url, fanart_destination)
+ info = await url_download(Url(pic_url), fanart_destination)
if valid_pic(fanart_destination):
filesize = get_fmt_size(pic_path)
width, height = get_pic_size(pic_path)
- elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed']))
- speed = get_fmt_size(info['rate']) + '/s'
+ elapsed = str(info.elapsed)
+ speed = f"{info.get_rate()}Mbps"
logger.info(f"已下载剧照{pic_url} {id}.png: {width}x{height}, {filesize} [{elapsed}, {speed}]")
else:
check_step(False, f"下载剧照{id}: {pic_url}失败")
@@ -512,38 +476,29 @@ def check_step(result, msg='步骤错误'):
return return_movies
-def download_cover(covers, fanart_path, big_covers=[]):
+async def download_cover(covers, fanart_path, big_covers=[]):
"""下载封面图片"""
# 优先下载高清封面
for url in big_covers:
pic_path = get_pic_path(fanart_path, url)
- for _ in range(Cfg().network.retry):
- try:
- info = download(url, pic_path)
- if valid_pic(pic_path):
- filesize = get_fmt_size(pic_path)
- width, height = get_pic_size(pic_path)
- elapsed = time.strftime("%M:%S", time.gmtime(info['elapsed']))
- speed = get_fmt_size(info['rate']) + '/s'
- logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]")
- return (url, pic_path)
- except requests.exceptions.HTTPError:
- # HTTPError通常说明猜测的高清封面地址实际不可用,因此不再重试
- break
+ info = await url_download(Url(url), pic_path)
+ if valid_pic(pic_path):
+ filesize = get_fmt_size(pic_path)
+ width, height = get_pic_size(pic_path)
+ elapsed = str(info.elapsed)
+ speed = f"{info.get_rate()}Mbps"
+ logger.info(f"已下载高清封面: {width}x{height}, {filesize} [{elapsed}, {speed}]")
+ return (url, pic_path)
# 如果没有高清封面或高清封面下载失败
for url in covers:
pic_path = get_pic_path(fanart_path, url)
- for _ in range(Cfg().network.retry):
- try:
- download(url, pic_path)
- if valid_pic(pic_path):
- logger.debug(f"已下载封面: '{url}'")
- return (url, pic_path)
- else:
- logger.debug(f"图片无效或已损坏: '{url}',尝试更换下载地址")
- break
- except Exception as e:
- logger.debug(e, exc_info=True)
+ await url_download(Url(url), pic_path)
+ if valid_pic(pic_path):
+ logger.debug(f"已下载封面: '{url}'")
+ return (url, pic_path)
+        else:
+            logger.debug(f"图片无效或已损坏: '{url}',尝试更换下载地址")
logger.error(f"下载封面图片失败")
logger.debug('big_covers:'+str(big_covers) + ', covers'+str(covers))
return None
@@ -558,14 +513,7 @@ def get_pic_path(fanart_path, url):
pic_path = fanart_base + "." + pic_extend
return pic_path
-def error_exit(success, err_info):
- """检查业务逻辑是否成功完成,如果失败则报错退出程序"""
- if not success:
- logger.error(err_info)
- sys.exit(1)
-
-
-def entry():
+async def aentry():
try:
Cfg()
except ValidationError as e:
@@ -583,22 +531,28 @@ def entry():
# 检查更新
version_info = 'JavSP ' + getattr(sys, 'javsp_version', '未知版本/从代码运行')
logger.debug(version_info.center(60, '='))
- check_update(Cfg().other.check_update, Cfg().other.auto_update)
+ await check_update(Cfg().other.check_update, Cfg().other.auto_update)
root = get_scan_dir(Cfg().scanner.input_directory)
- error_exit(root, '未选择要扫描的文件夹')
+ if root is None:
+ logger.error('未选择要扫描的文件夹')
+ sys.exit(1)
# 导入抓取器,必须在chdir之前
- import_crawlers()
os.chdir(root)
print(f'扫描影片文件...')
recognized = scan_movies(root)
movie_count = len(recognized)
recognize_fail = []
- error_exit(movie_count, '未找到影片文件')
+ if movie_count == 0:
+ logger.error('未找到影片文件')
+ sys.exit(1)
logger.info(f'扫描影片文件:共找到 {movie_count} 部影片')
- RunNormalMode(recognized + recognize_fail)
+ await RunNormalMode(recognized + recognize_fail)
sys.exit(0)
+def entry():
+ asyncio.run(aentry(), debug=True)
+
if __name__ == "__main__":
entry()
diff --git a/javsp/config.py b/javsp/config.py
index 3fbc8f071..e87b5dc28 100644
--- a/javsp/config.py
+++ b/javsp/config.py
@@ -39,9 +39,10 @@ class CrawlerID(str, Enum):
class Network(BaseConfig):
proxy_server: Url | None
- retry: NonNegativeInt = 3
+ retries: NonNegativeInt = 3
timeout: Duration
- proxy_free: Dict[CrawlerID, Url]
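+    # unproxied: 不走代理直连的地址列表; fallback: 各站点的备用地址(与config.yml中的同名配置对应)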
+ unproxied: List[Url]
+ fallback: Dict[CrawlerID, List[str]]
class CrawlerSelect(BaseConfig):
def items(self) -> List[tuple[str, list[CrawlerID]]]:
@@ -109,7 +110,6 @@ class Crawler(BaseConfig):
required_keys: list[MovieInfoField]
hardworking: bool
respect_site_avid: bool
- fc2fan_local_path: Path | None
sleep_after_scraping: Duration
use_javdb_cover: UseJavDBCover
normalize_actress_name: bool
diff --git a/javsp/crawlers/all.py b/javsp/crawlers/all.py
new file mode 100644
index 000000000..8c262ecc1
--- /dev/null
+++ b/javsp/crawlers/all.py
@@ -0,0 +1,30 @@
+from collections.abc import Coroutine
+from typing import Any, Dict
+from javsp.config import CrawlerID
+from javsp.crawlers.interface import Crawler
+from javsp.crawlers.sites import \
+ airav, arzon, arzon_iv, avsox, avwiki, dl_getchu, fanza, fc2, fc2ppvdb, \
+ gyutto, jav321, javbus, javdb, javlib, javmenu, mgstage, njav, prestige
+
+__all__ = ['crawlers']
+
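+# CrawlerID 到抓取器类的映射: 调用方通过 await crawlers[id].create() 构造对应的抓取器实例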
+crawlers: Dict[CrawlerID, type[Crawler]] = {
+ CrawlerID.airav: airav. AiravCrawler,
+ CrawlerID.arzon: arzon. ArzonCrawler,
+ CrawlerID.arzon_iv: arzon_iv. ArzonIvCrawler,
+ CrawlerID.avsox: avsox. AvsoxCrawler,
+ CrawlerID.avwiki: avwiki. AvWikiCrawler,
+ CrawlerID.dl_getchu: dl_getchu.DlGetchuCrawler,
+ CrawlerID.fanza: fanza. FanzaCrawler,
+ CrawlerID.fc2: fc2. Fc2Crawler,
+ CrawlerID.fc2ppvdb: fc2ppvdb. Fc2PpvDbCrawler,
+ CrawlerID.gyutto: gyutto. GyuttoCrawler,
+ CrawlerID.jav321: jav321. Jav321Crawler,
+ CrawlerID.javbus: javbus. JavbusCrawler,
+ CrawlerID.javdb: javdb. JavDbCrawler,
+ CrawlerID.javlib: javlib. JavLibCrawler,
+ CrawlerID.javmenu: javmenu. JavMenuCrawler,
+ CrawlerID.mgstage: mgstage. MgstageCrawler,
+ CrawlerID.njav: njav. NjavCrawler,
+ CrawlerID.prestige: prestige. PrestigeCrawler,
+}
diff --git a/javsp/web/exceptions.py b/javsp/crawlers/exceptions.py
similarity index 100%
rename from javsp/web/exceptions.py
rename to javsp/crawlers/exceptions.py
diff --git a/javsp/crawlers/interface.py b/javsp/crawlers/interface.py
new file mode 100644
index 000000000..a641b0a27
--- /dev/null
+++ b/javsp/crawlers/interface.py
@@ -0,0 +1,21 @@
+from httpx import AsyncClient
+from javsp.config import CrawlerID
+from javsp.datatype import MovieInfo
+from abc import ABC, abstractmethod
+from typing import Self
+
+
+class Crawler(ABC):
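+    """站点抓取器的异步基类: 先通过create()构造并初始化实例,再调用crawl_and_fill()抓取数据"""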
+ base_url: str
+ client: AsyncClient
+ id: CrawlerID
+
+
+ @classmethod
+ @abstractmethod
+ async def create(cls) -> Self:
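+        """创建抓取器实例并完成异步初始化(如确定站点地址、构造HTTP客户端)"""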
+ pass
+
+ @abstractmethod
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
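+        """抓取并解析指定番号的影片数据,结果直接写入movie"""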
+ pass
diff --git a/javsp/crawlers/proxyfree.py b/javsp/crawlers/proxyfree.py
new file mode 100644
index 000000000..381eeb7af
--- /dev/null
+++ b/javsp/crawlers/proxyfree.py
@@ -0,0 +1,98 @@
+"""获取各个网站的免代理地址"""
+from collections.abc import Callable, Coroutine
+import re
+from typing import Any, Dict
+
+from pydantic_core import Url
+from pydantic_extra_types.pendulum_dt import Duration
+from lxml import html
+
+from javsp.config import CrawlerID
+from javsp.network.utils import test_connect, choose_one_connectable
+from javsp.network.client import get_client
+
+
+async def _get_avsox_urls() -> list[str]:
+ link = 'https://tellme.pw/avsox'
+ client = get_client(Url(link))
+ resp = await client.get(link)
+ tree = html.fromstring(resp.text)
+ urls = tree.xpath('//h4/strong/a/@href')
+ return urls
+
+
+async def _get_javbus_urls() -> list[str]:
+ link = 'https://www.javbus.one/'
+ client = get_client(Url(link))
+ resp = await client.get(link)
+ text = resp.text
+ urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A)
+ return urls
+
+
+async def _get_javlib_urls() -> list[str]:
+ link = 'https://github.com/javlibcom'
+ client = get_client(Url(link))
+ resp = await client.get(link)
+ tree = html.fromstring(resp.text)
+ text = tree.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content()
+ match = re.search(r'[\w\.]+', text, re.A)
+ if match:
+ domain = f'https://www.{match.group(0)}.com'
+ return [domain]
+ return []
+
+
+async def _get_javdb_urls() -> list[str]:
+ root_link = 'https://jav524.app'
+ client = get_client(Url(root_link))
+ resp = await client.get(root_link)
+ tree = html.fromstring(resp.text)
+ js_links = tree.xpath("//script[@src]/@src")
+ for link in js_links:
+ if '/js/index' in link:
+ link = root_link + link
+ resp = await client.get(link)
+ text = resp.text
+ match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A)
+ if match:
+ return [match.group(1)]
+ return []
+
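+# 各站点对应的免代理地址获取函数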
+proxy_free_fns: Dict[CrawlerID, Callable[[], Coroutine[Any, Any, list[str]]]] = {
+ CrawlerID.avsox: _get_avsox_urls,
+ CrawlerID.javdb: _get_javdb_urls,
+ CrawlerID.javbus: _get_javbus_urls,
+ CrawlerID.javlib: _get_javlib_urls,
+}
+
+async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None = None) -> str | None:
+ """获取指定网站的免代理地址
+ Args:
+        site_name (CrawlerID): 站点名称
+ prefer_url (str, optional): 优先测试此url是否可用
+ Returns:
+        str | None: 指定站点的免代理地址(获取失败时返回None)
+ """
+ if prefer_url and await test_connect(prefer_url, Duration(seconds=5)):
+ return prefer_url
+
+ if site_name in proxy_free_fns:
+ try:
+ urls = await proxy_free_fns[site_name]()
+ return await choose_one_connectable(urls)
+        except Exception:
+ return None
+ else:
+ raise Exception("Dont't know how to get proxy-free url for " + site_name)
+
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ print('javdb:\t', await _get_javdb_urls())
+ print('javlib:\t', await _get_javlib_urls())
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/airav.py b/javsp/crawlers/sites/airav.py
new file mode 100644
index 000000000..5afd46998
--- /dev/null
+++ b/javsp/crawlers/sites/airav.py
@@ -0,0 +1,124 @@
+"""从airav抓取数据"""
+import re
+from html import unescape
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.network.client import get_client
+from javsp.network.utils import resolve_site_fallback
+from javsp.config import Cfg, CrawlerID
+from javsp.datatype import MovieInfo
+from javsp.crawlers.interface import Crawler
+
+
+class AiravCrawler(Crawler):
+ id = CrawlerID.airav
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.airav.wiki')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ self.client.headers['Accept-Language'] = 'zh-TW,zh;q=0.9'
+ return self
+
+ async def search_movie(self, dvdid: str):
+ """通过搜索番号获取指定的影片在网站上的ID"""
+ # 部分影片的ID并不直接等于番号(如012717-360),此时需要尝试通过搜索来寻找影片
+ page = 0
+ count = 1
+ result = []
+ while len(result) < count:
+ url = f'{self.base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}'
+ response = await self.client.get(url)
+ resp = response.json()
+ # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"}
+ if resp['result']:
+ result.extend(resp['result'])
+ count = resp['count']
+ page += 1
+ else: # 结果为空,结束循环
+ break
+ # 如果什么都没搜索到,直接返回
+ if not result:
+ raise MovieNotFoundError(__name__, dvdid)
+ # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472')
+ result.sort(key=lambda x:x['barcode'])
+ # 从所有搜索结果中选择最可能的番号,返回它的URL
+ target = dvdid.replace('-', '_')
+ for item in result:
+ # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''}
+ barcode = item['barcode'].replace('-', '_')
+ if target in barcode:
+ return item['barcode']
+ raise MovieNotFoundError(__name__, dvdid, result)
+
+
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据
+ url = f'{self.base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
+ response = await self.client.get(url)
+ resp_json = response.json()
+ # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息
+ if resp_json['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid):
+ barcode = await self.search_movie(movie.dvdid)
+ if barcode:
+ url = f'{self.base_url}/api/video/barcode/{barcode}?lng=zh-TW'
+ response = await self.client.get(url)
+ resp_json = response.json()
+
+ if resp_json['count'] == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid, resp_json)
+
+ # 从API返回的数据中提取需要的字段
+ # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展
+ data = resp_json['result']
+ dvdid = data['barcode']
+ movie.dvdid = dvdid
+ movie.url = self.base_url + '/video/' + dvdid
+ # plot和title中可能含有HTML的转义字符,需要进行解转义处理
+ movie.plot = unescape(data['description']) or None
+ movie.cover = data['img_url']
+ # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id
+ movie.genre = [i['name'] for i in data['tags']]
+ movie.title = unescape(data['name'])
+ movie.actress = [i['name'] for i in data['actors']]
+ movie.publish_date = data['publish_date']
+ movie.preview_pics = data['images'] or []
+ if data['factories']:
+ movie.producer = data['factories'][0]['name']
+
+ if Cfg().crawler.hardworking:
+ # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472')
+ video_url = f"{self.base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
+ response = await self.client.get(video_url)
+ resp = response.json()
+ # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'}
+ if 'data' in resp:
+ # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址
+ # TODO: 发现部分影片(如080719-976)的传统格式预览片错误
+ movie.preview_video = resp['data'].get('url')
+
+ # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确
+ for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'):
+ if movie.title and keyword in movie.title:
+ movie.title = None
+ movie.genre = []
+ if movie.plot and keyword in movie.plot:
+ movie.plot = None
+ movie.genre = []
+ if not any([movie.title, movie.plot, movie.genre]):
+ break
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await AiravCrawler.create()
+ movie = MovieInfo("DSAD-938")
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/arzon.py b/javsp/crawlers/sites/arzon.py
new file mode 100644
index 000000000..f4887f4d7
--- /dev/null
+++ b/javsp/crawlers/sites/arzon.py
@@ -0,0 +1,105 @@
+"""从arzon抓取数据"""
+import re
+
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from javsp.crawlers.exceptions import *
+from javsp.datatype import MovieInfo
+from lxml import html
+
+class ArzonCrawler(Crawler):
+ id = CrawlerID.arzon
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, "https://www.arzon.jp")
+ self.base_url = str(url)
+ self.client = get_client(url)
+ # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
+ skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1"
+ await self.client.get(skip_verify_url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ full_id = movie.dvdid
+ url = f'{self.base_url}/itemlist.html?t=&m=all&s=&q={full_id}'
+ # url = f'{base_url}/imagelist.html?q={full_id}'
+
+ r = await self.client.get(url)
+ if r.status_code == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
+ data = html.fromstring(r.content)
+
+ urls = data.xpath("//h2/a/@href")
+ if len(urls) == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ item_url = self.base_url + urls[0]
+ e = await self.client.get(item_url)
+ item = html.fromstring(e.content)
+
+ title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0]
+ cover = item.xpath("//td[@align='center']//a/img/@src")[0]
+ item_text = item.xpath("//div[@class='item_text']/text()")
+ plot = [item.strip() for item in item_text if item.strip() != ''][0]
+ preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src")
+ # 使用列表推导式添加 "http:" 并去除 "m_"
+ preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr]
+
+        video_type, genre = None, None
+        container = item.xpath("//div[@class='item_register']/table//tr")
+ for row in container:
+ key = row.xpath("./td[1]/text()")[0]
+ contents = row.xpath("./td[2]//text()")
+ content = [item.strip() for item in contents if item.strip() != '']
+ index = 0
+ value = content[index] if content and index < len(content) else None
+ if key == "AV女優:":
+ movie.actress = content
+ if key == "AVメーカー:":
+ movie.producer = value
+ if key == "AVレーベル:":
+ video_type = value
+ if key == "シリーズ:":
+ movie.serial = value
+ if key == "監督:":
+ movie.director = value
+ if key == "発売日:" and value:
+ movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
+ if key == "収録時間:" and value:
+ movie.duration = re.search(r'([\d.]+)分', value).group(1)
+ if key == "品番:":
+ dvd_id = value
+ elif key == "タグ:":
+ genre = value
+
+        genres = []
+        if video_type:
+            genres.append(video_type)
+        if genre is not None:
+            genres.append(genre)
+
+ movie.genre = genres
+ movie.url = item_url
+ movie.title = title
+ movie.plot = plot
+ movie.cover = f'https:{cover}'
+ movie.preview_pics = preview_pics
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await ArzonCrawler.create()
+ movie = MovieInfo("CSCT-011")
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/arzon_iv.py b/javsp/crawlers/sites/arzon_iv.py
new file mode 100644
index 000000000..a84c97aea
--- /dev/null
+++ b/javsp/crawlers/sites/arzon_iv.py
@@ -0,0 +1,100 @@
+"""从arzon_iv抓取数据"""
+import re
+
+
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from javsp.crawlers.exceptions import *
+from javsp.datatype import MovieInfo
+from lxml import html
+
+class ArzonIvCrawler(Crawler):
+ id = CrawlerID.arzon_iv
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, "https://www.arzon.jp")
+ self.base_url = str(url)
+ self.client = get_client(url)
+ # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
+ skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1"
+ await self.client.get(skip_verify_url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ full_id = movie.dvdid
+ url = f'{self.base_url}/imagelist.html?q={full_id}'
+ # url = f'{base_url}/imagelist.html?q={full_id}'
+
+ r = await self.client.get(url)
+ if r.status_code == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
+ data = html.fromstring(r.content)
+
+ urls = data.xpath("//h2/a/@href")
+ if len(urls) == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ item_url = self.base_url + urls[0]
+ e = await self.client.get(item_url)
+ item = html.fromstring(e.content)
+
+ title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0]
+ cover = item.xpath("//td[@align='center']//a/img/@src")[0]
+ item_text = item.xpath("//div[@class='item_text']/text()")
+ plot = [item.strip() for item in item_text if item.strip() != ''][0]
+
+        video_type, genre = None, None
+        container = item.xpath("//div[@class='item_register']/table//tr")
+ for row in container:
+ key = row.xpath("./td[1]/text()")[0]
+ contents = row.xpath("./td[2]//text()")
+ content = [item.strip() for item in contents if item.strip() != '']
+ index = 0
+ value = content[index] if content and index < len(content) else None
+ if key == "タレント:":
+ movie.actress = content
+ if key == "イメージメーカー:":
+ movie.producer = value
+ if key == "イメージレーベル:":
+ video_type = value
+ if key == "監督:":
+ movie.director = value
+ if key == "発売日:" and value:
+ movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
+ if key == "収録時間:" and value:
+ movie.duration = re.search(r'([\d.]+)分', value).group(1)
+ if key == "品番:":
+ dvd_id = value
+ elif key == "タグ:":
+ genre = value
+
+        genres = []
+        if video_type:
+            genres.append(video_type)
+        if genre is not None:
+            genres.append(genre)
+
+ movie.genre = genres
+ movie.url = item_url
+ movie.title = title
+ movie.plot = plot
+ movie.cover = f'https:{cover}'
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await ArzonIvCrawler.create()
+ movie = MovieInfo("KIDM-1137B")
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/avsox.py b/javsp/crawlers/sites/avsox.py
new file mode 100644
index 000000000..47b0ea32d
--- /dev/null
+++ b/javsp/crawlers/sites/avsox.py
@@ -0,0 +1,88 @@
+"""从avsox抓取数据"""
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+class AvsoxCrawler(Crawler):
+ id = CrawlerID.avsox
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, "https://avsox.click/")
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ full_id: str = movie.dvdid
+ if full_id.startswith('FC2-'):
+ full_id = full_id.replace('FC2-', 'FC2-PPV-')
+ resp = await self.client.get(f'{self.base_url}tw/search/{full_id}')
+ tree = html.fromstring(resp.text)
+ tree.make_links_absolute(str(resp.url), resolve_base_href=True)
+ ids = tree.xpath("//div[@class='photo-info']/span/date[1]/text()")
+ urls = tree.xpath("//a[contains(@class, 'movie-box')]/@href")
+ ids_lower = list(map(str.lower, ids))
+ if full_id.lower() not in ids_lower:
+ raise MovieNotFoundError(__name__, movie.dvdid, ids)
+
+ url = urls[ids_lower.index(full_id.lower())]
+ url = url.replace('/tw/', '/cn/', 1)
+
+ # 提取影片信息
+ resp = await self.client.get(url)
+ # with open('file.html', 'wb') as f:
+ # f.write(resp.content)
+ tree = html.fromstring(resp.text)
+ container = tree.xpath("/html/body/div[@class='container']")[0]
+ title = container.xpath("h3/text()")[0]
+ cover = container.xpath("//a[@class='bigImage']/@href")[0]
+ info = container.xpath("div/div[@class='col-md-3 info']")[0]
+ dvdid = info.xpath("p/span[@style]/text()")[0]
+ publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
+ duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip()
+ producer, serial = None, None
+ producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
+ if producer_tag:
+ producer = producer_tag[0].text_content()
+ serial_tag = info.xpath("p[text()='系列:']")
+ if serial_tag:
+ serial = serial_tag[0].getnext().xpath("a/text()")[0]
+ genre = info.xpath("p/span[@class='genre']/a/text()")
+ actress = container.xpath("//a[@class='avatar-box']/span/text()")
+
+ movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-')
+ movie.url = url
+ movie.title = title.replace(dvdid, '').strip()
+ movie.cover = cover
+ movie.publish_date = publish_date
+ movie.duration = duration
+ movie.genre = genre
+ movie.actress = actress
+ if full_id.startswith('FC2-'):
+ # avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整
+ movie.producer = serial
+ else:
+ movie.producer = producer
+ movie.serial = serial
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await AvsoxCrawler.create()
+ movie = MovieInfo("082713-417")
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/avwiki.py b/javsp/crawlers/sites/avwiki.py
new file mode 100644
index 000000000..7bc2041e5
--- /dev/null
+++ b/javsp/crawlers/sites/avwiki.py
@@ -0,0 +1,82 @@
+"""从av-wiki抓取数据"""
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.crawlers.interface import Crawler
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.config import CrawlerID
+from lxml import html
+
+class AvWikiCrawler(Crawler):
+ id = CrawlerID.avwiki
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://av-wiki.net')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ movie.url = url = f'{self.base_url}/{movie.dvdid}'
+
+ resp = await self.client.get(url)
+ if resp.status_code == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ tree = html.fromstring(resp.content)
+
+ cover_tag = tree.xpath("//header/div/a[@class='image-link-border']/img")
+ if cover_tag:
+ try:
+ srcset = cover_tag[0].get('srcset').split(', ')
+ src_set_urls = {}
+ for src in srcset:
+ url, width = src.split()
+ width = int(width.rstrip('w'))
+ src_set_urls[width] = url
+ max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True)
+ movie.cover = max_pic[0][1]
+            except Exception:
+ movie.cover = cover_tag[0].get('src')
+ body = tree.xpath("//section[@class='article-body']")[0]
+ title = body.xpath("div/p/text()")[0]
+ title = title.replace(f"【{movie.dvdid}】", '')
+ cite_url = body.xpath("div/cite/a/@href")[0]
+ cite_url = cite_url.split('?aff=')[0]
+ info = body.xpath("dl[@class='dltable']")[0]
+ dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd")
+ data = {}
+ for dt_txt, dd in zip(dt_txt_ls, dd_tags):
+ dt_txt = dt_txt.strip()
+ a_tag = dd.xpath('a')
+ if len(a_tag) == 0:
+ dd_txt = dd.text.strip()
+ else:
+ dd_txt = [i.text.strip() for i in a_tag]
+ if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留
+ dd_txt = dd_txt[0]
+ data[dt_txt] = dd_txt
+
+ ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'}
+ for key, attr in ATTR_MAP.items():
+ setattr(movie, attr, data.get(key))
+ movie.title = title
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await AvWikiCrawler.create()
+ movie = MovieInfo("259LUXU-593")
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/dl_getchu.py b/javsp/crawlers/sites/dl_getchu.py
new file mode 100644
index 000000000..c2ab0814f
--- /dev/null
+++ b/javsp/crawlers/sites/dl_getchu.py
@@ -0,0 +1,131 @@
+"""从dl.getchu官网抓取数据"""
+import re
+import logging
+
+from javsp.config import CrawlerID
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.crawlers.interface import Crawler
+from javsp.network.client import get_client
+from javsp.network.utils import resolve_site_fallback
+from javsp.crawlers.exceptions import *
+from javsp.datatype import MovieInfo
+from lxml import html
+from lxml.html import HtmlElement
+
+def get_movie_title(tree: HtmlElement):
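+    """从购物车表单的表格中提取作品标题"""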
+ container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]")
+ if len(container) > 0:
+ container = container[0]
+ rows = container.xpath('.//tr')
+ title = ''
+ for row in rows:
+ for cell in row.xpath('.//td/div'):
+ # 获取单元格文本内容
+ if cell.text:
+ title = str(cell.text).strip()
+ return title
+
+
+def get_movie_img(tree: HtmlElement, getchu_id: str):
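+    """提取封面图片地址(src中包含'<getchu_id>top.jpg'的图片)"""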
+ img_src = ''
+ container = tree.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]')
+ if len(container) > 0:
+ container = container[0]
+ img_src = container.get('src')
+ return img_src
+
+
+def get_movie_preview(tree: HtmlElement, getchu_id: str):
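+    """提取预览图片地址列表(src中包含'<getchu_id>_'的图片)"""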
+ preview_pics = []
+ container = tree.xpath(f'//img[contains(@src, "{getchu_id}_")]')
+ if len(container) > 0:
+ for c in container:
+ preview_pics.append(c.get('src'))
+ return preview_pics
+
+
+DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分')
+
+
+class DlGetchuCrawler(Crawler):
+ id = CrawlerID.dl_getchu
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://dl.getchu.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # 去除番号中的'GETCHU'字样
+ id_uc = movie.dvdid.upper()
+ if not id_uc.startswith('GETCHU-'):
+ raise ValueError('Invalid GETCHU number: ' + movie.dvdid)
+ getchu_id = id_uc.replace('GETCHU-', '')
+ # 抓取网页
+ url = f'{self.base_url}/i/item{getchu_id}'
+ r = await self.client.get(url)
+ if r.status_code == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ tree = html.fromstring(r.text)
+ container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]")
+ if len(container) > 0:
+ container = container[0]
+ # 将表格提取为键值对
+ rows = container.xpath('.//table/tr')
+ kv_rows = [i for i in rows if len(i) == 2]
+ data = {}
+ for row in kv_rows:
+ # 获取单元格文本内容
+ key = row.xpath("td[@class='bluetext']/text()")[0]
+ # 是否包含a标签: 有的属性是用表示的,不是text
+ a_tags = row.xpath("td[2]/a")
+ if a_tags:
+ value = [i.text for i in a_tags]
+ else:
+ # 获取第2个td标签的内容(下标从1开始计数)
+ value = row.xpath("td[2]/text()")
+ data[key] = value
+
+ for key, value in data.items():
+ if key == 'サークル':
+ movie.producer = value[0]
+ elif key == '作者':
+ # 暂时没有在getchu找到多个actress的片子
+ movie.actress = [i.strip() for i in value]
+ elif key == '画像数&ページ数':
+ match = DURATION_PATTERN.search(' '.join(value))
+ if match:
+ movie.duration = match.group(1)
+ elif key == '配信開始日':
+ movie.publish_date = value[0].replace('/', '-')
+ elif key == '趣向':
+ movie.genre = value
+ elif key == '作品内容':
+ idx = -1
+ for i, line in enumerate(value):
+ if line.lstrip().startswith('※'):
+ idx = i
+ break
+ movie.plot = ''.join(value[:idx])
+
+ movie.title = get_movie_title(tree)
+ movie.cover = get_movie_img(tree, getchu_id)
+ movie.preview_pics = get_movie_preview(tree, getchu_id)
+ movie.dvdid = id_uc
+ movie.url = url
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await DlGetchuCrawler.create()
+ movie = MovieInfo('getchu-4041026')
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/fanza.py b/javsp/crawlers/sites/fanza.py
new file mode 100644
index 000000000..66b895df5
--- /dev/null
+++ b/javsp/crawlers/sites/fanza.py
@@ -0,0 +1,246 @@
+"""从fanza抓取数据"""
+
+import re
+import json
+import logging
+from typing import Dict, List, Tuple
+
+from httpx import Response
+
+
+from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.config import Cfg
+from javsp.datatype import MovieInfo
+
+from lxml import html
+from lxml.html import HtmlElement
+
+logger = logging.getLogger(__name__)
+
+
+_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1}
+_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1}
+def sort_search_result(result: List[Dict]):
+ """排序搜索结果"""
+ scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result}
+ sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True)
+ return sorted_result
+
+
+def resp2html_wrapper(resp: Response) -> HtmlElement:
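+    """将响应解析为HTML树,并识别地区限制、强制登录等访问受限的情况"""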
+ tree = html.fromstring(resp.text)
+ if 'not available in your region' in tree.text_content():
+ raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
+ elif '/login/' in str(resp.url):
+ raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP')
+ return tree
+
+
+
+
+def parse_anime_page(movie: MovieInfo, tree: HtmlElement):
+ """解析动画影片的页面布局"""
+ title = tree.xpath("//h1[@id='title']/text()")[0]
+ container = tree.xpath("//table[@class='mg-b12']/tr/td")[0]
+ cover = container.xpath("//img[@name='package-image']/@src")[0]
+ date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip()
+ publish_date = date_str.replace('/', '-')
+ duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")
+ if duration_tag:
+ movie.duration = duration_tag[0].strip().replace('分', '')
+ serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
+ if serial_tag:
+ movie.serial = serial_tag[0].strip()
+ producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
+ if producer_tag:
+ movie.producer = producer_tag[0].strip()
+ genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ genre.append(tag.text.strip())
+ genre_id.append(tag.get('href').split('=')[-1].strip('/'))
+ cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
+ plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip()
+ preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy")
+ score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
+ score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
+
+ movie.cid = cid
+ movie.title = title
+ movie.cover = cover
+ movie.publish_date = publish_date
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.plot = plot
+ movie.score = f'{score/5:.2f}' # 转换为10分制
+ movie.preview_pics = preview_pics
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+
+
+# parse_dvd_page = parse_videoa_page # 118wtktabf067
+# parse_ppr_page = parse_videoa_page
+# parse_nikkatsu_page = parse_videoa_page
+# parse_doujin_page = parse_anime_page
+
+class FanzaCrawler(Crawler):
+ id = CrawlerID.fanza
+
+    async def get_urls_of_cid(self, cid: str) -> List[Dict]:
+ """搜索cid可能的影片URL"""
+ r = await self.client.get(f"{self.base_url}search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0")
+ if r.status_code == 404:
+ raise MovieNotFoundError(__name__, cid)
+ r.raise_for_status()
+ tree = resp2html_wrapper(r)
+ result = tree.xpath("//ul[@id='list']/li/div/p/a/@href")
+ parsed_result = {}
+ for url in result:
+ items = url.split('/')
+ type_, cid = None, None
+ for i, part in enumerate(items):
+ if part == '-':
+ product, type_ = items[i-2], items[i-1]
+ elif part.startswith('cid='):
+ cid = part[4:]
+ new_url = '/'.join(i for i in items if not i.startswith('?')) + '/'
+ parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url})
+ break
+ if cid not in parsed_result:
+ if len(result) > 0:
+ logger.debug(f"Unknown URL in search result: " + ', '.join(result))
+ raise MovieNotFoundError(__name__, cid)
+ sorted_result = sort_search_result(parsed_result[cid])
+ return sorted_result
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.dmm.co.jp')
+ self.base_url = str(url)
+ self.client = get_client(url)
+
+ # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
+ self.client.cookies = {'age_check_done': '1'}
+ self.client.headers['Accept-Language'] = 'ja,en-US;q=0.9'
+ return self
+
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ default_url = f'{self.base_url}digital/videoa/-/detail/=/cid={movie.cid}/'
+ r0 = await self.client.get(default_url)
+ if r0.status_code == 404:
+ urls = await self.get_urls_of_cid(movie.cid)
+ for d in urls:
+ func_name = f"parse_{d['type']}_page"
+ if func_name in globals():
+ parse_func = globals()[func_name]
+ else:
+ logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}")
+ continue
+ r = await self.client.get(d['url'])
+ tree = resp2html_wrapper(r)
+ try:
+ parse_func(movie, tree)
+ movie.url = d['url']
+ break
+ except:
+ logger.debug(f"Fail to parse {d['url']}", exc_info=True)
+ if d is urls[-1]:
+ logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败")
+ raise
+ else:
+ tree = resp2html_wrapper(r0)
+ await self.parse_videoa_page(movie, tree)
+ movie.url = default_url
+
+ async def parse_videoa_page(self, movie: MovieInfo, tree: HtmlElement):
+ """解析AV影片的页面布局"""
+ title = tree.xpath("//div[@class='hreview']/h1/text()")[0]
+ # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来
+ container = tree.xpath("//table[@class='mg-b12']/tr/td")[0]
+ cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
+ # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083
+ date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()")
+ if date_tag:
+ movie.publish_date = date_tag[0].strip().replace('/', '-')
+ duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
+ match = re.search(r'\d+', duration_str)
+ if match:
+ movie.duration = match.group(0)
+ # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况
+ actress = container.xpath("//span[@id='performer']/a/text()")
+ director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()")
+ if director_tag:
+ movie.director = director_tag[0].strip()
+ serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
+ if serial_tag:
+ movie.serial = serial_tag[0].strip()
+ producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
+ if producer_tag:
+ movie.producer = producer_tag[0].strip()
+ # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
+ # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
+ # if label_tag:
+ # label = label_tag[0].strip()
+ # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选
+ genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ genre.append(tag.text.strip())
+ genre_id.append(tag.get('href').split('=')[-1].strip('/'))
+ cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
+ plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip()
+ preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
+ score_tag = container.xpath("//p[@class='d-review__average']/strong/text()")
+ if score_tag:
+ match = re.search(r'\d+', score_tag[0].strip())
+ if match:
+ score = float(match.group()) * 2
+ movie.score = f'{score:.2f}'
+ else:
+ score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
+            score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
+            movie.score = f'{score/5:.2f}' # 转换为10分制
+
+ if Cfg().crawler.hardworking:
+ # 预览视频是动态加载的,不在静态网页中
+ video_url = f'{self.base_url}service/digitalapi/-/html5_player/=/cid={movie.cid}'
+ resp = await self.client.get(video_url)
+ tree2 = html.fromstring(resp.text)
+ # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据
+ script = tree2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip()
+ match = re.search(r'\{.*\}', script)
+ # 主要是为了捕捉json.loads的异常,但是也借助try-except判断是否正则表达式是否匹配
+ try:
+ data = json.loads(match.group())
+ video_url = data.get('src')
+ if video_url and video_url.startswith('//'):
+ video_url = 'https:' + video_url
+ movie.preview_video = video_url
+ except Exception as e:
+ logger.debug('解析视频地址时异常: ' + repr(e))
+
+ movie.cid = cid
+ movie.title = title
+ movie.cover = cover
+ movie.actress = actress
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.plot = plot
+ movie.preview_pics = preview_pics
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await FanzaCrawler.create()
+ movie = MovieInfo(cid="d_aisoft3356")
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/fc2.py b/javsp/crawlers/sites/fc2.py
new file mode 100644
index 000000000..0ce072b90
--- /dev/null
+++ b/javsp/crawlers/sites/fc2.py
@@ -0,0 +1,120 @@
+"""从FC2官网抓取数据"""
+import logging
+
+from lxml import html
+
+
+from javsp.crawlers.exceptions import *
+from javsp.config import Cfg
+from javsp.lib import strftime_to_minutes
+from javsp.datatype import MovieInfo
+from javsp.crawlers.interface import Crawler
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.config import CrawlerID
+
+
+logger = logging.getLogger(__name__)
+
+class Fc2Crawler(Crawler):
+ id = CrawlerID.fc2
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://adult.contents.fc2.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ async def get_movie_score(self, fc2_id: str) -> float | None:
+ """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None"""
+ resp = await self.client.get(f'{self.base_url}/article/{fc2_id}/review')
+ tree = html.fromstring(resp.text)
+ review_tags = tree.xpath("//ul[@class='items_comment_headerReviewInArea']/li")
+ reviews = {}
+ for tag in review_tags:
+ score = int(tag.xpath("div/span/text()")[0])
+ vote = int(tag.xpath("span")[0].text_content())
+ reviews[score] = vote
+ total_votes = sum(reviews.values())
+ if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧
+ summary = sum([k*v for k, v in reviews.items()])
+ final_score = summary / total_votes * 2 # 乘以2转换为10分制
+ return final_score
+
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # 去除番号中的'FC2'字样
+ id_uc = movie.dvdid.upper()
+ if not id_uc.startswith('FC2-'):
+ raise ValueError('Invalid FC2 number: ' + movie.dvdid)
+ fc2_id = id_uc.replace('FC2-', '')
+ # 抓取网页
+ url = f'{self.base_url}/article/{fc2_id}/'
+ resp = await self.client.get(url)
+ if '/id.fc2.com/' in str(resp.url):
+ raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP')
+ tree = html.fromstring(resp.text)
+ container = tree.xpath("//div[@class='items_article_left']")
+ if len(container) > 0:
+ container = container[0]
+ else:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ # FC2 标题增加反爬乱码,使用数组合并标题
+ title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()")
+ title = ''.join(title_arr)
+ thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0]
+ thumb_pic = thumb_tag.xpath("span/img/@src")[0]
+ duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0]
+ # FC2没有制作商和发行商的区分,作为个人市场,影片页面的'by'更接近于制作商
+ producer = container.xpath("//li[text()='by ']/a/text()")[0]
+ genre = container.xpath("//a[@class='tag tagTag']/text()")
+ date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0]
+ publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30'
+ preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href")
+
+ if Cfg().crawler.hardworking:
+ # 通过评论数据来计算准确的评分
+ score = await self.get_movie_score(fc2_id)
+ if score:
+ movie.score = f'{score:.2f}'
+ # 预览视频是动态加载的,不在静态网页中
+ desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0]
+ key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa...
+ api_url = f'{self.base_url}/api/v2/videos/{fc2_id}/sample?key={key}'
+ resp = await self.client.get(api_url)
+ j = resp.json()
+ movie.preview_video = j['path']
+ else:
+ # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星
+ score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0]
+ score = int(score_tag_attr[-1]) * 2
+ movie.score = f'{score:.2f}'
+
+ movie.dvdid = id_uc
+ movie.url = url
+ movie.title = title
+ movie.genre = genre
+ movie.producer = producer
+ movie.duration = str(strftime_to_minutes(duration_str))
+ movie.publish_date = publish_date
+ movie.preview_pics = preview_pics
+ # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
+ if movie.preview_pics:
+ movie.cover = preview_pics[0]
+ else:
+ movie.cover = thumb_pic
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await Fc2Crawler.create()
+ movie = MovieInfo("FC2-718323")
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/fc2ppvdb.py b/javsp/crawlers/sites/fc2ppvdb.py
new file mode 100644
index 000000000..fbba590c2
--- /dev/null
+++ b/javsp/crawlers/sites/fc2ppvdb.py
@@ -0,0 +1,92 @@
+"""从FC2PPVDB抓取数据"""
+
+# BUG: This crawler doesn't work, seemed due to cloudflare
+
+from typing import List
+
+
+from javsp.crawlers.exceptions import *
+from javsp.lib import strftime_to_minutes
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+
+class Fc2PpvDbCrawler(Crawler):
+ id = CrawlerID.fc2ppvdb
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://fc2ppvdb.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+
+ def get_list_first(list: List):
+ return list[0] if list and len(list) > 0 else None
+
+ # 去除番号中的'FC2'字样
+ id_uc = movie.dvdid.upper()
+ if not id_uc.startswith('FC2-'):
+ raise ValueError('Invalid FC2 number: ' + movie.dvdid)
+ fc2_id = id_uc.replace('FC2-', '')
+ # 抓取网页
+ url = f'{self.base_url}/articles/{fc2_id}'
+ resp = await self.client.get(url)
+ tree = html.fromstring(resp.content)
+ # html = get_html(url)
+ container = tree.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]")
+ if len(container) > 0:
+ container = container[0]
+ else:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ title = container.xpath("//h2/a/text()")
+ thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src")
+ duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()")
+ actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()")
+ genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()")
+ publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()")
+ publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()")
+ uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()")
+        uncensored_str_f = get_list_first(uncensored_str)
+ uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None
+ preview_pics = None
+ preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href")
+
+ movie.dvdid = id_uc
+ movie.url = url
+ movie.title = get_list_first(title)
+ movie.genre = genre
+ movie.actress = actress
+ movie.duration = str(strftime_to_minutes(get_list_first(duration_str)))
+ movie.publish_date = get_list_first(publish_date)
+ movie.publisher = get_list_first(publisher)
+ movie.uncensored = uncensored
+ movie.preview_pics = preview_pics
+ movie.preview_video = get_list_first(preview_video)
+
+ # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
+ if movie.preview_pics:
+ movie.cover = preview_pics[0]
+ else:
+ movie.cover = get_list_first(thumb_pic)
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await Fc2PpvDbCrawler.create()
+ movie = MovieInfo('FC2-4497837')
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/gyutto.py b/javsp/crawlers/sites/gyutto.py
new file mode 100644
index 000000000..b30200284
--- /dev/null
+++ b/javsp/crawlers/sites/gyutto.py
@@ -0,0 +1,106 @@
+"""从https://gyutto.com/官网抓取数据"""
+import logging
+import time
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+from lxml.html import HtmlElement
+
+logger = logging.getLogger(__name__)
+
+def get_movie_title(tree: HtmlElement) -> str | None:
+    container = tree.xpath("//h1")
+    if len(container) > 0:
+        return container[0].text
+    return None
+
+def get_movie_img(tree: HtmlElement, index: int = 1) -> list[str] | str | None:
+    # index为0时只返回第一张图片的地址,否则返回全部图片地址的列表
+    images = []
+    container = tree.xpath("//a[@class='highslide']/img")
+    if len(container) > 0:
+        if index == 0:
+            return container[0].get('src')
+
+        for row in container:
+            images.append(row.get('src'))
+
+    return images
+
+class GyuttoCrawler(Crawler):
+ id = CrawlerID.gyutto
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'http://gyutto.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # 去除番号中的'gyutto'字样
+ id_uc = movie.dvdid.upper()
+ if not id_uc.startswith('GYUTTO-'):
+ raise ValueError('Invalid gyutto number: ' + movie.dvdid)
+ gyutto_id = id_uc.replace('GYUTTO-', '')
+ # 抓取网页
+ url = f'{self.base_url}/i/item{gyutto_id}?select_uaflag=1'
+ r = await self.client.get(url)
+ if r.status_code == 404:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ tree = html.fromstring(r.text)
+ container = tree.xpath("//dl[@class='BasicInfo clearfix']")
+
+ producer = None
+ genre = None
+ date = None
+ publish_date = None
+
+ for row in container:
+ key = row.xpath(".//dt/text()")
+ if key[0] == "サークル":
+ producer = ''.join(row.xpath(".//dd/a/text()"))
+ elif key[0] == "ジャンル":
+ genre = row.xpath(".//dd/a/text()")
+ elif key[0] == "配信開始日":
+ date = row.xpath(".//dd/text()")
+ date_str = ''.join(date)
+ date_time = time.strptime(date_str, "%Y年%m月%d日")
+ publish_date = time.strftime("%Y-%m-%d", date_time)
+
+ plot = tree.xpath("//div[@class='unit_DetailLead']/p/text()")[0]
+
+ movie.title = get_movie_title(tree)
+ movie.cover = get_movie_img(tree, 0)
+ movie.preview_pics = get_movie_img(tree)
+ movie.dvdid = id_uc
+ movie.url = url
+ movie.producer = producer
+ # movie.actress = actress
+ # movie.duration = duration
+ movie.publish_date = publish_date
+ movie.genre = genre
+ movie.plot = plot
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await GyuttoCrawler.create()
+ movie = MovieInfo('gyutto-266923')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/jav321.py b/javsp/crawlers/sites/jav321.py
new file mode 100644
index 000000000..6a20a98ec
--- /dev/null
+++ b/javsp/crawlers/sites/jav321.py
@@ -0,0 +1,117 @@
+"""从jav321抓取数据"""
+import re
+import logging
+
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+class Jav321Crawler(Crawler):
+ id = CrawlerID.jav321
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.jav321.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+    async def crawl_and_fill(self, movie: MovieInfo) -> None:
+        """解析指定番号的影片数据"""
+ resp = await self.client.post(f'{self.base_url}/search', data={'sn': movie.dvdid})
+ tree = html.fromstring(resp.text)
+ page_url = tree.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
+ #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542
+ cid = page_url.split('/')[-1] # /video/ipx00177
+ # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片
+ if cid == 'search':
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ title = tree.xpath("//div[@class='panel-heading']/h3/text()")[0]
+ info = tree.xpath("//div[@class='col-md-9']")[0]
+ # jav321的不同信息字段间没有明显分隔,只能通过url来匹配目标标签
+ company_tags = info.xpath("a[contains(@href,'/company/')]/text()")
+ if company_tags:
+ movie.producer = company_tags[0]
+ # actress, actress_pics
+ # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白
+ actress, actress_pics = [], {}
+ actress_tags = tree.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
+ for tag in actress_tags:
+ name = tag.tail.strip()
+ pic_url = tag.get('src')
+ actress.append(name)
+ # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url,
+ # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据
+ actress_pics[name] = pic_url
+ # genre, genre_id
+ genre_tags = info.xpath("a[contains(@href,'/genre/')]")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ genre.append(tag.text)
+ genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1
+ dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper()
+ publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
+ duration_str = info.xpath("b[text()='収録時間']")[0].tail
+ match = re.search(r'\d+', duration_str)
+ if match:
+ movie.duration = match.group(0)
+ # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星
+ score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original")
+ if score_tag:
+ score = int(score_tag[0][5:7])/5 # /10*2
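+            # 例: '/img/35.gif'[5:7] == '35',35/5 = 7.0,即3.5星换算为10分制的评分(假定图片路径总是'/img/NN.gif'格式)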
+ movie.score = str(score)
+ serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
+ if serial_tag:
+ movie.serial = serial_tag[0]
+ preview_video_tag = info.xpath("//video/source/@src")
+ if preview_video_tag:
+ movie.preview_video = preview_video_tag[0]
+ plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()")
+ if plot_tag:
+ movie.plot = plot_tag[0]
+ preview_pics = tree.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src")
+ if len(preview_pics) == 0:
+ # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL
+ preview_pics = tree.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src")
+ # 有的图片链接里有多个//,网站质量堪忧……
+ preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics]
+ # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析
+
+ movie.url = page_url
+ movie.cid = cid
+ movie.dvdid = dvdid
+ movie.title = title
+ movie.actress = actress
+ movie.actress_pics = actress_pics
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.publish_date = publish_date
+ # preview_pics的第一张图始终是封面,剩下的才是预览图
+ if len(preview_pics) > 0:
+ movie.cover = preview_pics[0]
+ movie.preview_pics = preview_pics[1:]
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await Jav321Crawler.create()
+ movie = MovieInfo('SCUTE-1177')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/javbus.py b/javsp/crawlers/sites/javbus.py
new file mode 100644
index 000000000..b3efaa8dd
--- /dev/null
+++ b/javsp/crawlers/sites/javbus.py
@@ -0,0 +1,129 @@
+"""从JavBus抓取数据"""
+import logging
+
+
+from javsp.crawlers.exceptions import *
+from javsp.func import *
+from javsp.config import CrawlerID
+from javsp.datatype import MovieInfo, GenreMap
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+
+from javsp.crawlers.interface import Crawler
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+class JavbusCrawler(Crawler):
+ id = CrawlerID.javbus
+ genre_map: GenreMap
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.javbus.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ self.client.cookies = {'age': 'verified', 'dv': '1'}
+ self.genre_map = GenreMap('data/genre_javbus.csv')
+ return self
+
+    async def crawl_and_fill(self, movie: MovieInfo) -> None:
+        """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ url = f'{self.base_url}/{movie.dvdid}'
+ resp = await self.client.get(url)
+
+ tree = html.fromstring(resp.content)
+ # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息
+ # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404
+ page_title = tree.xpath('/html/head/title/text()')
+ if page_title and page_title[0].startswith('404 Page Not Found!'):
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ container = tree.xpath("//div[@class='container']")[0]
+ title = container.xpath("h3/text()")[0]
+ cover = container.xpath("//a[@class='bigImage']/img/@src")[0]
+ preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href")
+ info = container.xpath("//div[@class='col-md-3 info']")[0]
+ dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
+ publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
+ duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip()
+ director_tag = info.xpath("p/span[text()='導演:']")
+ if director_tag: # xpath没有匹配时将得到空列表
+ movie.director = director_tag[0].getnext().text.strip()
+ producer_tag = info.xpath("p/span[text()='製作商:']")
+ if producer_tag:
+ text = producer_tag[0].getnext().text
+ if text:
+ movie.producer = text.strip()
+ publisher_tag = info.xpath("p/span[text()='發行商:']")
+ if publisher_tag:
+ movie.publisher = publisher_tag[0].getnext().text.strip()
+ serial_tag = info.xpath("p/span[text()='系列:']")
+ if serial_tag:
+ movie.serial = serial_tag[0].getnext().text
+ # genre, genre_id
+ genre_tags = info.xpath("//span[@class='genre']/label/a")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ tag_url = tag.get('href')
+ pre_id = tag_url.split('/')[-1]
+ genre.append(tag.text)
+ if 'uncensored' in tag_url:
+ movie.uncensored = True
+ genre_id.append('uncensored-' + pre_id)
+ else:
+ movie.uncensored = False
+ genre_id.append(pre_id)
+ # JavBus的磁力链接是依赖js脚本加载的,无法通过静态网页来解析
+ # actress, actress_pics
+ actress, actress_pics = [], {}
+ actress_tags = tree.xpath("//a[@class='avatar-box']/div/img")
+ for tag in actress_tags:
+ name = tag.get('title')
+ pic_url = tag.get('src')
+ actress.append(name)
+ if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像
+ actress_pics[name] = pic_url
+ # 整理数据并更新movie的相应属性
+ movie.url = f'{self.base_url}/{movie.dvdid}'
+ movie.dvdid = dvdid
+ movie.title = title.replace(dvdid, '').strip()
+ movie.cover = cover
+ movie.preview_pics = preview_pics
+ if publish_date != '0000-00-00': # 丢弃无效的发布日期
+ movie.publish_date = publish_date
+ movie.duration = duration if int(duration) else None
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.actress = actress
+ movie.actress_pics = actress_pics
+
+ async def crawl_and_fill_cleaned(self, movie: MovieInfo):
+ """解析指定番号的影片数据并进行清洗"""
+ await self.crawl_and_fill(movie)
+ movie.genre_norm = self.genre_map.map(movie.genre_id)
+ movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换)
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await JavbusCrawler.create()
+ print(crawler.client.headers)
+ movie = MovieInfo('NANP-030')
+ # try:
+ await crawler.crawl_and_fill_cleaned(movie)
+ print(movie)
+ # except Exception as e:
+ # print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/javdb.py b/javsp/crawlers/sites/javdb.py
new file mode 100644
index 000000000..ab23e18bd
--- /dev/null
+++ b/javsp/crawlers/sites/javdb.py
@@ -0,0 +1,350 @@
+"""从JavDB抓取数据"""
+import os
+import re
+import logging
+
+from httpx import Cookies
+
+from javsp.func import *
+from javsp.avid import guess_av_type
+from javsp.config import CrawlerID
+from javsp.datatype import MovieInfo, GenreMap
+from javsp.chromium import get_browsers_cookies
+
+from javsp.crawlers.exceptions import CredentialError, MovieDuplicateError, MovieNotFoundError, SiteBlocked, SitePermissionError, WebsiteError
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+
+from javsp.crawlers.interface import Crawler
+from lxml import html
+
+logger = logging.getLogger(__name__)
+
+class JavDbCrawler(Crawler):
+ id = CrawlerID.javdb
+ genre_map: GenreMap
+ cookies_pool: list[Cookies]
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.javdb.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ self.client.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5'
+ self.genre_map = GenreMap('data/genre_javdb.csv')
+ self.cookies_pool = []
+ return self
+
+ async def get_html_wrapper(self, url: str):
+ """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题"""
+ r = await self.client.get(url)
+ if r.status_code == 200:
+ # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页
+ if r.history and '/login' in str(r.url):
+ # 仅在需要时去读取Cookies
+ if len(self.cookies_pool) == 0:
+ try:
+ self.cookies_pool = get_browsers_cookies()
+ except (PermissionError, OSError) as e:
+ logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True)
+                        self.cookies_pool = []
+ except Exception as e:
+ logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True)
+                        self.cookies_pool = []
+ if len(self.cookies_pool) > 0:
+ item = self.cookies_pool.pop()
+ # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies
+ self.client.cookies = item['cookies']
+ cookies_source = (item['profile'], item['site'])
+ logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}')
+                    return await self.get_html_wrapper(url)
+ else:
+ raise CredentialError('JavDB: 所有浏览器Cookies均已过期')
+ elif r.history and 'pay' in str(r.url).split('/')[-1]:
+ raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'")
+            else:
+                return html.fromstring(r.text)
+ elif r.status_code in (403, 503):
+ tree = html.fromstring(r.text)
+ code_tag = tree.xpath("//span[@class='code-label']/span")
+ error_code = code_tag[0].text if code_tag else None
+ if error_code:
+ if error_code == '1020':
+ block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器'
+ else:
+ block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})'
+ else:
+ block_msg = f'JavDB: {r.status_code} 禁止访问: {url}'
+ raise SiteBlocked(block_msg)
+ else:
+ raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}')
+
+
+ async def get_user_info(self, site: str, cookies: Cookies):
+ """获取cookies对应的JavDB用户信息"""
+ try:
+ self.client.cookies = cookies
+ resp = await self.client.get(f'https://{site}/users/profile')
+
+ html_str = resp.text
+ tree = html.fromstring(html_str)
+ except Exception as e:
+ logger.info('JavDB: 获取用户信息时出错')
+ logger.debug(e, exc_info=1)
+ return
+ # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点
+ if 'JavDB' in html_str:
+ email = tree.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip()
+ username = tree.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip()
+ return email, username
+ else:
+ logger.debug('JavDB: 域名已过期: ' + site)
+
+
+ async def get_valid_cookies(self):
+ """扫描浏览器,获取一个可用的Cookies"""
+ # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用
+ for d in self.cookies_pool:
+ info = await self.get_user_info(d['site'], d['cookies'])
+ if info:
+ return d['cookies']
+ else:
+ logger.debug(f"{d['profile']}, {d['site']}: Cookies无效")
+
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个
+ tree = await self.get_html_wrapper(f'{self.base_url}/search?q={movie.dvdid}')
+ ids = list(map(str.lower, tree.xpath("//div[@class='video-title']/strong/text()")))
+ movie_urls = tree.xpath("//a[@class='box']/@href")
+ match_count = len([i for i in ids if i == movie.dvdid.lower()])
+ if match_count == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid, ids)
+ elif match_count == 1:
+ index = ids.index(movie.dvdid.lower())
+ new_url = movie_urls[index]
+ try:
+ html2 = await self.get_html_wrapper(new_url)
+ except (SitePermissionError, CredentialError):
+ # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面
+ box = tree.xpath("//a[@class='box']")[index]
+ movie.url = new_url
+ movie.title = box.get('title')
+ movie.cover = box.xpath("div/img/@src")[0]
+ score_str = box.xpath("div[@class='score']/span/span")[0].tail
+ score = re.search(r'([\d.]+)分', score_str).group(1)
+ movie.score = "{:.2f}".format(float(score)*2)
+ movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip()
+ return
+ else:
+ raise MovieDuplicateError(__name__, movie.dvdid, match_count)
+
+ container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0]
+ info = container.xpath("//nav[@class='panel movie-panel-info']")[0]
+ title = container.xpath("h2/strong[@class='current-title']/text()")[0]
+ show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]")
+ if show_orig_title:
+ movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0]
+ cover = container.xpath("//img[@class='video-cover']/@src")[0]
+ preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href")
+ preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src")
+ if preview_video_tag:
+ preview_video = preview_video_tag[0]
+ if preview_video.startswith('//'):
+ preview_video = 'https:' + preview_video
+ movie.preview_video = preview_video
+ dvdid = info.xpath("div/span")[0].text_content()
+ publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
+ duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip()
+ director_tag = info.xpath("div/strong[text()='導演:']")
+ if director_tag:
+ movie.director = director_tag[0].getnext().text_content().strip()
+ av_type = guess_av_type(movie.dvdid)
+ if av_type != 'fc2':
+ producer_tag = info.xpath("div/strong[text()='片商:']")
+ else:
+ producer_tag = info.xpath("div/strong[text()='賣家:']")
+ if producer_tag:
+ movie.producer = producer_tag[0].getnext().text_content().strip()
+ publisher_tag = info.xpath("div/strong[text()='發行:']")
+ if publisher_tag:
+ movie.publisher = publisher_tag[0].getnext().text_content().strip()
+ serial_tag = info.xpath("div/strong[text()='系列:']")
+ if serial_tag:
+ movie.serial = serial_tag[0].getnext().text_content().strip()
+ score_tag = info.xpath("//span[@class='score-stars']")
+ if score_tag:
+ score_str = score_tag[0].tail
+ score = re.search(r'([\d.]+)分', score_str).group(1)
+ movie.score = "{:.2f}".format(float(score)*2)
+ genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ pre_id = tag.get('href').split('/')[-1]
+ genre.append(tag.text)
+ genre_id.append(pre_id)
+ # 判定影片有码/无码
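+            # href最后一段'?'之前的部分为'uncensored'时视为无码,为'tags'时视为有码,其余情况保持None(未知)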
+ subsite = pre_id.split('?')[0]
+ movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite)
+ # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优
+ actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
+ all_actors = actors_tag.xpath("a/text()")
+ genders = actors_tag.xpath("strong/text()")
+ actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀']
+ magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href")
+
+ movie.dvdid = dvdid
+        movie.url = new_url
+ movie.title = title.replace(dvdid, '').strip()
+ movie.cover = cover
+ movie.preview_pics = preview_pics
+ movie.publish_date = publish_date
+ movie.duration = duration
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.actress = actress
+ movie.magnet = [i.replace('[javdb.com]','') for i in magnet]
+
+
+ async def crawl_and_fill_cleaned(self, movie: MovieInfo):
+ """解析指定番号的影片数据并进行清洗"""
+ try:
+ await self.crawl_and_fill(movie)
+ # 检查封面URL是否真的存在对应图片
+ if movie.cover is not None:
+ r = await self.client.head(movie.cover)
+ if r.status_code != 200:
+ movie.cover = None
+        except SiteBlocked:
+            logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试')
+            raise
+ if movie.genre_id and (not movie.genre_id[0].startswith('fc2?')):
+ movie.genre_norm = self.genre_map.map(movie.genre_id)
+ movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换)
+
+
+ async def collect_actress_alias(self, type=0, use_original=True):
+ """
+ 收集女优的别名
+ type: 0-有码, 1-无码, 2-欧美
+ use_original: 是否使用原名而非译名,True-田中レモン,False-田中檸檬
+ """
+        import json
+        import time
+        import random
+        import asyncio
+
+ actressAliasMap = {}
+
+ actressAliasFilePath = "data/actress_alias.json"
+ # 检查文件是否存在
+ if not os.path.exists(actressAliasFilePath):
+ # 如果文件不存在,创建文件并写入空字典
+ with open(actressAliasFilePath, "w", encoding="utf-8") as file:
+ json.dump({}, file)
+
+ typeList = ["censored", "uncensored", "western"]
+ page_url = f"{self.base_url}/actors/{typeList[type]}"
+ while True:
+ try:
+ tree = await self.get_html_wrapper(page_url)
+ actors = tree.xpath("//div[@class='box actor-box']/a")
+
+ count = 0
+ for actor in actors:
+ count += 1
+ actor_name = actor.xpath("strong/text()")[0].strip()
+ actor_url = actor.xpath("@href")[0]
+ # actor_url = f"https://javdb.com{actor_url}" # 构造演员主页的完整URL
+
+ # 进入演员主页,获取更多信息
+ actor_html = await self.get_html_wrapper(actor_url)
+ # 解析演员所有名字信息
+ names_span = actor_html.xpath("//span[@class='actor-section-name']")[0]
+ aliases_span_list = actor_html.xpath("//span[@class='section-meta']")
+ aliases_span = aliases_span_list[0]
+
+ names_list = [name.strip() for name in names_span.text.split(",")]
+ if len(aliases_span_list) > 1:
+ aliases_list = [
+ alias.strip() for alias in aliases_span.text.split(",")
+ ]
+ else:
+ aliases_list = []
+
+ # 将信息添加到actressAliasMap中
+ actressAliasMap[names_list[-1 if use_original else 0]] = (
+ names_list + aliases_list
+ )
+ print(
+ f"{count} --- {names_list[-1 if use_original else 0]}: {names_list + aliases_list}"
+ )
+
+ if count == 10:
+ # 将数据写回文件
+ with open(actressAliasFilePath, "r", encoding="utf-8") as file:
+ existing_data = json.load(file)
+
+ # 合并现有数据和新爬取的数据
+ existing_data.update(actressAliasMap)
+
+ # 将合并后的数据写回文件
+ with open(actressAliasFilePath, "w", encoding="utf-8") as file:
+ json.dump(existing_data, file, ensure_ascii=False, indent=2)
+
+ actressAliasMap = {} # 重置actressAliasMap
+
+ print(
+ f"已爬取 {count} 个女优,数据已更新并写回文件:",
+ actressAliasFilePath,
+ )
+
+ # 重置计数器
+ count = 0
+
+                    await asyncio.sleep(max(1, 10 * random.random()))  # 随机等待 1-10 秒
+
+ # 判断是否有下一页按钮
+ next_page_link = tree.xpath(
+ "//a[@rel='next' and @class='pagination-next']/@href"
+ )
+ if not next_page_link:
+ break # 没有下一页,结束循环
+ else:
+ next_page_url = f"{next_page_link[0]}"
+ page_url = next_page_url
+
+ except SiteBlocked:
+ raise
+
+ with open(actressAliasFilePath, "r", encoding="utf-8") as file:
+ existing_data = json.load(file)
+
+ # 合并现有数据和新爬取的数据
+ existing_data.update(actressAliasMap)
+
+ # 将合并后的数据写回文件
+ with open(actressAliasFilePath, "w", encoding="utf-8") as file:
+ json.dump(existing_data, file, ensure_ascii=False, indent=2)
+
+ print(f"已爬取 {count} 个女优,数据已更新并写回文件:", actressAliasFilePath)
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await JavDbCrawler.create()
+ movie = MovieInfo('FC2-2735981')
+ try:
+ await crawler.crawl_and_fill_cleaned(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/javlib.py b/javsp/crawlers/sites/javlib.py
new file mode 100644
index 000000000..c71a5f336
--- /dev/null
+++ b/javsp/crawlers/sites/javlib.py
@@ -0,0 +1,115 @@
+"""从JavLibrary抓取数据"""
+import logging
+from urllib.parse import urlsplit
+
+from javsp.crawlers.exceptions import MovieDuplicateError, MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+logger = logging.getLogger(__name__)
+
+class JavLibCrawler(Crawler):
+    id = CrawlerID.javlib
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.javlibrary.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ # TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ url = new_url = f'{self.base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
+ resp = await self.client.get(url)
+ tree = html.fromstring(resp.text)
+ if resp.history and urlsplit(str(resp.url)).netloc == urlsplit(self.base_url).netloc:
+            # 发生301重定向且新老地址的netloc相同时,通常说明搜索到了影片且只有一个结果
+            new_url = str(resp.url)
+ else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果
+ video_tags = tree.xpath("//div[@class='video'][@id]/a")
+ # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果
+ pre_choose = []
+ for tag in video_tags:
+ tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
+ if tag_dvdid.upper() == movie.dvdid.upper():
+ pre_choose.append(tag)
+ pre_choose_urls = [i.get('href') for i in pre_choose]
+ match_count = len(pre_choose)
+ if match_count == 0:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ elif match_count == 1:
+ new_url = pre_choose_urls[0]
+ elif match_count == 2:
+ no_blueray = []
+ for tag in pre_choose:
+ if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc
+ no_blueray.append(tag)
+ no_blueray_count = len(no_blueray)
+ if no_blueray_count == 1:
+ new_url = no_blueray[0].get('href')
+ logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
+ else:
+ # 两个结果中没有谁是蓝光影片,说明影片番号重复了
+ raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
+ else:
+ # 存在不同影片但是番号相同的情况,如MIDV-010
+ raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
+ # 重新抓取网页
+ resp = await self.client.get(new_url)
+ tree = html.fromstring(resp.text)
+ container = tree.xpath("/html/body/div/div[@id='rightcolumn']")[0]
+ title_tag = container.xpath("div/h3/a/text()")
+ title = title_tag[0]
+ cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
+ info = container.xpath("//div[@id='video_info']")[0]
+ dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
+ publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
+ duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
+ director_tag = info.xpath("//span[@class='director']/a/text()")
+ if director_tag:
+ movie.director = director_tag[0]
+ producer = info.xpath("//span[@class='maker']/a/text()")[0]
+ publisher_tag = info.xpath("//span[@class='label']/a/text()")
+ if publisher_tag:
+ movie.publisher = publisher_tag[0]
+ score_tag = info.xpath("//span[@class='score']/text()")
+ if score_tag:
+ movie.score = score_tag[0].strip('()')
+ genre = info.xpath("//span[@class='genre']/a/text()")
+ actress = info.xpath("//span[@class='star']/a/text()")
+
+ movie.dvdid = dvdid
+ movie.url = new_url
+ movie.title = title.replace(dvdid, '').strip()
+ if cover.startswith('//'): # 补全URL中缺少的协议段
+ cover = 'https:' + cover
+ movie.cover = cover
+ movie.publish_date = publish_date
+ movie.duration = duration
+ movie.producer = producer
+ movie.genre = genre
+ movie.actress = actress
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await JavLibCrawler.create()
+ movie = MovieInfo('IPX-177')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/javmenu.py b/javsp/crawlers/sites/javmenu.py
new file mode 100644
index 000000000..6553d86a1
--- /dev/null
+++ b/javsp/crawlers/sites/javmenu.py
@@ -0,0 +1,100 @@
+"""从JavMenu抓取数据"""
+import logging
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+logger = logging.getLogger(__name__)
+
+class JavMenuCrawler(Crawler):
+ id = CrawlerID.javmenu
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.javmenu.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ # JavMenu网页做得很不走心,将就了
+ url = f'{self.base_url}zh/{movie.dvdid}'
+ r = await self.client.get(url)
+ if r.history:
+ # 被重定向到主页说明找不到影片资源
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ tree = html.fromstring(r.text)
+ container = tree.xpath("//div[@class='col-md-9 px-0']")[0]
+ title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0]
+ # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站
+ title = title.replace(' | JAV目錄大全 | 每日更新', '')
+ title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '')
+ cover_tag = container.xpath("//div[@class='single-video']")
+ if len(cover_tag) > 0:
+ video_tag = cover_tag[0].find('video')
+ # URL首尾竟然也有空格……
+ movie.cover = video_tag.get('data-poster').strip()
+ # 预览影片改为blob了,无法获取
+ # movie.preview_video = video_tag.find('source').get('src').strip()
+ else:
+ cover_img_tag = container.xpath("//img[@class='lazy rounded']/@data-src")
+ if cover_img_tag:
+ movie.cover = cover_img_tag[0].strip()
+ info = container.xpath("//div[@class='card-body']")[0]
+ publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text
+ duration = info.xpath("div/span[contains(text(), '时长:')]")[0].getnext().text.replace('分钟', '')
+ producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()")
+ if producer:
+ movie.producer = producer[0]
+ genre_tags = info.xpath("//a[@class='genre']")
+ genre, genre_id = [], []
+ for tag in genre_tags:
+ items = tag.get('href').split('/')
+ pre_id = items[-3] + '/' + items[-1]
+ genre.append(tag.text.strip())
+ genre_id.append(pre_id)
+ # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠……
+ actress = info.xpath("div/span[contains(text(), '女优:')]/following-sibling::*/a/text()") or None
+ magnet_table = container.xpath("//table[contains(@class, 'magnet-table')]/tbody")
+ if magnet_table:
+ magnet_links = magnet_table[0].xpath("tr/td/a/@href")
+ # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以
+ movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links]
+ preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href")
+
+ if (not movie.cover) and preview_pics:
+ movie.cover = preview_pics[0]
+ movie.url = url
+ movie.title = title.replace(movie.dvdid, '').strip()
+ movie.preview_pics = preview_pics
+ movie.publish_date = publish_date
+ movie.duration = duration
+ movie.genre = genre
+ movie.genre_id = genre_id
+ movie.actress = actress
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await JavMenuCrawler.create()
+ movie = MovieInfo('FC2-718323')
+ # try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ # except Exception as e:
+ # print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/mgstage.py b/javsp/crawlers/sites/mgstage.py
new file mode 100644
index 000000000..bd9d76840
--- /dev/null
+++ b/javsp/crawlers/sites/mgstage.py
@@ -0,0 +1,127 @@
+"""从蚊香社-mgstage抓取数据"""
+import re
+import logging
+
+
+from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import Cfg, CrawlerID
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+class MgstageCrawler(Crawler):
+ id = CrawlerID.mgstage
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.mgstage.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ # 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
+ self.client.cookies = {'adc': '1'}
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ url = f'{self.base_url}/product/product_detail/{movie.dvdid}/'
+ resp = await self.client.get(url)
+ if resp.status_code == 403:
+ raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
+ # url不存在时会被重定向至主页。history非空时说明发生了重定向
+ elif resp.history:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ tree = html.fromstring(resp.text)
+ # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除
+ title = tree.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip()
+ container = tree.xpath("//div[@class='detail_left']")[0]
+ cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
+ # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表
+ actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()")
+ actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()")
+ actress = [i.strip() for i in actress_text + actress_link]
+ actress = [i for i in actress if i] # 移除空字符串
+ producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
+ duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0]
+ match = re.search(r'\d+', duration_str)
+ if match:
+ movie.duration = match.group(0)
+ dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0]
+ date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0]
+ publish_date = date_str.replace('/', '-')
+ serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()")
+ if serial_tag:
+ movie.serial = serial_tag[0].strip()
+ # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
+ # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip()
+ genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a")
+ genre = [i.text.strip() for i in genre_tags]
+ score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
+ match = re.search(r'^[\.\d]+', score_str)
+ if match:
+ score = float(match.group()) * 2
+ movie.score = f'{score:.2f}'
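+            # 例: 页面上的评分为4.5(5星制)时,这里换算为'9.00'(10分制)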
+ # plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签
+ plots = []
+ plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]")
+ for p in plot_p_tags:
+ children = p.getchildren()
+ # 没有children时表明plot不含有格式,此时简单地提取文本就可以
+ if not children:
+ plots.append(p.text_content())
+ continue
+ for child in children:
+ if child.tag == 'br' and plots[-1] != '\n':
+ plots.append('\n')
+ else:
+ if child.text:
+ plots.append(child.text)
+ if child.tail:
+ plots.append(child.tail)
+ plot = ''.join(plots).strip()
+ preview_pics = container.xpath("//a[@class='sample_image']/@href")
+
+ if Cfg().crawler.hardworking:
+ # 预览视频是点击按钮后再加载的,不在静态网页中
+ btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
+ video_pid = btn_url.split('/')[-1]
+ req_url = f'{self.base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
+ resp = await self.client.get(req_url)
+ j = resp.json()
+ video_url = j.get('url')
+ if video_url:
+ # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
+ preview_video = video_url.split('.ism/')[0] + '.mp4'
+ movie.preview_video = preview_video
+
+ movie.dvdid = dvdid
+ movie.url = url
+ movie.title = title
+ movie.cover = cover
+ movie.actress = actress
+ movie.producer = producer
+ movie.publish_date = publish_date
+ movie.genre = genre
+ movie.plot = plot
+ movie.preview_pics = preview_pics
+ movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
+
+
+if __name__ == "__main__":
+ async def test_main():
+ crawler = await MgstageCrawler.create()
+ movie = MovieInfo('ABF-153')
+ # try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ # except Exception as e:
+ # print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/njav.py b/javsp/crawlers/sites/njav.py
new file mode 100644
index 000000000..5787397c9
--- /dev/null
+++ b/javsp/crawlers/sites/njav.py
@@ -0,0 +1,150 @@
+"""从NJAV抓取数据"""
+import re
+import logging
+from typing import List
+
+from javsp.crawlers.exceptions import MovieNotFoundError
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from javsp.lib import strftime_to_minutes
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+def get_list_first(lst: List):
+    return lst[0] if lst else None
+
+class NjavCrawler(Crawler):
+ id = CrawlerID.njav
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.njav.tv/')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ return self
+
+ async def search_video(self, movie: MovieInfo) -> str:
+ id_uc = movie.dvdid
+ # 抓取网页
+ url = f'{self.base_url}ja/search?keyword={id_uc}'
+ resp = await self.client.get(url)
+ tree = html.fromstring(resp.text)
+ list = tree.xpath("//div[@class='box-item']/div[@class='detail']/a")
+ video_url = None
+ for item in list:
+ search_title = item.xpath("text()")[0]
+ if id_uc in search_title:
+ video_url = item.xpath("@href")
+ break
+ if id_uc.startswith("FC2-"):
+ fc2id = id_uc.replace('FC2-', '')
+ if "FC2" in search_title and fc2id in search_title:
+ video_url = item.xpath("@href")
+ break
+
+        return get_list_first(video_url)
+
+    async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """解析指定番号的影片数据"""
+ # 抓取网页
+        url = await self.search_video(movie)
+        if not url:
+            raise MovieNotFoundError(__name__, movie.dvdid)
+        url = self.base_url + "ja/" + url
+ resp = await self.client.get(url)
+ tree = html.fromstring(resp.text)
+ container = tree.xpath("//div[@class='container']/div/div[@class='col']")
+ if len(container) > 0:
+ container = container[0]
+ else:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0]
+ thumb_pic = container.xpath("//div[@id='player']/@data-poster")
+ plot = " ".join(container.xpath("//div[@class='description']/p/text()"))
+ magnet = container.xpath("//div[@class='magnet']/a/@href")
+ real_id = None
+ publish_date = None
+ duration_str = None
+ uncensored = None
+ preview_pics = None
+ preview_video = None
+ serial = None
+ publisher = None
+ producer = None
+ genre = []
+ actress = []
+
+ for item in container.xpath("//div[@class='detail-item']/div"):
+ item_title = item.xpath('span/text()')[0]
+ if "タグ:" in item_title:
+ genre += item.xpath("span")[1].xpath("a/text()")
+ elif "ジャンル:" in item_title:
+ genre += item.xpath("span")[1].xpath("a/text()")
+ elif "レーベル:" in item_title:
+ genre += item.xpath("span")[1].xpath("a/text()")
+ elif "女優:" in item_title:
+ actress = item.xpath("span")[1].xpath("a/text()")
+ elif "シリーズ:" in item_title:
+ serial = get_list_first(item.xpath("span")[1].xpath("a/text()"))
+ elif "メーカー:" in item_title:
+ producer = get_list_first(item.xpath("span")[1].xpath("a/text()"))
+ elif "コード:" in item_title:
+ real_id = get_list_first(item.xpath("span")[1].xpath("text()"))
+ elif "公開日:" in item_title:
+ publish_date = get_list_first(item.xpath("span")[1].xpath("text()"))
+ elif "再生時間:" in item_title:
+ duration_str = get_list_first(item.xpath("span")[1].xpath("text()"))
+
+ # 清除标题里的番号字符
+ keywords = [real_id, " "]
+ if movie.dvdid.startswith("FC2"):
+ keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]]
+ for keyword in keywords:
+ title = re.sub(re.escape(keyword), "", title, flags=re.I)
+
+ # 判断是否无码
+ uncensored_arr = magnet + [title]
+ for uncensored_str in uncensored_arr:
+ if 'uncensored' in uncensored_str.lower():
+ uncensored = True
+
+ movie.url = url
+ movie.title = title
+ movie.genre = genre
+ movie.actress = actress
+ movie.duration = str(strftime_to_minutes(duration_str))
+ movie.publish_date = publish_date
+ movie.publisher = publisher
+ movie.producer = producer
+ movie.uncensored = uncensored
+ movie.preview_pics = preview_pics
+ movie.preview_video = preview_video
+ movie.plot = plot
+ movie.serial = serial
+ movie.magnet = magnet
+
+ # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
+ if movie.preview_pics:
+ movie.cover = preview_pics[0]
+ else:
+ movie.cover = get_list_first(thumb_pic)
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await NjavCrawler.create()
+ movie = MovieInfo('012023_002')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/crawlers/sites/prestige.py b/javsp/crawlers/sites/prestige.py
new file mode 100644
index 000000000..bc0734554
--- /dev/null
+++ b/javsp/crawlers/sites/prestige.py
@@ -0,0 +1,101 @@
+"""从蚊香社-prestige抓取数据"""
+import re
+import logging
+
+
+
+from javsp.crawlers.exceptions import MovieNotFoundError, SiteBlocked
+from javsp.datatype import MovieInfo
+from javsp.network.utils import resolve_site_fallback
+from javsp.network.client import get_client
+from javsp.crawlers.interface import Crawler
+from javsp.config import CrawlerID
+from lxml import html
+
+
+logger = logging.getLogger(__name__)
+
+
+class PrestigeCrawler(Crawler):
+ id = CrawlerID.prestige
+
+ @classmethod
+ async def create(cls):
+ self = cls()
+ url = await resolve_site_fallback(self.id, 'https://www.prestige-av.com')
+ self.base_url = str(url)
+ self.client = get_client(url)
+ # prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面
+ # (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取)
+ self.client.cookies = {'__age_auth__': 'true'}
+ return self
+
+ async def crawl_and_fill(self, movie: MovieInfo) -> None:
+ """从网页抓取并解析指定番号的数据
+ Args:
+ movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
+ """
+ url = f'{self.base_url}/goods/goods_detail.php?sku={movie.dvdid}'
+ resp = await self.client.get(url)
+ if resp.status_code == 500:
+ # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试
+ raise MovieNotFoundError(__name__, movie.dvdid)
+ elif resp.status_code == 403:
+ raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
+ resp.raise_for_status()
+ tree = html.fromstring(resp.text)
+ container_tags = tree.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']")
+ if not container_tags:
+ raise MovieNotFoundError(__name__, movie.dvdid)
+
+ container = container_tags[0]
+ title = container.xpath("h1/span")[0].tail.strip()
+ cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0]
+ cover = cover.split('?')[0]
+ actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()")
+ # 移除女优名中的空格,使女优名与其他网站保持一致
+ actress = [i.strip().replace(' ', '') for i in actress]
+ duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content()
+ match = re.search(r'\d+', duration_str)
+ if match:
+ movie.duration = match.group(0)
+ date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0]
+ publish_date = date_url.split('?date=')[-1]
+ producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip()
+ dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0]
+ genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a")
+ genre = [tag.text.strip() for tag in genre_tags]
+ serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip()
+ plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip()
+ preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src")
+ preview_pics = [i.split('?')[0] for i in preview_pics]
+
+ # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效
+ movie.url = url
+ movie.dvdid = dvdid
+ movie.title = title
+ movie.cover = cover
+ movie.actress = actress
+ movie.publish_date = publish_date
+ movie.producer = producer
+ movie.genre = genre
+ movie.serial = serial
+ movie.plot = plot
+ movie.preview_pics = preview_pics
+ movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片
+
+
+
+if __name__ == "__main__":
+
+ async def test_main():
+ crawler = await PrestigeCrawler.create()
+ movie = MovieInfo('ABP-647')
+ try:
+ await crawler.crawl_and_fill(movie)
+ print(movie)
+ except Exception as e:
+ print(repr(e))
+
+ import asyncio
+ asyncio.run(test_main())
diff --git a/javsp/func.py b/javsp/func.py
index 042afea5c..6232747fd 100644
--- a/javsp/func.py
+++ b/javsp/func.py
@@ -16,6 +16,8 @@
from pathlib import Path
import importlib.metadata as meta
+from pydantic_core import Url
+
# 判断系统是否可以使用tk
USE_GUI = True
try:
@@ -23,7 +25,7 @@
except ImportError:
USE_GUI = False
-from javsp.web.base import *
+from javsp.network.utils import get_client, url_download
from javsp.lib import re_escape, resource_path
@@ -150,7 +152,7 @@ def split_by_punc(s):
return ls
-def check_update(allow_check=True, auto_update=True):
+async def check_update(allow_check=True, auto_update=True):
"""检查版本更新"""
def print_header(title, info=[]):
@@ -181,7 +183,9 @@ def print_header(title, info=[]):
release_url = 'https://github.com/Yuukiy/JavSP/releases/latest'
print('正在检查更新...', end='')
try:
- data = request_get(api_url, timeout=3).json()
+ client = get_client(Url(api_url))
+ resp = await client.get(api_url)
+ data = resp.json()
latest_version = data['tag_name']
release_time = utc2local(data['published_at'])
release_date = release_time.isoformat().split('T')[0]
@@ -233,7 +237,7 @@ def print_header(title, info=[]):
if auto_update:
try:
logger.info('尝试自动更新到新版本: ' + latest_version + " (按'Ctrl+C'取消)")
- download_update(data)
+ await download_update(data)
except KeyboardInterrupt:
logger.info('用户取消更新')
except Exception as e:
@@ -243,7 +247,7 @@ def print_header(title, info=[]):
print() # 输出空行,作为新旧程序的分隔
-def download_update(rel_info):
+async def download_update(rel_info):
"""下载版本更新
Args:
@@ -253,7 +257,8 @@ def download_update(rel_info):
down_url = rel_info['assets'][0]['browser_download_url']
asset_name = rel_info['assets'][0]['name']
desc = '下载更新' if shutil.get_terminal_size().columns < 120 else '下载更新: '+asset_name
- download(down_url, asset_name, desc=desc)
+ await url_download(Url(down_url), asset_name, desc=desc)
+ # download(down_url, asset_name, desc=desc)
if os.path.exists(asset_name):
# 备份原有的程序
basepath, ext = os.path.splitext(sys.executable)
@@ -270,8 +275,3 @@ def download_update(rel_info):
p.wait()
p.terminate()
sys.exit(0)
-
-
-if __name__ == "__main__":
- setattr(sys, 'javsp_version', 'v0')
- check_update()
diff --git a/javsp/network/client.py b/javsp/network/client.py
new file mode 100644
index 000000000..33232b677
--- /dev/null
+++ b/javsp/network/client.py
@@ -0,0 +1,45 @@
+"""网络请求的统一接口"""
+
+from typing import Dict
+from pydantic_core import Url
+
+from httpx import AsyncClient, AsyncHTTPTransport
+
+from javsp.config import Cfg
+
+default_headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
+}
+
+def get_proxy(unproxied: bool):
+ if Cfg().network.proxy_server is None or unproxied:
+ return None
+ else:
+ return str(Cfg().network.proxy_server)
+
+client_dictionary: Dict[str, AsyncClient] = {}
+def get_client(url: Url) -> AsyncClient:
+ if url.host is None:
+ raise Exception(f"Unknown url {url}")
+ else:
+ index = url.host
+ if index in client_dictionary:
+ return client_dictionary[index]
+ else:
+ unproxied = url.host in Cfg().network.unproxied
+
+ transport = AsyncHTTPTransport(
+ proxy=get_proxy(unproxied),
+ retries=Cfg().network.retries)
+
+ client = AsyncClient(
+ transport=transport,
+ # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效
+ headers=default_headers.copy(),
+ timeout=Cfg().network.timeout.total_seconds(),
+ follow_redirects=True,
+ )
+
+ client_dictionary[index] = client
+
+ return client
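+
+
+if __name__ == "__main__":
+    # 简单的可运行示例(仅作示意,假设配置文件能够正常加载):
+    # 同一host的多次调用会复用缓存里的同一个AsyncClient实例
+    c1 = get_client(Url('https://www.javbus.com/NANP-030'))
+    c2 = get_client(Url('https://www.javbus.com'))
+    print(c1 is c2)  # 预期输出: True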
diff --git a/javsp/network/utils.py b/javsp/network/utils.py
new file mode 100644
index 000000000..34caf68da
--- /dev/null
+++ b/javsp/network/utils.py
@@ -0,0 +1,105 @@
+from datetime import timedelta
+import logging
+import time
+from tqdm.asyncio import tqdm
+from typing import Any, Coroutine, NamedTuple
+import aiofiles
+from pretty_errors import os
+from pydantic.types import ByteSize
+from pydantic_core import Url
+
+from pydantic_extra_types.pendulum_dt import Duration
+
+from javsp.config import Cfg, CrawlerID
+from javsp.network.client import get_client
+
+import asyncio
+
+logger = logging.getLogger(__name__)
+
+class DownloadInfo(NamedTuple):
+ size: ByteSize
+ elapsed: timedelta
+
+ def get_rate(self) -> float:
+ """get rate of this download, unit: Mbps"""
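+        # 先用ByteSize.to('mbit')把下载量换算成兆比特,再除以耗时秒数即得Mbit/s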
+ return self.size.to("mbit") / self.elapsed.total_seconds()
+
+async def url_download(url: Url, target_path: str, desc: str | None = None) -> DownloadInfo:
+ url_str = str(url)
+
+ if url.scheme == 'file':
+ path: str = url.path
+ start_time: float = time.time()
+ async with aiofiles.open(path, "rb") as src:
+ async with aiofiles.open(target_path, "wb") as dest:
+ await dest.write(await src.read())
+ filesize = os.path.getsize(path)
+ elapsed = time.time() - start_time
+ return DownloadInfo(ByteSize(filesize), Duration(seconds=elapsed))
+
+ if not desc:
+ desc = url_str.split('/')[-1]
+
+ client = get_client(url)
+
+ # REF: https://www.python-httpx.org/advanced/clients/#monitoring-download-progress
+ async with aiofiles.open(target_path, 'wb') as download_file:
+ # NOTE: Create a client for each request for now, need further refactor
+ async with client.stream("GET", url_str) as response:
+ total = int(response.headers["Content-Length"])
+
+ with tqdm(total=total, unit_scale=True, unit_divisor=1024, unit="B") as progress:
+ num_bytes_downloaded = response.num_bytes_downloaded
+                async for chunk in response.aiter_bytes():
+ await download_file.write(chunk)
+ progress.update(response.num_bytes_downloaded - num_bytes_downloaded)
+ num_bytes_downloaded = response.num_bytes_downloaded
+
+ return DownloadInfo(ByteSize(response.num_bytes_downloaded), response.elapsed)
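+
+# 使用示例(仅作示意,需在异步环境中await;example.com仅为占位地址):
+#   info = await url_download(Url('https://example.com/cover.jpg'), 'cover.jpg', desc='下载封面')
+#   print(f'{info.get_rate():.2f} Mbps')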
+
+async def test_connect(url_str: str, timeout: Duration) -> bool:
+ """测试与指定url的连接,不使用映射,但使用代理"""
+ try:
+ client = get_client(Url(url_str))
+        response = await client.get(url_str, timeout=timeout.total_seconds())
+ return response.status_code == 200
+ except Exception as e:
+ logger.debug(f"Not connectable: {url_str}\n" + repr(e))
+ return False
+
+async def choose_one_connectable(urls: list[str]) -> str | None:
+ co_connectables: list[Coroutine[Any, Any, bool]] = []
+ for url in urls:
+ co_connectables.append(test_connect(url, Duration(seconds=3)))
+
+ connectables = await asyncio.gather(*co_connectables)
+ for i, connectable in enumerate(connectables):
+ if connectable:
+ return urls[i]
+ return None
+
+async def resolve_site_fallback(cr_id: CrawlerID, default: str) -> Url:
+ if cr_id not in Cfg().network.fallback:
+ return Url(default)
+
+ fallbacks = Cfg().network.fallback[cr_id]
+ chosen = await choose_one_connectable(fallbacks)
+ if chosen is None:
+ return Url(default)
+ else:
+ return Url(chosen)
+
+
+if __name__ == '__main__':
+ async def aentry():
+ print(await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com']))
+
+ # async def aentry():
+ # print(await test_connect("https://www.y78k.com/", Duration(seconds=3)))
+
+ asyncio.run(aentry())
diff --git a/javsp/web/translate.py b/javsp/translate.py
similarity index 94%
rename from javsp/web/translate.py
rename to javsp/translate.py
index 2e762cb15..1f202209a 100644
--- a/javsp/web/translate.py
+++ b/javsp/translate.py
@@ -6,7 +6,7 @@
import random
import logging
from pydantic_core import Url
-import requests
+import httpx
from hashlib import md5
@@ -15,7 +15,7 @@
from javsp.config import BaiduTranslateEngine, BingTranslateEngine, Cfg, ClaudeTranslateEngine, GoogleTranslateEngine, OpenAITranslateEngine, TranslateEngine
from javsp.datatype import MovieInfo
-from javsp.web.base import read_proxy
+from javsp.network.client import get_proxy
logger = logging.getLogger(__name__)
@@ -49,13 +49,7 @@ def translate_movie_info(info: MovieInfo):
return False
return True
-def translate(texts, engine: Union[
- BaiduTranslateEngine,
- BingTranslateEngine,
- ClaudeTranslateEngine,
- OpenAITranslateEngine,
- None
- ], actress=[]):
+def translate(texts, engine: TranslateEngine, actress=[]):
"""
翻译入口:对错误进行处理并且统一返回格式
@@ -146,7 +140,7 @@ def baidu_translate(texts, app_id, api_key, to='zh'):
wait = 1.0 - (now - last_access)
if wait > 0:
time.sleep(wait)
- r = requests.post(api_url, params=payload, headers=headers)
+ r = httpx.post(api_url, params=payload, headers=headers)
result = r.json()
baidu_translate._last_access = time.perf_counter()
return result
@@ -163,7 +157,7 @@ def bing_translate(texts, api_key, to='zh-Hans'):
'X-ClientTraceId': str(uuid.uuid4())
}
body = [{'text': texts}]
- r = requests.post(api_url, params=params, headers=headers, json=body)
+ r = httpx.post(api_url, params=params, headers=headers, json=body)
result = r.json()
return result
@@ -175,12 +169,12 @@ def google_trans(texts, to='zh_CN'):
# client参数的选择: https://github.com/lmk123/crx-selection-translate/issues/223#issue-184432017
global _google_trans_wait
url = f"https://translate.google.com.hk/translate_a/single?client=gtx&dt=t&dj=1&ie=UTF-8&sl=auto&tl={to}&q={texts}"
- proxies = read_proxy()
- r = requests.get(url, proxies=proxies)
+ proxies = get_proxy(False)
+ r = httpx.get(url, proxies=proxies)
while r.status_code == 429:
logger.warning(f"HTTP {r.status_code}: {r.reason}: Google翻译请求超限,将等待{_google_trans_wait}秒后重试")
time.sleep(_google_trans_wait)
- r = requests.get(url, proxies=proxies)
+ r = httpx.get(url, proxies=proxies)
if r.status_code == 429:
_google_trans_wait += random.randint(60, 90)
if r.status_code == 200:
@@ -204,7 +198,7 @@ def claude_translate(texts, api_key, to="zh_CN"):
"max_tokens": 1024,
"messages": [{"role": "user", "content": texts}],
}
- r = requests.post(api_url, headers=headers, json=data)
+ r = httpx.post(api_url, headers=headers, json=data)
if r.status_code == 200:
result = r.json().get("content", [{}])[0].get("text", "").strip()
else:
@@ -236,7 +230,7 @@ def openai_translate(texts, url: Url, api_key: str, model: str, to="zh_CN"):
"temperature": 0,
"max_tokens": 1024,
}
- r = requests.post(api_url, headers=headers, json=data)
+ r = httpx.post(api_url, headers=headers, json=data)
if r.status_code == 200:
if 'error' in r.json():
result = {
diff --git a/javsp/web/airav.py b/javsp/web/airav.py
deleted file mode 100644
index 22e9fdbf7..000000000
--- a/javsp/web/airav.py
+++ /dev/null
@@ -1,118 +0,0 @@
-"""从airav抓取数据"""
-import re
-import logging
-from html import unescape
-
-
-from javsp.web.base import Request
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.datatype import MovieInfo
-
-# 初始化Request实例
-request = Request(use_scraper=True)
-request.headers['Accept-Language'] = 'zh-TW,zh;q=0.9'
-# 近期airav服务器似乎不稳定,时好时坏,单次查询平均在17秒左右,timeout时间增加到20秒
-request.timeout = 20
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.airav.wiki'
-
-
-def search_movie(dvdid):
- """通过搜索番号获取指定的影片在网站上的ID"""
- # 部分影片的ID并不直接等于番号(如012717-360),此时需要尝试通过搜索来寻找影片
- page = 0
- count = 1
- result = []
- while len(result) < count:
- url = f'{base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}'
- r = request.get(url).json()
- # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"}
- if r['result']:
- result.extend(r['result'])
- count = r['count']
- page += 1
- else: # 结果为空,结束循环
- break
- # 如果什么都没搜索到,直接返回
- if not result:
- raise MovieNotFoundError(__name__, dvdid)
- # 排序,以优先选择更符合预期的结果(如'012717_472'对应的'1pondo_012717_472'和'_1pondo_012717_472')
- result.sort(key=lambda x:x['barcode'])
- # 从所有搜索结果中选择最可能的番号,返回它的URL
- target = dvdid.replace('-', '_')
- for item in result:
- # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''}
- barcode = item['barcode'].replace('-', '_')
- if target in barcode:
- return item['barcode']
- raise MovieNotFoundError(__name__, dvdid, result)
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # airav也提供简体,但是为了尽量保持女优名等与其他站点一致,抓取繁体的数据
- url = f'{base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
- resp = request.get(url).json()
- # 只在番号是纯数字时,尝试进行搜索,否则可能导致搜索到错误的影片信息
- if resp['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid):
- barcode = search_movie(movie.dvdid)
- if barcode:
- url = f'{base_url}/api/video/barcode/{barcode}?lng=zh-TW'
- resp = request.get(url).json()
- if resp['count'] == 0:
- raise MovieNotFoundError(__name__, movie.dvdid, resp)
-
- # 从API返回的数据中提取需要的字段
- # TODO: 数据中含有更多信息(如女优的中文&日文名对照),可能有助于未来功能扩展
- data = resp['result']
- dvdid = data['barcode']
- movie.dvdid = dvdid
- movie.url = base_url + '/video/' + dvdid
- # plot和title中可能含有HTML的转义字符,需要进行解转义处理
- movie.plot = unescape(data['description']) or None
- movie.cover = data['img_url']
- # airav的genre是以搜索关键词的形式组织的,没有特定的genre_id
- movie.genre = [i['name'] for i in data['tags']]
- movie.title = unescape(data['name'])
- movie.actress = [i['name'] for i in data['actors']]
- movie.publish_date = data['publish_date']
- movie.preview_pics = data['images'] or []
- if data['factories']:
- movie.producer = data['factories'][0]['name']
-
- if Cfg().crawler.hardworking:
- # 注意这里用的是获取的dvdid,而不是传入的movie.dvdid(如'1pondo_012717_472'与'012717_472')
- video_url = f"{base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
- resp = request.get(video_url).json()
- # 如果失败,结果如 {'msg': 'fail', 'status': 'fail'}
- if 'data' in resp:
- # 除url外还有url_cdn, url_hlx, url_hls_cdn字段,后两者为m3u8格式。目前将url作为预览视频的地址
- # TODO: 发现部分影片(如080719-976)的传统格式预览片错误
- movie.preview_video = resp['data'].get('url')
-
- # airav上部分影片会被标记为'馬賽克破壞版'等,这些影片的title、plot和genre都不再准确
- for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'):
- if movie.title and keyword in movie.title:
- movie.title = None
- movie.genre = []
- if movie.plot and keyword in movie.plot:
- movie.plot = None
- movie.genre = []
- if not any([movie.title, movie.plot, movie.genre]):
- break
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('DSAD-938')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
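
For reference, the barcode matching inside search_movie above reduces to the standalone sketch below (pick_barcode is an illustrative name, not part of the original module): results are sorted so the unprefixed barcode sorts ahead of variants such as '_1pondo_012717_472', and the first barcode containing the underscore-normalized ID wins.

def pick_barcode(dvdid: str, results: list[dict]) -> str | None:
    # sort so that '1pondo_012717_472' is tried before '_1pondo_012717_472'
    target = dvdid.replace('-', '_')
    for item in sorted(results, key=lambda x: x['barcode']):
        if target in item['barcode'].replace('-', '_'):
            return item['barcode']
    return None

# pick_barcode('012717-472', [{'barcode': '_1pondo_012717_472'},
#                             {'barcode': '1pondo_012717_472'}])
# -> '1pondo_012717_472'
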
diff --git a/javsp/web/arzon.py b/javsp/web/arzon.py
deleted file mode 100644
index 433949018..000000000
--- a/javsp/web/arzon.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""从arzon抓取数据"""
-import os
-import sys
-import logging
-import re
-
-from javsp.web.base import request_get
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-import requests
-from lxml import html
-
-logger = logging.getLogger(__name__)
-base_url = "https://www.arzon.jp"
-
-def get_cookie():
- # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
- skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1"
- session = requests.Session()
- session.get(skip_verify_url, timeout=(12, 7))
- return session.cookies.get_dict()
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- full_id = movie.dvdid
- cookies = get_cookie()
- url = f'{base_url}/itemlist.html?t=&m=all&s=&q={full_id}'
- # url = f'{base_url}/imagelist.html?q={full_id}'
- r = request_get(url, cookies, delay_raise=True)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
- data = html.fromstring(r.content)
-
- urls = data.xpath("//h2/a/@href")
- if len(urls) == 0:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- item_url = base_url + urls[0]
- e = request_get(item_url, cookies, delay_raise=True)
- item = html.fromstring(e.content)
-
- title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0]
- cover = item.xpath("//td[@align='center']//a/img/@src")[0]
- item_text = item.xpath("//div[@class='item_text']/text()")
- plot = [item.strip() for item in item_text if item.strip() != ''][0]
- preview_pics_arr = item.xpath("//div[@class='detail_img']//img/@src")
-    # 使用列表推导式添加 "https:" 并去除 "m_"
- preview_pics = [("https:" + url).replace("m_", "") for url in preview_pics_arr]
-
- container = item.xpath("//div[@class='item_register']/table//tr")
- for row in container:
- key = row.xpath("./td[1]/text()")[0]
- contents = row.xpath("./td[2]//text()")
- content = [item.strip() for item in contents if item.strip() != '']
- index = 0
- value = content[index] if content and index < len(content) else None
- if key == "AV女優:":
- movie.actress = content
- if key == "AVメーカー:":
- movie.producer = value
- if key == "AVレーベル:":
- video_type = value
- if key == "シリーズ:":
- movie.serial = value
- if key == "監督:":
- movie.director = value
- if key == "発売日:" and value:
- movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
- if key == "収録時間:" and value:
- movie.duration = re.search(r'([\d.]+)分', value).group(1)
- if key == "品番:":
- dvd_id = value
- elif key == "タグ:":
- genre = value
-
- genres = ''
- if video_type:
- genres = [video_type]
- if(genre != None):
- genres.append(genre)
-
- movie.genre = genres
- movie.url = item_url
- movie.title = title
- movie.plot = plot
- movie.cover = f'https:{cover}'
- movie.preview_pics = preview_pics
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('csct-011')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
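
Both arzon crawlers walk the same two-column key/value table; condensed into one helper (table_to_dict is an illustrative name, the XPath selectors are the ones used above), the pattern looks like this:

from lxml import html

def table_to_dict(page_content: bytes) -> dict[str, list[str]]:
    tree = html.fromstring(page_content)
    data = {}
    for row in tree.xpath("//div[@class='item_register']/table//tr"):
        key = row.xpath("./td[1]/text()")[0]
        values = [t.strip() for t in row.xpath("./td[2]//text()") if t.strip()]
        data[key] = values
    return data
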
diff --git a/javsp/web/arzon_iv.py b/javsp/web/arzon_iv.py
deleted file mode 100644
index 3ea7a322f..000000000
--- a/javsp/web/arzon_iv.py
+++ /dev/null
@@ -1,93 +0,0 @@
-"""从arzon抓取数据"""
-import os
-import sys
-import logging
-import re
-
-from javsp.web.base import request_get
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-import requests
-from lxml import html
-
-logger = logging.getLogger(__name__)
-base_url = "https://www.arzon.jp"
-
-def get_cookie():
- # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
- skip_verify_url = "http://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1"
- session = requests.Session()
- session.get(skip_verify_url, timeout=(12, 7))
- return session.cookies.get_dict()
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- full_id = movie.dvdid
- cookies = get_cookie()
- url = f'{base_url}/imagelist.html?q={full_id}'
- r = request_get(url, cookies, delay_raise=True)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
- data = html.fromstring(r.content)
-
- urls = data.xpath("//h2/a/@href")
- if len(urls) == 0:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- item_url = base_url + urls[0]
- e = request_get(item_url, cookies, delay_raise=True)
- item = html.fromstring(e.content)
-
- title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0]
- cover = item.xpath("//td[@align='center']//a/img/@src")[0]
- item_text = item.xpath("//div[@class='item_text']/text()")
- plot = [item.strip() for item in item_text if item.strip() != ''][0]
-
- container = item.xpath("//div[@class='item_register']/table//tr")
- for row in container:
- key = row.xpath("./td[1]/text()")[0]
- contents = row.xpath("./td[2]//text()")
- content = [item.strip() for item in contents if item.strip() != '']
- index = 0
- value = content[index] if content and index < len(content) else None
- if key == "タレント:":
- movie.actress = content
- if key == "イメージメーカー:":
- movie.producer = value
- if key == "イメージレーベル:":
- video_type = value
- if key == "監督:":
- movie.director = value
- if key == "発売日:" and value:
- movie.publish_date = re.search(r"\d{4}/\d{2}/\d{2}", value).group(0).replace("/", "-")
- if key == "収録時間:" and value:
- movie.duration = re.search(r'([\d.]+)分', value).group(1)
- if key == "品番:":
- dvd_id = value
- elif key == "タグ:":
- genre = value
-
- genres = ''
- if video_type:
- genres = [video_type]
- if(genre != None):
- genres.append(genre)
-
- movie.genre = genres
- movie.url = item_url
- movie.title = title
- movie.plot = plot
- movie.cover = f'https:{cover}'
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('KIDM-1137B')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/avsox.py b/javsp/web/avsox.py
deleted file mode 100644
index ea96d6cc3..000000000
--- a/javsp/web/avsox.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""从avsox抓取数据"""
-import logging
-
-from javsp.web.base import get_html
-from javsp.web.exceptions import *
-from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = str(Cfg().network.proxy_free[CrawlerID.avsox])
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # avsox无法直接跳转到影片的网页,因此先搜索再从搜索结果中寻找目标网页
- full_id = movie.dvdid
- if full_id.startswith('FC2-'):
- full_id = full_id.replace('FC2-', 'FC2-PPV-')
- html = get_html(f'{base_url}tw/search/{full_id}')
- ids = html.xpath("//div[@class='photo-info']/span/date[1]/text()")
- urls = html.xpath("//a[contains(@class, 'movie-box')]/@href")
- ids_lower = list(map(str.lower, ids))
- if full_id.lower() in ids_lower:
- url = urls[ids_lower.index(full_id.lower())]
- url = url.replace('/tw/', '/cn/', 1)
- else:
- raise MovieNotFoundError(__name__, movie.dvdid, ids)
-
- # 提取影片信息
- html = get_html(url)
- container = html.xpath("/html/body/div[@class='container']")[0]
- title = container.xpath("h3/text()")[0]
- cover = container.xpath("//a[@class='bigImage']/@href")[0]
- info = container.xpath("div/div[@class='col-md-3 info']")[0]
- dvdid = info.xpath("p/span[@style]/text()")[0]
- publish_date = info.xpath("p/span[text()='发行时间:']")[0].tail.strip()
- duration = info.xpath("p/span[text()='长度:']")[0].tail.replace('分钟', '').strip()
- producer, serial = None, None
- producer_tag = info.xpath("p[text()='制作商: ']")[0].getnext().xpath("a")
- if producer_tag:
- producer = producer_tag[0].text_content()
- serial_tag = info.xpath("p[text()='系列:']")
- if serial_tag:
- serial = serial_tag[0].getnext().xpath("a/text()")[0]
- genre = info.xpath("p/span[@class='genre']/a/text()")
- actress = container.xpath("//a[@class='avatar-box']/span/text()")
-
- movie.dvdid = dvdid.replace('FC2-PPV-', 'FC2-')
- movie.url = url
- movie.title = title.replace(dvdid, '').strip()
- movie.cover = cover
- movie.publish_date = publish_date
- movie.duration = duration
- movie.genre = genre
- movie.actress = actress
- if full_id.startswith('FC2-'):
- # avsox把FC2作品的拍摄者归类到'系列'而制作商固定为'FC2-PPV',这既不合理也与其他的站点不兼容,因此进行调整
- movie.producer = serial
- else:
- movie.producer = producer
- movie.serial = serial
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('082713-417')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/avwiki.py b/javsp/web/avwiki.py
deleted file mode 100644
index fbd4ecbb3..000000000
--- a/javsp/web/avwiki.py
+++ /dev/null
@@ -1,72 +0,0 @@
-"""从av-wiki抓取数据"""
-import logging
-
-
-from javsp.web.base import *
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-logger = logging.getLogger(__name__)
-base_url = 'https://av-wiki.net'
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- movie.url = url = f'{base_url}/{movie.dvdid}'
- resp = request_get(url, delay_raise=True)
- if resp.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- html = resp2html(resp)
-
- cover_tag = html.xpath("//header/div/a[@class='image-link-border']/img")
- if cover_tag:
- try:
- srcset = cover_tag[0].get('srcset').split(', ')
- src_set_urls = {}
- for src in srcset:
- url, width = src.split()
- width = int(width.rstrip('w'))
- src_set_urls[width] = url
- max_pic = sorted(src_set_urls.items(), key=lambda x:x[0], reverse=True)
- movie.cover = max_pic[0][1]
- except:
- movie.cover = cover_tag[0].get('src')
- body = html.xpath("//section[@class='article-body']")[0]
- title = body.xpath("div/p/text()")[0]
- title = title.replace(f"【{movie.dvdid}】", '')
- cite_url = body.xpath("div/cite/a/@href")[0]
- cite_url = cite_url.split('?aff=')[0]
- info = body.xpath("dl[@class='dltable']")[0]
- dt_txt_ls, dd_tags = info.xpath("dt/text()"), info.xpath("dd")
- data = {}
- for dt_txt, dd in zip(dt_txt_ls, dd_tags):
- dt_txt = dt_txt.strip()
- a_tag = dd.xpath('a')
- if len(a_tag) == 0:
- dd_txt = dd.text.strip()
- else:
- dd_txt = [i.text.strip() for i in a_tag]
- if isinstance(dd_txt, list) and dt_txt != 'AV女優名': # 只有女优名以列表的数据格式保留
- dd_txt = dd_txt[0]
- data[dt_txt] = dd_txt
-
- ATTR_MAP = {'メーカー': 'producer', 'AV女優名': 'actress', 'メーカー品番': 'dvdid', 'シリーズ': 'serial', '配信開始日': 'publish_date'}
- for key, attr in ATTR_MAP.items():
- setattr(movie, attr, data.get(key))
- movie.title = title
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
-
- movie = MovieInfo('259LUXU-593')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
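
The srcset handling above picks the widest candidate image and falls back to the plain src attribute on any parsing error; as a standalone sketch (helper name illustrative):

def pick_largest_from_srcset(srcset: str) -> str:
    # 'a.jpg 300w, b.jpg 768w, c.jpg 1024w' -> 'c.jpg'
    candidates = []
    for part in srcset.split(', '):
        url, width = part.split()
        candidates.append((int(width.rstrip('w')), url))
    return max(candidates)[1]
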
diff --git a/javsp/web/base.py b/javsp/web/base.py
deleted file mode 100644
index 717b5168a..000000000
--- a/javsp/web/base.py
+++ /dev/null
@@ -1,270 +0,0 @@
-"""网络请求的统一接口"""
-import os
-import sys
-import time
-import shutil
-import logging
-import requests
-import contextlib
-import cloudscraper
-import lxml.html
-from tqdm import tqdm
-from lxml import etree
-from lxml.html.clean import Cleaner
-from requests.models import Response
-
-
-from javsp.config import Cfg
-from javsp.web.exceptions import *
-
-
-__all__ = ['Request', 'get_html', 'post_html', 'request_get', 'resp2html', 'is_connectable', 'download', 'get_resp_text', 'read_proxy']
-
-
-headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
-
-logger = logging.getLogger(__name__)
-# 删除js脚本相关的tag,避免网页检测到没有js运行环境时强行跳转,影响调试
-cleaner = Cleaner(kill_tags=['script', 'noscript'])
-
-def read_proxy():
- if Cfg().network.proxy_server is None:
- return {}
- else:
- proxy = str(Cfg().network.proxy_server)
- return {'http': proxy, 'https': proxy}
-
-# 与网络请求相关的功能汇总到一个模块中以方便处理,但是不同站点的抓取器又有自己的需求(针对不同网站
-# 需要使用不同的UA、语言等)。每次都传递参数很麻烦,而且会面临函数参数越加越多的问题。因此添加这个
-# 处理网络请求的类,它带有默认的属性,但是也可以在各个抓取器模块里进行定制
-class Request():
- """作为网络请求出口并支持各个模块定制功能"""
- def __init__(self, use_scraper=False) -> None:
- # 必须使用copy(),否则各个模块对headers的修改都将会指向本模块中定义的headers变量,导致只有最后一个对headers的修改生效
- self.headers = headers.copy()
- self.cookies = {}
-
- self.proxies = read_proxy()
- self.timeout = Cfg().network.timeout.total_seconds()
- if not use_scraper:
- self.scraper = None
- self.__get = requests.get
- self.__post = requests.post
- self.__head = requests.head
- else:
- self.scraper = cloudscraper.create_scraper()
- self.__get = self._scraper_monitor(self.scraper.get)
- self.__post = self._scraper_monitor(self.scraper.post)
- self.__head = self._scraper_monitor(self.scraper.head)
-
- def _scraper_monitor(self, func):
- """监控cloudscraper的工作状态,遇到不支持的Challenge时尝试退回常规的requests请求"""
- def wrapper(*args, **kw):
- try:
- return func(*args, **kw)
- except Exception as e:
- logger.debug(f"无法通过CloudFlare检测: '{e}', 尝试退回常规的requests请求")
- if func == self.scraper.get:
- return requests.get(*args, **kw)
- else:
- return requests.post(*args, **kw)
- return wrapper
-
- def get(self, url, delay_raise=False):
- r = self.__get(url,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
- if not delay_raise:
- r.raise_for_status()
- return r
-
- def post(self, url, data, delay_raise=False):
- r = self.__post(url,
- data=data,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
- if not delay_raise:
- r.raise_for_status()
- return r
-
- def head(self, url, delay_raise=True):
- r = self.__head(url,
- headers=self.headers,
- proxies=self.proxies,
- cookies=self.cookies,
- timeout=self.timeout)
- if not delay_raise:
- r.raise_for_status()
- return r
-
- def get_html(self, url):
- r = self.get(url)
- html = resp2html(r)
- return html
-
-
-class DownloadProgressBar(tqdm):
- def update_to(self, b=1, bsize=1, tsize=None):
- if tsize is not None:
- self.total = tsize
- self.update(b * bsize - self.n)
-
-
-def request_get(url, cookies={}, timeout=None, delay_raise=False):
- """获取指定url的原始请求"""
- if timeout is None:
- timeout = Cfg().network.timeout.seconds
-
- r = requests.get(url, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout)
- if not delay_raise:
- if r.status_code == 403 and b'>Just a moment...<' in r.content:
- raise SiteBlocked(f"403 Forbidden: 无法通过CloudFlare检测: {url}")
- else:
- r.raise_for_status()
- return r
-
-
-def request_post(url, data, cookies={}, timeout=None, delay_raise=False):
- """向指定url发送post请求"""
- if timeout is None:
- timeout = Cfg().network.timeout.seconds
- r = requests.post(url, data=data, headers=headers, proxies=read_proxy(), cookies=cookies, timeout=timeout)
- if not delay_raise:
- r.raise_for_status()
- return r
-
-
-def get_resp_text(resp: Response, encoding=None):
- """提取Response的文本"""
- if encoding:
- resp.encoding = encoding
- else:
- resp.encoding = resp.apparent_encoding
- return resp.text
-
-
-def get_html(url, encoding='utf-8'):
- """使用get方法访问指定网页并返回经lxml解析后的document"""
- resp = request_get(url)
- text = get_resp_text(resp, encoding=encoding)
- html = lxml.html.fromstring(text)
- html.make_links_absolute(url, resolve_base_href=True)
- # 清理功能仅应在需要的时候用来调试网页(如prestige),否则可能反过来影响调试(如JavBus)
- # html = cleaner.clean_html(html)
- if hasattr(sys, 'javsp_debug_mode'):
- lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
- return html
-
-
-def resp2html(resp, encoding='utf-8') -> lxml.html.HtmlComment:
- """将request返回的response转换为经lxml解析后的document"""
- text = get_resp_text(resp, encoding=encoding)
- html = lxml.html.fromstring(text)
- html.make_links_absolute(resp.url, resolve_base_href=True)
- # html = cleaner.clean_html(html)
- if hasattr(sys, 'javsp_debug_mode'):
- lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
- return html
-
-
-def post_html(url, data, encoding='utf-8', cookies={}):
- """使用post方法访问指定网页并返回经lxml解析后的document"""
- resp = request_post(url, data, cookies=cookies)
- text = get_resp_text(resp, encoding=encoding)
- html = lxml.html.fromstring(text)
- # jav321提供ed2k形式的资源链接,其中的非ASCII字符可能导致转换失败,因此要先进行处理
- ed2k_tags = html.xpath("//a[starts-with(@href,'ed2k://')]")
- for tag in ed2k_tags:
- tag.attrib['ed2k'], tag.attrib['href'] = tag.attrib['href'], ''
- html.make_links_absolute(url, resolve_base_href=True)
- for tag in ed2k_tags:
- tag.attrib['href'] = tag.attrib['ed2k']
- tag.attrib.pop('ed2k')
- # html = cleaner.clean_html(html)
- # lxml.html.open_in_browser(html, encoding=encoding) # for develop and debug
- return html
-
-
-def dump_xpath_node(node, filename=None):
- """将xpath节点dump到文件"""
- if not filename:
- filename = node.tag + '.html'
- with open(filename, 'wt', encoding='utf-8') as f:
- content = etree.tostring(node, pretty_print=True).decode('utf-8')
- f.write(content)
-
-
-def is_connectable(url, timeout=3):
- """测试与指定url的连接"""
- try:
- r = requests.get(url, headers=headers, timeout=timeout)
- return True
- except requests.exceptions.RequestException as e:
- logger.debug(f"Not connectable: {url}\n" + repr(e))
- return False
-
-
-def urlretrieve(url, filename=None, reporthook=None, headers=None):
-    """使用requests实现urlretrieve"""
-    # https://blog.csdn.net/qq_38282706/article/details/80253447
-    if "arzon" in url:
-        headers["Referer"] = "https://www.arzon.jp/"
- with contextlib.closing(requests.get(url, headers=headers,
- proxies=read_proxy(), stream=True)) as r:
- header = r.headers
- with open(filename, 'wb+') as fp:
- bs = 1024
- size = -1
- blocknum = 0
- if "content-length" in header:
- size = int(header["Content-Length"]) # 文件总大小(理论值)
- if reporthook: # 写入前运行一次回调函数
- reporthook(blocknum, bs, size)
- for chunk in r.iter_content(chunk_size=1024):
- if chunk:
- fp.write(chunk)
- fp.flush()
- blocknum += 1
- if reporthook:
- reporthook(blocknum, bs, size) # 每写入一次运行一次回调函数
-
-
-def download(url, output_path, desc=None):
- """下载指定url的资源"""
- # 支持“下载”本地资源,以供fc2fan的本地镜像所使用
- if not url.startswith('http'):
- start_time = time.time()
- shutil.copyfile(url, output_path)
- filesize = os.path.getsize(url)
- elapsed = time.time() - start_time
- info = {'total': filesize, 'elapsed': elapsed, 'rate': filesize/elapsed}
- return info
- if not desc:
- desc = url.split('/')[-1]
- referrer = headers.copy()
- referrer['referer'] = url[:url.find('/', 8)+1] # 提取base_url部分
- with DownloadProgressBar(unit='B', unit_scale=True,
- miniters=1, desc=desc, leave=False) as t:
- urlretrieve(url, filename=output_path, reporthook=t.update_to, headers=referrer)
- info = {k: t.format_dict[k] for k in ('total', 'elapsed', 'rate')}
- return info
-
-
-def open_in_chrome(url, new=0, autoraise=True):
- """使用指定的Chrome Profile打开url,便于调试"""
- import subprocess
- chrome = R'C:\Program Files\Google\Chrome\Application\chrome.exe'
- subprocess.run(f'"{chrome}" --profile-directory="Profile 2" {url}', shell=True)
-
-import webbrowser
-webbrowser.open = open_in_chrome
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- download('https://www.javbus.com/pics/cover/6n54_b.jpg', 'cover.jpg')
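
base.py was the requests-based network layer removed by this patch. Its actual replacement lives elsewhere in the repository and is not shown here, so the following is only a sketch of an equivalent streaming download built on httpx, mirroring the removed urlretrieve()/download() pair (Referer header, chunked writes, tqdm progress); stream_download is an illustrative name.

import httpx
from tqdm import tqdm

def stream_download(url: str, output_path: str, referer: str | None = None) -> None:
    headers = {'User-Agent': 'Mozilla/5.0'}
    if referer:
        headers['Referer'] = referer
    with httpx.stream('GET', url, headers=headers, follow_redirects=True) as r:
        r.raise_for_status()
        total = int(r.headers.get('Content-Length', 0)) or None
        with open(output_path, 'wb') as fp, tqdm(total=total, unit='B', unit_scale=True, leave=False) as bar:
            for chunk in r.iter_bytes(chunk_size=1024):
                fp.write(chunk)
                bar.update(len(chunk))
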
diff --git a/javsp/web/dl_getchu.py b/javsp/web/dl_getchu.py
deleted file mode 100644
index 15267f1f7..000000000
--- a/javsp/web/dl_getchu.py
+++ /dev/null
@@ -1,122 +0,0 @@
-"""从dl.getchu官网抓取数据"""
-import re
-import logging
-
-from javsp.web.base import resp2html, request_get
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-logger = logging.getLogger(__name__)
-
-# https://dl.getchu.com/i/item4045373
-base_url = 'https://dl.getchu.com'
-# dl.getchu用utf-8会乱码
-base_encode = 'euc-jp'
-
-
-def get_movie_title(html):
- container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[2]")
- if len(container) > 0:
- container = container[0]
- rows = container.xpath('.//tr')
- title = ''
- for row in rows:
- for cell in row.xpath('.//td/div'):
- # 获取单元格文本内容
- if cell.text:
- title = str(cell.text).strip()
- return title
-
-
-def get_movie_img(html, getchu_id):
- img_src = ''
- container = html.xpath(f'//img[contains(@src, "{getchu_id}top.jpg")]')
- if len(container) > 0:
- container = container[0]
- img_src = container.get('src')
- return img_src
-
-
-def get_movie_preview(html, getchu_id):
- preview_pics = []
- container = html.xpath(f'//img[contains(@src, "{getchu_id}_")]')
- if len(container) > 0:
- for c in container:
- preview_pics.append(c.get('src'))
- return preview_pics
-
-
-DURATION_PATTERN = re.compile(r'(?:動画)?(\d+)分')
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 去除番号中的'GETCHU'字样
- id_uc = movie.dvdid.upper()
- if not id_uc.startswith('GETCHU-'):
- raise ValueError('Invalid GETCHU number: ' + movie.dvdid)
- getchu_id = id_uc.replace('GETCHU-', '')
- # 抓取网页
- url = f'{base_url}/i/item{getchu_id}'
- r = request_get(url, delay_raise=True)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- html = resp2html(r, base_encode)
- container = html.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]")
- if len(container) > 0:
- container = container[0]
- # 将表格提取为键值对
- rows = container.xpath('.//table/tr')
- kv_rows = [i for i in rows if len(i) == 2]
- data = {}
- for row in kv_rows:
- # 获取单元格文本内容
- key = row.xpath("td[@class='bluetext']/text()")[0]
-            # 是否包含a标签: 有的属性是用a标签表示的,不是text
- a_tags = row.xpath("td[2]/a")
- if a_tags:
- value = [i.text for i in a_tags]
- else:
- # 获取第2个td标签的内容(下标从1开始计数)
- value = row.xpath("td[2]/text()")
- data[key] = value
-
- for key, value in data.items():
- if key == 'サークル':
- movie.producer = value[0]
- elif key == '作者':
- # 暂时没有在getchu找到多个actress的片子
- movie.actress = [i.strip() for i in value]
- elif key == '画像数&ページ数':
- match = DURATION_PATTERN.search(' '.join(value))
- if match:
- movie.duration = match.group(1)
- elif key == '配信開始日':
- movie.publish_date = value[0].replace('/', '-')
- elif key == '趣向':
- movie.genre = value
- elif key == '作品内容':
- idx = -1
- for i, line in enumerate(value):
- if line.lstrip().startswith('※'):
- idx = i
- break
- movie.plot = ''.join(value[:idx])
-
- movie.title = get_movie_title(html)
- movie.cover = get_movie_img(html, getchu_id)
- movie.preview_pics = get_movie_preview(html, getchu_id)
- movie.dvdid = id_uc
- movie.url = url
-
-
-if __name__ == "__main__":
- import pretty_errors
-
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('getchu-4041026')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/fanza.py b/javsp/web/fanza.py
deleted file mode 100644
index e975c4c8f..000000000
--- a/javsp/web/fanza.py
+++ /dev/null
@@ -1,231 +0,0 @@
-"""从fanza抓取数据"""
-import os
-import re
-import sys
-import json
-import logging
-from typing import Dict, List, Tuple
-
-
-from javsp.web.base import Request, resp2html
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.dmm.co.jp'
-# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
-request = Request()
-request.cookies = {'age_check_done': '1'}
-request.headers['Accept-Language'] = 'ja,en-US;q=0.9'
-
-
-_PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1}
-_TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1}
-def sort_search_result(result: List[Dict]):
- """排序搜索结果"""
- scores = {i['url']:(_PRODUCT_PRIORITY.get(i['product'], 0), _TYPE_PRIORITY.get(i['type'], 0)) for i in result}
- sorted_result = sorted(result, key=lambda x:scores[x['url']], reverse=True)
- return sorted_result
-
-
-def get_urls_of_cid(cid: str) -> Tuple[str, str]:
- """搜索cid可能的影片URL"""
- r = request.get(f"https://www.dmm.co.jp/search/?redirect=1&enc=UTF-8&category=&searchstr={cid}&commit.x=0&commit.y=0")
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, cid)
- r.raise_for_status()
- html = resp2html_wrapper(r)
- result = html.xpath("//ul[@id='list']/li/div/p/a/@href")
- parsed_result = {}
- for url in result:
- items = url.split('/')
- type_, cid = None, None
- for i, part in enumerate(items):
- if part == '-':
- product, type_ = items[i-2], items[i-1]
- elif part.startswith('cid='):
- cid = part[4:]
- new_url = '/'.join(i for i in items if not i.startswith('?')) + '/'
- parsed_result.setdefault(cid, []).append({'product': product, 'type': type_, 'url': new_url})
- break
- if cid not in parsed_result:
- if len(result) > 0:
- logger.debug(f"Unknown URL in search result: " + ', '.join(result))
- raise MovieNotFoundError(__name__, cid)
- sorted_result = sort_search_result(parsed_result[cid])
- return sorted_result
-
-
-def resp2html_wrapper(resp):
- html = resp2html(resp)
- if 'not available in your region' in html.text_content():
- raise SiteBlocked('FANZA不允许从当前IP所在地区访问,请检查你的网络和代理服务器设置')
- elif '/login/' in resp.url:
- raise SiteBlocked('FANZA要求当前IP登录账号才可访问,请尝试更换为日本IP')
- return html
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- default_url = f'{base_url}/digital/videoa/-/detail/=/cid={movie.cid}/'
- r0 = request.get(default_url, delay_raise=True)
- if r0.status_code == 404:
- urls = get_urls_of_cid(movie.cid)
- for d in urls:
- func_name = f"parse_{d['type']}_page"
- if func_name in globals():
- parse_func = globals()[func_name]
- else:
- logger.debug(f"不知道怎么解析 fanza {d['type']} 的页面: {d['url']}")
- continue
- r = request.get(d['url'])
- html = resp2html_wrapper(r)
- try:
- parse_func(movie, html)
- movie.url = d['url']
- break
- except:
- logger.debug(f"Fail to parse {d['url']}", exc_info=True)
- if d is urls[-1]:
- logger.warning(f"在fanza查找到的cid={movie.cid}的影片页面均解析失败")
- raise
- else:
- html = resp2html_wrapper(r0)
- parse_videoa_page(movie, html)
- movie.url = default_url
-
-
-def parse_videoa_page(movie: MovieInfo, html):
- """解析AV影片的页面布局"""
- title = html.xpath("//div[@class='hreview']/h1/text()")[0]
- # 注意: 浏览器在渲染时会自动加上了'tbody'字段,但是原始html网页中并没有,因此xpath解析时还是要按原始网页的来
- container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
- cover = container.xpath("//div[@id='sample-video']/a/@href")[0]
- # 采用'配信開始日'作为发布日期: https://www.zhihu.com/question/57513172/answer/153219083
- date_tag = container.xpath("//td[text()='配信開始日:']/following-sibling::td/text()")
- if date_tag:
- movie.publish_date = date_tag[0].strip().replace('/', '-')
- duration_str = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")[0].strip()
- match = re.search(r'\d+', duration_str)
- if match:
- movie.duration = match.group(0)
- # 女优、导演、系列:字段不存在时,匹配将得到空列表。暂未发现有名字不显示在a标签中的情况
- actress = container.xpath("//span[@id='performer']/a/text()")
- director_tag = container.xpath("//td[text()='監督:']/following-sibling::td/a/text()")
- if director_tag:
- movie.director = director_tag[0].strip()
- serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
- if serial_tag:
- movie.serial = serial_tag[0].strip()
- producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
- if producer_tag:
- movie.producer = producer_tag[0].strip()
- # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
- # label_tag = container.xpath("//td[text()='レーベル:']/following-sibling::td/a/text()")
- # if label_tag:
- # label = label_tag[0].strip()
- # fanza会把促销信息也写进genre……因此要根据tag指向的链接类型进行筛选
- genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'?keyword=') or contains(@href,'article=keyword')]")
- genre, genre_id = [], []
- for tag in genre_tags:
- genre.append(tag.text.strip())
- genre_id.append(tag.get('href').split('=')[-1].strip('/'))
- cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
- plot = container.xpath("//div[contains(@class, 'mg-b20 lh4')]/text()")[0].strip()
- preview_pics = container.xpath("//a[@name='sample-image']/img/@src")
- score_tag = container.xpath("//p[@class='d-review__average']/strong/text()")
- if score_tag:
- match = re.search(r'\d+', score_tag[0].strip())
- if match:
- score = float(match.group()) * 2
- movie.score = f'{score:.2f}'
- else:
- score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
- movie.score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
-
- if Cfg().crawler.hardworking:
- # 预览视频是动态加载的,不在静态网页中
- video_url = f'{base_url}/service/digitalapi/-/html5_player/=/cid={movie.cid}'
- html2 = request.get_html(video_url)
- # 目前用到js脚本的地方不多,所以不使用专门的js求值模块,先用正则提取文本然后用json解析数据
- script = html2.xpath("//script[contains(text(),'getElementById(\"dmmplayer\")')]/text()")[0].strip()
- match = re.search(r'\{.*\}', script)
-        # 主要是为了捕捉json.loads的异常,但是也借助try-except判断正则表达式是否匹配
- try:
- data = json.loads(match.group())
- video_url = data.get('src')
- if video_url and video_url.startswith('//'):
- video_url = 'https:' + video_url
- movie.preview_video = video_url
- except Exception as e:
- logger.debug('解析视频地址时异常: ' + repr(e))
-
- movie.cid = cid
- movie.title = title
- movie.cover = cover
- movie.actress = actress
- movie.genre = genre
- movie.genre_id = genre_id
- movie.plot = plot
- movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-def parse_anime_page(movie: MovieInfo, html):
- """解析动画影片的页面布局"""
- title = html.xpath("//h1[@id='title']/text()")[0]
- container = html.xpath("//table[@class='mg-b12']/tr/td")[0]
- cover = container.xpath("//img[@name='package-image']/@src")[0]
- date_str = container.xpath("//td[text()='発売日:']/following-sibling::td/text()")[0].strip()
- publish_date = date_str.replace('/', '-')
- duration_tag = container.xpath("//td[text()='収録時間:']/following-sibling::td/text()")
- if duration_tag:
- movie.duration = duration_tag[0].strip().replace('分', '')
- serial_tag = container.xpath("//td[text()='シリーズ:']/following-sibling::td/a/text()")
- if serial_tag:
- movie.serial = serial_tag[0].strip()
- producer_tag = container.xpath("//td[text()='メーカー:']/following-sibling::td/a/text()")
- if producer_tag:
- movie.producer = producer_tag[0].strip()
- genre_tags = container.xpath("//td[text()='ジャンル:']/following-sibling::td/a[contains(@href,'article=keyword')]")
- genre, genre_id = [], []
- for tag in genre_tags:
- genre.append(tag.text.strip())
- genre_id.append(tag.get('href').split('=')[-1].strip('/'))
- cid = container.xpath("//td[text()='品番:']/following-sibling::td/text()")[0].strip()
- plot = container.xpath("//div[@class='mg-b20 lh4']/p")[0].text_content().strip()
- preview_pics = container.xpath("//a[@name='sample-image']/img/@data-lazy")
- score_img = container.xpath("//td[text()='平均評価:']/following-sibling::td/img/@src")[0]
- score = int(score_img.split('/')[-1].split('.')[0]) # 00, 05 ... 50
-
- movie.cid = cid
- movie.title = title
- movie.cover = cover
- movie.publish_date = publish_date
- movie.genre = genre
- movie.genre_id = genre_id
- movie.plot = plot
- movie.score = f'{score/5:.2f}' # 转换为10分制
- movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-# parse_dvd_page = parse_videoa_page # 118wtktabf067
-parse_ppr_page = parse_videoa_page
-parse_nikkatsu_page = parse_videoa_page
-parse_doujin_page = parse_anime_page
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo(cid='d_aisoft3356')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
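
The ordering applied by sort_search_result above can be shown in isolation (constants copied from the module, sort_results is an illustrative name): digital/videoa listings are preferred over mono/dvd and the rest, so the most reliable page layout gets parsed first.

PRODUCT_PRIORITY = {'digital': 10, 'mono': 5, 'monthly': 2, 'rental': 1}
TYPE_PRIORITY = {'videoa': 10, 'anime': 8, 'nikkatsu': 6, 'doujin': 4, 'dvd': 3, 'ppr': 2, 'paradisetv': 1}

def sort_results(results: list[dict]) -> list[dict]:
    return sorted(
        results,
        key=lambda x: (PRODUCT_PRIORITY.get(x['product'], 0), TYPE_PRIORITY.get(x['type'], 0)),
        reverse=True,
    )

# sort_results([{'product': 'mono', 'type': 'dvd', 'url': 'b'},
#               {'product': 'digital', 'type': 'videoa', 'url': 'a'}])
# -> the 'digital'/'videoa' entry comes first
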
diff --git a/javsp/web/fc2.py b/javsp/web/fc2.py
deleted file mode 100644
index 66be7ae4e..000000000
--- a/javsp/web/fc2.py
+++ /dev/null
@@ -1,105 +0,0 @@
-"""从FC2官网抓取数据"""
-import logging
-
-
-from javsp.web.base import get_html, request_get, resp2html
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.lib import strftime_to_minutes
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://adult.contents.fc2.com'
-
-
-def get_movie_score(fc2_id):
- """通过评论数据来计算FC2的影片评分(10分制),无法获得评分时返回None"""
- html = get_html(f'{base_url}/article/{fc2_id}/review')
- review_tags = html.xpath("//ul[@class='items_comment_headerReviewInArea']/li")
- reviews = {}
- for tag in review_tags:
- score = int(tag.xpath("div/span/text()")[0])
- vote = int(tag.xpath("span")[0].text_content())
- reviews[score] = vote
- total_votes = sum(reviews.values())
- if (total_votes >= 2): # 至少也该有两个人评价才有参考意义一点吧
- summary = sum([k*v for k, v in reviews.items()])
- final_score = summary / total_votes * 2 # 乘以2转换为10分制
- return final_score
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 去除番号中的'FC2'字样
- id_uc = movie.dvdid.upper()
- if not id_uc.startswith('FC2-'):
- raise ValueError('Invalid FC2 number: ' + movie.dvdid)
- fc2_id = id_uc.replace('FC2-', '')
- # 抓取网页
- url = f'{base_url}/article/{fc2_id}/'
- resp = request_get(url)
- if '/id.fc2.com/' in resp.url:
- raise SiteBlocked('FC2要求当前IP登录账号才可访问,请尝试更换为日本IP')
- html = resp2html(resp)
- container = html.xpath("//div[@class='items_article_left']")
- if len(container) > 0:
- container = container[0]
- else:
- raise MovieNotFoundError(__name__, movie.dvdid)
- # FC2 标题增加反爬乱码,使用数组合并标题
- title_arr = container.xpath("//div[@class='items_article_headerInfo']/h3/text()")
- title = ''.join(title_arr)
- thumb_tag = container.xpath("//div[@class='items_article_MainitemThumb']")[0]
- thumb_pic = thumb_tag.xpath("span/img/@src")[0]
- duration_str = thumb_tag.xpath("span/p[@class='items_article_info']/text()")[0]
- # FC2没有制作商和发行商的区分,作为个人市场,影片页面的'by'更接近于制作商
- producer = container.xpath("//li[text()='by ']/a/text()")[0]
- genre = container.xpath("//a[@class='tag tagTag']/text()")
- date_str = container.xpath("//div[@class='items_article_Releasedate']/p/text()")[0]
- publish_date = date_str[-10:].replace('/', '-') # '販売日 : 2017/11/30'
- preview_pics = container.xpath("//ul[@data-feed='sample-images']/li/a/@href")
-
- if Cfg().crawler.hardworking:
- # 通过评论数据来计算准确的评分
- score = get_movie_score(fc2_id)
- if score:
- movie.score = f'{score:.2f}'
- # 预览视频是动态加载的,不在静态网页中
- desc_frame_url = container.xpath("//section[@class='items_article_Contents']/iframe/@src")[0]
- key = desc_frame_url.split('=')[-1] # /widget/article/718323/description?ac=60fc08fa...
- api_url = f'{base_url}/api/v2/videos/{fc2_id}/sample?key={key}'
- r = request_get(api_url).json()
- movie.preview_video = r['path']
- else:
- # 获取影片评分。影片页面的评分只能粗略到星级,且没有分数,要通过类名来判断,如'items_article_Star5'表示5星
- score_tag_attr = container.xpath("//a[@class='items_article_Stars']/p/span/@class")[0]
- score = int(score_tag_attr[-1]) * 2
- movie.score = f'{score:.2f}'
-
- movie.dvdid = id_uc
- movie.url = url
- movie.title = title
- movie.genre = genre
- movie.producer = producer
- movie.duration = str(strftime_to_minutes(duration_str))
- movie.publish_date = publish_date
- movie.preview_pics = preview_pics
- # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
- if movie.preview_pics:
- movie.cover = preview_pics[0]
- else:
- movie.cover = thumb_pic
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('FC2-718323')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
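
The rating logic in get_movie_score above is a vote-weighted mean of the 1-5 star counts, doubled onto a 10-point scale and only trusted once there are at least two votes; a minimal standalone sketch (fc2_score is an illustrative name):

def fc2_score(reviews: dict[int, int]) -> float | None:
    total_votes = sum(reviews.values())
    if total_votes < 2:
        return None
    return sum(stars * votes for stars, votes in reviews.items()) / total_votes * 2

# fc2_score({5: 3, 4: 1}) -> 9.5
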
diff --git a/javsp/web/fc2fan.py b/javsp/web/fc2fan.py
deleted file mode 100644
index 229b3e3df..000000000
--- a/javsp/web/fc2fan.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""解析fc2fan本地镜像的数据"""
-# FC2官网的影片下架就无法再抓取数据,如果用户有fc2fan的镜像,那可以尝试从镜像中解析影片数据
-import os
-import re
-import logging
-import lxml.html
-import requests
-
-
-from javsp.web.base import resp2html
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_path = str(Cfg().crawler.fc2fan_local_path)
-use_local_mirror = os.path.exists(base_path)
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- if use_local_mirror:
- html_file = f'{base_path}/{movie.dvdid}.html'
- if not os.path.exists(html_file):
- raise MovieNotFoundError(__name__, movie.dvdid, html_file)
- html = lxml.html.parse(html_file)
- else:
- url = f"https://fc2club.top/html/{movie.dvdid}.html"
- r = requests.get(url)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- elif r.text == '':
- raise WebsiteError(f'fc2fan: 站点不可用 (HTTP {r.status_code}): {url}')
- html = resp2html(r)
- try:
- container = html.xpath("//div[@class='col-sm-8']")[0]
- except IndexError:
- raise WebsiteError(f'fc2fan: 站点不可用')
- title = container.xpath("h3/text()")[0]
- score_str = container.xpath("h5/strong[text()='影片评分']")[0].tail.strip()
- match = re.search(r'\d+', score_str)
- if match:
- score = int(match.group()) / 10 # fc2fan站长是按100分来打分的
- movie.score = f'{score:.1f}'
- resource_info = container.xpath("h5/strong[text()='资源参数']")[0].tail
- if '无码' in resource_info:
- movie.uncensored = True
- elif '有码' in resource_info:
- movie.uncensored = False
- # FC2没有制作商和发行商的区分,作为个人市场,卖家更接近于制作商
- producer = container.xpath("h5/strong[text()='卖家信息']")[0].getnext().text
- if producer:
- movie.producer = producer.strip()
- genre = container.xpath("h5/strong[text()='影片标签']/../a/text()")
- actress = container.xpath("h5/strong[text()='女优名字']/../a/text()")
- preview_pics = container.xpath("//ul[@class='slides']/li/img/@src")
- if use_local_mirror:
- preview_pics = [os.path.normpath(os.path.join(base_path, i)) for i in preview_pics]
- # big_preview = container.xpath("//img[@id='thumbpic']/../@href")[0] # 影片真实截图,目前暂时用不到
-
- movie.title = title
- movie.genre = genre
- movie.actress = actress
- if preview_pics:
- movie.preview_pics = preview_pics
- movie.cover = preview_pics[0]
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('FC2-1879420')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/fc2ppvdb.py b/javsp/web/fc2ppvdb.py
deleted file mode 100644
index b0ad60892..000000000
--- a/javsp/web/fc2ppvdb.py
+++ /dev/null
@@ -1,76 +0,0 @@
-"""从FC2PPVDB抓取数据"""
-import logging
-from typing import List
-
-
-from javsp.web.base import get_html
-from javsp.web.exceptions import *
-from javsp.lib import strftime_to_minutes
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://fc2ppvdb.com'
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 去除番号中的'FC2'字样
- id_uc = movie.dvdid.upper()
- if not id_uc.startswith('FC2-'):
- raise ValueError('Invalid FC2 number: ' + movie.dvdid)
- fc2_id = id_uc.replace('FC2-', '')
- # 抓取网页
- url = f'{base_url}/articles/{fc2_id}'
- html = get_html(url)
- container = html.xpath("//div[@class='container lg:px-5 px-2 py-12 mx-auto']/div[1]")
- if len(container) > 0:
- container = container[0]
- else:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- title = container.xpath("//h2/a/text()")
- thumb_pic = container.xpath(f"//img[@alt='{fc2_id}']/@src")
- duration_str = container.xpath("//div[starts-with(text(),'収録時間:')]/span/text()")
- actress = container.xpath("//div[starts-with(text(),'女優:')]/span/a/text()")
- genre = container.xpath("//div[starts-with(text(),'タグ:')]/span/a/text()")
- publish_date = container.xpath("//div[starts-with(text(),'販売日:')]/span/text()")
- publisher = container.xpath("//div[starts-with(text(),'販売者:')]/span/a/text()")
- uncensored_str = container.xpath("//div[starts-with(text(),'モザイク:')]/span/text()")
- uncensored_str_f = get_list_first(uncensored_str);
- uncensored = True if uncensored_str_f == '無' else False if uncensored_str_f == '有' else None
- preview_pics = None
- preview_video = container.xpath("//a[starts-with(text(),'サンプル動画')]/@href")
-
- movie.dvdid = id_uc
- movie.url = url
- movie.title = get_list_first(title)
- movie.genre = genre
- movie.actress = actress
- movie.duration = str(strftime_to_minutes(get_list_first(duration_str)))
- movie.publish_date = get_list_first(publish_date)
- movie.publisher = get_list_first(publisher)
- movie.uncensored = uncensored
- movie.preview_pics = preview_pics
- movie.preview_video = get_list_first(preview_video)
-
- # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
- if movie.preview_pics:
- movie.cover = preview_pics[0]
- else:
- movie.cover = get_list_first(thumb_pic)
-
-def get_list_first(list:List):
- return list[0] if list and len(list) > 0 else None
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('FC2-4497837')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/gyutto.py b/javsp/web/gyutto.py
deleted file mode 100644
index db7d6c795..000000000
--- a/javsp/web/gyutto.py
+++ /dev/null
@@ -1,87 +0,0 @@
-"""从https://gyutto.com/官网抓取数据"""
-import logging
-import time
-
-from javsp.web.base import resp2html, request_get
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-logger = logging.getLogger(__name__)
-
-# https://dl.gyutto.com/i/item266923
-base_url = 'http://gyutto.com'
-base_encode = 'euc-jp'
-
-def get_movie_title(html):
- container = html.xpath("//h1")
- if len(container) > 0:
- container = container[0]
- title = container.text
-
- return title
-
-def get_movie_img(html, index = 1):
- images = []
- container = html.xpath("//a[@class='highslide']/img")
- if len(container) > 0:
- if index == 0:
- return container[0].get('src')
-
- for row in container:
- images.append(row.get('src'))
-
- return images
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 去除番号中的'gyutto'字样
- id_uc = movie.dvdid.upper()
- if not id_uc.startswith('GYUTTO-'):
- raise ValueError('Invalid gyutto number: ' + movie.dvdid)
- gyutto_id = id_uc.replace('GYUTTO-', '')
- # 抓取网页
- url = f'{base_url}/i/item{gyutto_id}?select_uaflag=1'
- r = request_get(url, delay_raise=True)
- if r.status_code == 404:
- raise MovieNotFoundError(__name__, movie.dvdid)
- html = resp2html(r, base_encode)
- container = html.xpath("//dl[@class='BasicInfo clearfix']")
-
- for row in container:
- key = row.xpath(".//dt/text()")
- if key[0] == "サークル":
- producer = ''.join(row.xpath(".//dd/a/text()"))
- elif key[0] == "ジャンル":
- genre = row.xpath(".//dd/a/text()")
- elif key[0] == "配信開始日":
- date = row.xpath(".//dd/text()")
- date_str = ''.join(date)
- date_time = time.strptime(date_str, "%Y年%m月%d日")
- publish_date = time.strftime("%Y-%m-%d", date_time)
-
- plot = html.xpath("//div[@class='unit_DetailLead']/p/text()")[0]
-
- movie.title = get_movie_title(html)
- movie.cover = get_movie_img(html, 0)
- movie.preview_pics = get_movie_img(html)
- movie.dvdid = id_uc
- movie.url = url
- movie.producer = producer
- # movie.actress = actress
- # movie.duration = duration
- movie.publish_date = publish_date
- movie.genre = genre
- movie.plot = plot
-
-if __name__ == "__main__":
- import pretty_errors
-
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
- movie = MovieInfo('gyutto-266923')
-
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
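
The release-date handling above converts the Japanese 'YYYY年MM月DD日' form to ISO dates via time.strptime/strftime; a minimal sketch (helper name illustrative):

import time

def jp_date_to_iso(date_str: str) -> str:
    return time.strftime('%Y-%m-%d', time.strptime(date_str, '%Y年%m月%d日'))

# jp_date_to_iso('2021年3月5日') -> '2021-03-05'
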
diff --git a/javsp/web/jav321.py b/javsp/web/jav321.py
deleted file mode 100644
index 4e42617a5..000000000
--- a/javsp/web/jav321.py
+++ /dev/null
@@ -1,100 +0,0 @@
-"""从jav321抓取数据"""
-import re
-import logging
-
-
-from javsp.web.base import post_html
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.jav321.com'
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- html = post_html(f'{base_url}/search', data={'sn': movie.dvdid})
- page_url = html.xpath("//ul[@class='dropdown-menu']/li/a/@href")[0]
- #TODO: 注意cid是dmm的概念。如果影片来自MGSTAGE,这里的cid很可能是jav321自己添加的,例如 345SIMM-542
- cid = page_url.split('/')[-1] # /video/ipx00177
- # 如果从URL匹配到的cid是'search',说明还停留在搜索页面,找不到这部影片
- if cid == 'search':
- raise MovieNotFoundError(__name__, movie.dvdid)
- title = html.xpath("//div[@class='panel-heading']/h3/text()")[0]
- info = html.xpath("//div[@class='col-md-9']")[0]
- # jav321的不同信息字段间没有明显分隔,只能通过url来匹配目标标签
- company_tags = info.xpath("a[contains(@href,'/company/')]/text()")
- if company_tags:
- movie.producer = company_tags[0]
- # actress, actress_pics
- # jav321现在连女优信息都没有了,首页通过女优栏跳转过去也全是空白
- actress, actress_pics = [], {}
- actress_tags = html.xpath("//div[@class='thumbnail']/a[contains(@href,'/star/')]/img")
- for tag in actress_tags:
- name = tag.tail.strip()
- pic_url = tag.get('src')
- actress.append(name)
- # jav321的女优头像完全是应付了事:即使女优实际没有头像,也会有一个看起来像模像样的url,
- # 因而无法通过url判断女优头像图片是否有效。有其他选择时最好不要使用jav321的女优头像数据
- actress_pics[name] = pic_url
- # genre, genre_id
- genre_tags = info.xpath("a[contains(@href,'/genre/')]")
- genre, genre_id = [], []
- for tag in genre_tags:
- genre.append(tag.text)
- genre_id.append(tag.get('href').split('/')[-2]) # genre/4025/1
- dvdid = info.xpath("b[text()='品番']")[0].tail.replace(': ', '').upper()
- publish_date = info.xpath("b[text()='配信開始日']")[0].tail.replace(': ', '')
- duration_str = info.xpath("b[text()='収録時間']")[0].tail
- match = re.search(r'\d+', duration_str)
- if match:
- movie.duration = match.group(0)
- # 仅部分影片有评分且评分只能粗略到星级而没有分数,要通过星级的图片来判断,如'/img/35.gif'表示3.5星
- score_tag = info.xpath("//b[text()='平均評価']/following-sibling::img/@data-original")
- if score_tag:
- score = int(score_tag[0][5:7])/5 # /10*2
- movie.score = str(score)
- serial_tag = info.xpath("a[contains(@href,'/series/')]/text()")
- if serial_tag:
- movie.serial = serial_tag[0]
- preview_video_tag = info.xpath("//video/source/@src")
- if preview_video_tag:
- movie.preview_video = preview_video_tag[0]
- plot_tag = info.xpath("//div[@class='panel-body']/div[@class='row']/div[@class='col-md-12']/text()")
- if plot_tag:
- movie.plot = plot_tag[0]
- preview_pics = html.xpath("//div[@class='col-xs-12 col-md-12']/p/a/img[@class='img-responsive']/@src")
- if len(preview_pics) == 0:
- # 尝试搜索另一种布局下的封面,需要使用onerror过滤掉明明没有封面时网站往里面塞的默认URL
- preview_pics = html.xpath("//div/div/div[@class='col-md-3']/img[@onerror and @class='img-responsive']/@src")
- # 有的图片链接里有多个//,网站质量堪忧……
- preview_pics = [i[:8] + i[8:].replace('//', '/') for i in preview_pics]
- # 磁力和ed2k链接是依赖js脚本加载的,无法通过静态网页来解析
-
- movie.url = page_url
- movie.cid = cid
- movie.dvdid = dvdid
- movie.title = title
- movie.actress = actress
- movie.actress_pics = actress_pics
- movie.genre = genre
- movie.genre_id = genre_id
- movie.publish_date = publish_date
- # preview_pics的第一张图始终是封面,剩下的才是预览图
- if len(preview_pics) > 0:
- movie.cover = preview_pics[0]
- movie.preview_pics = preview_pics[1:]
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('SCUTE-1177')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/javbus.py b/javsp/web/javbus.py
deleted file mode 100644
index a98cd9974..000000000
--- a/javsp/web/javbus.py
+++ /dev/null
@@ -1,115 +0,0 @@
-"""从JavBus抓取数据"""
-import logging
-
-
-from javsp.web.base import *
-from javsp.web.exceptions import *
-from javsp.func import *
-from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo, GenreMap
-
-
-logger = logging.getLogger(__name__)
-genre_map = GenreMap('data/genre_javbus.csv')
-permanent_url = 'https://www.javbus.com'
-if Cfg().network.proxy_server is not None:
- base_url = permanent_url
-else:
- base_url = str(Cfg().network.proxy_free[CrawlerID.javbus])
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- url = f'{base_url}/{movie.dvdid}'
- resp = request_get(url, delay_raise=True)
- # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息
- if resp.history and resp.history[0].status_code == 302:
- html = resp2html(resp.history[0])
- else:
- html = resp2html(resp)
- # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404
- page_title = html.xpath('/html/head/title/text()')
- if page_title and page_title[0].startswith('404 Page Not Found!'):
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- container = html.xpath("//div[@class='container']")[0]
- title = container.xpath("h3/text()")[0]
- cover = container.xpath("//a[@class='bigImage']/img/@src")[0]
- preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href")
- info = container.xpath("//div[@class='col-md-3 info']")[0]
- dvdid = info.xpath("p/span[text()='識別碼:']")[0].getnext().text
- publish_date = info.xpath("p/span[text()='發行日期:']")[0].tail.strip()
- duration = info.xpath("p/span[text()='長度:']")[0].tail.replace('分鐘', '').strip()
- director_tag = info.xpath("p/span[text()='導演:']")
- if director_tag: # xpath没有匹配时将得到空列表
- movie.director = director_tag[0].getnext().text.strip()
- producer_tag = info.xpath("p/span[text()='製作商:']")
- if producer_tag:
- text = producer_tag[0].getnext().text
- if text:
- movie.producer = text.strip()
- publisher_tag = info.xpath("p/span[text()='發行商:']")
- if publisher_tag:
- movie.publisher = publisher_tag[0].getnext().text.strip()
- serial_tag = info.xpath("p/span[text()='系列:']")
- if serial_tag:
- movie.serial = serial_tag[0].getnext().text
- # genre, genre_id
- genre_tags = info.xpath("//span[@class='genre']/label/a")
- genre, genre_id = [], []
- for tag in genre_tags:
- tag_url = tag.get('href')
- pre_id = tag_url.split('/')[-1]
- genre.append(tag.text)
- if 'uncensored' in tag_url:
- movie.uncensored = True
- genre_id.append('uncensored-' + pre_id)
- else:
- movie.uncensored = False
- genre_id.append(pre_id)
- # JavBus的磁力链接是依赖js脚本加载的,无法通过静态网页来解析
- # actress, actress_pics
- actress, actress_pics = [], {}
- actress_tags = html.xpath("//a[@class='avatar-box']/div/img")
- for tag in actress_tags:
- name = tag.get('title')
- pic_url = tag.get('src')
- actress.append(name)
- if not pic_url.endswith('nowprinting.gif'): # 略过默认的头像
- actress_pics[name] = pic_url
- # 整理数据并更新movie的相应属性
- movie.url = f'{permanent_url}/{movie.dvdid}'
- movie.dvdid = dvdid
- movie.title = title.replace(dvdid, '').strip()
- movie.cover = cover
- movie.preview_pics = preview_pics
- if publish_date != '0000-00-00': # 丢弃无效的发布日期
- movie.publish_date = publish_date
- movie.duration = duration if int(duration) else None
- movie.genre = genre
- movie.genre_id = genre_id
- movie.actress = actress
- movie.actress_pics = actress_pics
-
-
-def parse_clean_data(movie: MovieInfo):
- """解析指定番号的影片数据并进行清洗"""
- parse_data(movie)
- movie.genre_norm = genre_map.map(movie.genre_id)
- movie.genre_id = None # 没有别的地方需要再用到,清空genre id(暗示已经完成转换)
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('NANP-030')
- try:
- parse_clean_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/javdb.py b/javsp/web/javdb.py
deleted file mode 100644
index 5120aae76..000000000
--- a/javsp/web/javdb.py
+++ /dev/null
@@ -1,333 +0,0 @@
-"""从JavDB抓取数据"""
-import os
-import re
-import logging
-
-from javsp.web.base import Request, resp2html
-from javsp.web.exceptions import *
-from javsp.func import *
-from javsp.avid import guess_av_type
-from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo, GenreMap
-from javsp.chromium import get_browsers_cookies
-
-
-# 初始化Request实例。使用scraper绕过CloudFlare后,需要指定网页语言,否则可能会返回其他语言网页,影响解析
-request = Request(use_scraper=True)
-request.headers['Accept-Language'] = 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en-US;q=0.7,en;q=0.6,ja;q=0.5'
-
-logger = logging.getLogger(__name__)
-genre_map = GenreMap('data/genre_javdb.csv')
-permanent_url = 'https://javdb.com'
-if Cfg().network.proxy_server is not None:
- base_url = permanent_url
-else:
- base_url = str(Cfg().network.proxy_free[CrawlerID.javdb])
-
-
-def get_html_wrapper(url):
- """包装外发的request请求并负责转换为可xpath的html,同时处理Cookies无效等问题"""
- global request, cookies_pool
- r = request.get(url, delay_raise=True)
- if r.status_code == 200:
- # 发生重定向可能仅仅是域名重定向,因此还要检查url以判断是否被跳转到了登录页
- if r.history and '/login' in r.url:
- # 仅在需要时去读取Cookies
- if 'cookies_pool' not in globals():
- try:
- cookies_pool = get_browsers_cookies()
- except (PermissionError, OSError) as e:
- logger.warning(f"无法从浏览器Cookies文件获取JavDB的登录凭据({e}),可能是安全软件在保护浏览器Cookies文件", exc_info=True)
- cookies_pool = []
- except Exception as e:
- logger.warning(f"获取JavDB的登录凭据时出错({e}),你可能使用的是国内定制版等非官方Chrome系浏览器", exc_info=True)
- cookies_pool = []
- if len(cookies_pool) > 0:
- item = cookies_pool.pop()
- # 更换Cookies时需要创建新的request实例,否则cloudscraper会保留它内部第一次发起网络访问时获得的Cookies
- request = Request(use_scraper=True)
- request.cookies = item['cookies']
- cookies_source = (item['profile'], item['site'])
- logger.debug(f'未携带有效Cookies而发生重定向,尝试更换Cookies为: {cookies_source}')
- return get_html_wrapper(url)
- else:
- raise CredentialError('JavDB: 所有浏览器Cookies均已过期')
- elif r.history and 'pay' in r.url.split('/')[-1]:
- raise SitePermissionError(f"JavDB: 此资源被限制为仅VIP可见: '{r.history[0].url}'")
- else:
- html = resp2html(r)
- return html
- elif r.status_code in (403, 503):
- html = resp2html(r)
- code_tag = html.xpath("//span[@class='code-label']/span")
- error_code = code_tag[0].text if code_tag else None
- if error_code:
- if error_code == '1020':
- block_msg = f'JavDB: {r.status_code} 禁止访问: 站点屏蔽了来自日本地区的IP地址,请使用其他地区的代理服务器'
- else:
- block_msg = f'JavDB: {r.status_code} 禁止访问: {url} (Error code: {error_code})'
- else:
- block_msg = f'JavDB: {r.status_code} 禁止访问: {url}'
- raise SiteBlocked(block_msg)
- else:
- raise WebsiteError(f'JavDB: {r.status_code} 非预期状态码: {url}')
-
-
-def get_user_info(site, cookies):
- """获取cookies对应的JavDB用户信息"""
- try:
- request.cookies = cookies
- html = request.get_html(f'https://{site}/users/profile')
- except Exception as e:
- logger.info('JavDB: 获取用户信息时出错')
- logger.debug(e, exc_info=1)
- return
- # 扫描浏览器得到的Cookies对应的临时域名可能会过期,因此需要先判断域名是否仍然指向JavDB的站点
- if 'JavDB' in html.text:
- email = html.xpath("//div[@class='user-profile']/ul/li[1]/span/following-sibling::text()")[0].strip()
- username = html.xpath("//div[@class='user-profile']/ul/li[2]/span/following-sibling::text()")[0].strip()
- return email, username
- else:
- logger.debug('JavDB: 域名已过期: ' + site)
-
-
-def get_valid_cookies():
- """扫描浏览器,获取一个可用的Cookies"""
- # 经测试,Cookies所发往的域名不需要和登录时的域名保持一致,只要Cookies有效即可在多个域名间使用
- for d in cookies_pool:
- info = get_user_info(d['site'], d['cookies'])
- if info:
- return d['cookies']
- else:
- logger.debug(f"{d['profile']}, {d['site']}: Cookies无效")
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- # JavDB搜索番号时会有多个搜索结果,从中查找匹配番号的那个
- html = get_html_wrapper(f'{base_url}/search?q={movie.dvdid}')
- ids = list(map(str.lower, html.xpath("//div[@class='video-title']/strong/text()")))
- movie_urls = html.xpath("//a[@class='box']/@href")
- match_count = len([i for i in ids if i == movie.dvdid.lower()])
- if match_count == 0:
- raise MovieNotFoundError(__name__, movie.dvdid, ids)
- elif match_count == 1:
- index = ids.index(movie.dvdid.lower())
- new_url = movie_urls[index]
- try:
- html2 = get_html_wrapper(new_url)
- except (SitePermissionError, CredentialError):
- # 不开VIP不让看,过分。决定榨出能获得的信息,毕竟有时候只有这里能找到标题和封面
- box = html.xpath("//a[@class='box']")[index]
- movie.url = new_url
- movie.title = box.get('title')
- movie.cover = box.xpath("div/img/@src")[0]
- score_str = box.xpath("div[@class='score']/span/span")[0].tail
- score = re.search(r'([\d.]+)分', score_str).group(1)
- movie.score = "{:.2f}".format(float(score)*2)
- movie.publish_date = box.xpath("div[@class='meta']/text()")[0].strip()
- return
- else:
- raise MovieDuplicateError(__name__, movie.dvdid, match_count)
-
- container = html2.xpath("/html/body/section/div/div[@class='video-detail']")[0]
- info = container.xpath("//nav[@class='panel movie-panel-info']")[0]
- title = container.xpath("h2/strong[@class='current-title']/text()")[0]
- show_orig_title = container.xpath("//a[contains(@class, 'meta-link') and not(contains(@style, 'display: none'))]")
- if show_orig_title:
- movie.ori_title = container.xpath("h2/span[@class='origin-title']/text()")[0]
- cover = container.xpath("//img[@class='video-cover']/@src")[0]
- preview_pics = container.xpath("//a[@class='tile-item'][@data-fancybox='gallery']/@href")
- preview_video_tag = container.xpath("//video[@id='preview-video']/source/@src")
- if preview_video_tag:
- preview_video = preview_video_tag[0]
- if preview_video.startswith('//'):
- preview_video = 'https:' + preview_video
- movie.preview_video = preview_video
- dvdid = info.xpath("div/span")[0].text_content()
- publish_date = info.xpath("div/strong[text()='日期:']")[0].getnext().text
- duration = info.xpath("div/strong[text()='時長:']")[0].getnext().text.replace('分鍾', '').strip()
- director_tag = info.xpath("div/strong[text()='導演:']")
- if director_tag:
- movie.director = director_tag[0].getnext().text_content().strip()
- av_type = guess_av_type(movie.dvdid)
- if av_type != 'fc2':
- producer_tag = info.xpath("div/strong[text()='片商:']")
- else:
- producer_tag = info.xpath("div/strong[text()='賣家:']")
- if producer_tag:
- movie.producer = producer_tag[0].getnext().text_content().strip()
- publisher_tag = info.xpath("div/strong[text()='發行:']")
- if publisher_tag:
- movie.publisher = publisher_tag[0].getnext().text_content().strip()
- serial_tag = info.xpath("div/strong[text()='系列:']")
- if serial_tag:
- movie.serial = serial_tag[0].getnext().text_content().strip()
- score_tag = info.xpath("//span[@class='score-stars']")
- if score_tag:
- score_str = score_tag[0].tail
- score = re.search(r'([\d.]+)分', score_str).group(1)
- movie.score = "{:.2f}".format(float(score)*2)
- genre_tags = info.xpath("//strong[text()='類別:']/../span/a")
- genre, genre_id = [], []
- for tag in genre_tags:
- pre_id = tag.get('href').split('/')[-1]
- genre.append(tag.text)
- genre_id.append(pre_id)
- # 判定影片有码/无码
- subsite = pre_id.split('?')[0]
- movie.uncensored = {'uncensored': True, 'tags':False}.get(subsite)
- # JavDB目前同时提供男女优信息,根据用来标识性别的符号筛选出女优
- actors_tag = info.xpath("//strong[text()='演員:']/../span")[0]
- all_actors = actors_tag.xpath("a/text()")
- genders = actors_tag.xpath("strong/text()")
- actress = [i for i in all_actors if genders[all_actors.index(i)] == '♀']
- magnet = container.xpath("//div[@class='magnet-name column is-four-fifths']/a/@href")
-
- movie.dvdid = dvdid
- movie.url = new_url.replace(base_url, permanent_url)
- movie.title = title.replace(dvdid, '').strip()
- movie.cover = cover
- movie.preview_pics = preview_pics
- movie.publish_date = publish_date
- movie.duration = duration
- movie.genre = genre
- movie.genre_id = genre_id
- movie.actress = actress
- movie.magnet = [i.replace('[javdb.com]','') for i in magnet]
-
-
-def parse_clean_data(movie: MovieInfo):
- """解析指定番号的影片数据并进行清洗"""
- try:
- parse_data(movie)
- # 检查封面URL是否真的存在对应图片
- if movie.cover is not None:
- r = request.head(movie.cover)
- if r.status_code != 200:
- movie.cover = None
- except SiteBlocked:
- logger.error('JavDB: 可能触发了反爬虫机制,请稍后再试')
- raise
- if movie.genre_id and (not movie.genre_id[0].startswith('fc2?')):
- movie.genre_norm = genre_map.map(movie.genre_id)
- movie.genre_id = None # 没有别的地方需要再用到,清空genre id(表明已经完成转换)
-
-
-def collect_actress_alias(type=0, use_original=True):
- """
- 收集女优的别名
- type: 0-有码, 1-无码, 2-欧美
- use_original: 是否使用原名而非译名,True-田中レモン,False-田中檸檬
- """
- import json
- import time
- import random
-
- actressAliasMap = {}
-
- actressAliasFilePath = "data/actress_alias.json"
- # 检查文件是否存在
- if not os.path.exists(actressAliasFilePath):
- # 如果文件不存在,创建文件并写入空字典
- with open(actressAliasFilePath, "w", encoding="utf-8") as file:
- json.dump({}, file)
-
- typeList = ["censored", "uncensored", "western"]
- page_url = f"{base_url}/actors/{typeList[type]}"
- while True:
- try:
- html = get_html_wrapper(page_url)
- actors = html.xpath("//div[@class='box actor-box']/a")
-
- count = 0
- for actor in actors:
- count += 1
- actor_name = actor.xpath("strong/text()")[0].strip()
- actor_url = actor.xpath("@href")[0]
- # actor_url = f"https://javdb.com{actor_url}" # 构造演员主页的完整URL
-
- # 进入演员主页,获取更多信息
- actor_html = get_html_wrapper(actor_url)
- # 解析演员所有名字信息
- names_span = actor_html.xpath("//span[@class='actor-section-name']")[0]
- aliases_span_list = actor_html.xpath("//span[@class='section-meta']")
- aliases_span = aliases_span_list[0]
-
- names_list = [name.strip() for name in names_span.text.split(",")]
- if len(aliases_span_list) > 1:
- aliases_list = [
- alias.strip() for alias in aliases_span.text.split(",")
- ]
- else:
- aliases_list = []
-
- # 将信息添加到actressAliasMap中
- actressAliasMap[names_list[-1 if use_original else 0]] = (
- names_list + aliases_list
- )
- print(
- f"{count} --- {names_list[-1 if use_original else 0]}: {names_list + aliases_list}"
- )
-
- if count == 10:
- # 将数据写回文件
- with open(actressAliasFilePath, "r", encoding="utf-8") as file:
- existing_data = json.load(file)
-
- # 合并现有数据和新爬取的数据
- existing_data.update(actressAliasMap)
-
- # 将合并后的数据写回文件
- with open(actressAliasFilePath, "w", encoding="utf-8") as file:
- json.dump(existing_data, file, ensure_ascii=False, indent=2)
-
- actressAliasMap = {} # 重置actressAliasMap
-
- print(
- f"已爬取 {count} 个女优,数据已更新并写回文件:",
- actressAliasFilePath,
- )
-
- # 重置计数器
- count = 0
-
- time.sleep(max(1, 10 * random.random())) # 随机等待 1-10 秒
-
- # 判断是否有下一页按钮
- next_page_link = html.xpath(
- "//a[@rel='next' and @class='pagination-next']/@href"
- )
- if not next_page_link:
- break # 没有下一页,结束循环
- else:
- next_page_url = f"{next_page_link[0]}"
- page_url = next_page_url
-
- except SiteBlocked:
- raise
-
- with open(actressAliasFilePath, "r", encoding="utf-8") as file:
- existing_data = json.load(file)
-
- # 合并现有数据和新爬取的数据
- existing_data.update(actressAliasMap)
-
- # 将合并后的数据写回文件
- with open(actressAliasFilePath, "w", encoding="utf-8") as file:
- json.dump(existing_data, file, ensure_ascii=False, indent=2)
-
- print(f"已爬取 {count} 个女优,数据已更新并写回文件:", actressAliasFilePath)
-
-
-if __name__ == "__main__":
- # collect_actress_alias()
- movie = MovieInfo('FC2-2735981')
- try:
- parse_clean_data(movie)
- print(movie)
- except CrawlerError as e:
- print(repr(e))
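
The crawler deleted above captures the trickiest part of scraping JavDB: a search can return several hits, so the code lower-cases the listed IDs and only follows the result whose ID matches the requested one exactly (zero matches raise MovieNotFoundError, more than one raises MovieDuplicateError). The async replacement under javsp/crawlers/ is not shown in this hunk, so the following is only a sketch of that same search-and-match step written against the newly added httpx dependency; the hard-coded base URL and client options are assumptions rather than the project's actual code.

```python
# Hypothetical sketch only; the real async JavDB crawler introduced by this PR may differ.
import asyncio
from urllib.parse import urljoin

import httpx
from lxml import html as lxml_html

BASE_URL = 'https://javdb.com'  # assumed default; the real crawler derives its base URL from config


async def find_exact_match(dvdid: str) -> str | None:
    """Return the detail-page URL whose listed ID matches `dvdid` exactly, if any."""
    async with httpx.AsyncClient(follow_redirects=True, timeout=10.0) as client:
        resp = await client.get(f'{BASE_URL}/search', params={'q': dvdid})
        resp.raise_for_status()
    doc = lxml_html.fromstring(resp.text)
    ids = [i.lower() for i in doc.xpath("//div[@class='video-title']/strong/text()")]
    urls = doc.xpath("//a[@class='box']/@href")
    # Same rule as the removed parse_data(): accept only an exact, case-insensitive ID match
    matches = [url for found, url in zip(ids, urls) if found == dvdid.lower()]
    if len(matches) == 1:
        return urljoin(BASE_URL, matches[0])
    return None  # zero matches: not found; several: ambiguous, left to the caller


if __name__ == '__main__':
    print(asyncio.run(find_exact_match('IPX-177')))
```
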
diff --git a/javsp/web/javlib.py b/javsp/web/javlib.py
deleted file mode 100644
index 85f77b75f..000000000
--- a/javsp/web/javlib.py
+++ /dev/null
@@ -1,141 +0,0 @@
-"""从JavLibrary抓取数据"""
-import logging
-from urllib.parse import urlsplit
-
-
-from javsp.web.base import Request, read_proxy, resp2html
-from javsp.web.exceptions import *
-from javsp.web.proxyfree import get_proxy_free_url
-from javsp.config import Cfg, CrawlerID
-from javsp.datatype import MovieInfo
-
-
-# 初始化Request实例
-request = Request(use_scraper=True)
-
-logger = logging.getLogger(__name__)
-permanent_url = 'https://www.javlibrary.com'
-base_url = ''
-
-
-def init_network_cfg():
- """设置合适的代理模式和base_url"""
- request.timeout = 5
- proxy_free_url = get_proxy_free_url('javlib')
- urls = [str(Cfg().network.proxy_free[CrawlerID.javlib]), permanent_url]
- if proxy_free_url and proxy_free_url not in urls:
- urls.insert(1, proxy_free_url)
- # 使用代理容易触发IUAM保护,先尝试不使用代理访问
- proxy_cfgs = [{}, read_proxy()] if Cfg().network.proxy_server else [{}]
- for proxies in proxy_cfgs:
- request.proxies = proxies
- for url in urls:
- if proxies == {} and url == permanent_url:
- continue
- try:
- resp = request.get(url, delay_raise=True)
- if resp.status_code == 200:
- request.timeout = Cfg().network.timeout.seconds
- return url
- except Exception as e:
- logger.debug(f"Fail to connect to '{url}': {e}")
- logger.warning('无法绕开JavLib的反爬机制')
- request.timeout = Cfg().network.timeout.seconds
- return permanent_url
-
-
-# TODO: 发现JavLibrary支持使用cid搜索,会直接跳转到对应的影片页面,也许可以利用这个功能来做cid到dvdid的转换
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- global base_url
- if not base_url:
- base_url = init_network_cfg()
- logger.debug(f"JavLib网络配置: {base_url}, proxy={request.proxies}")
- url = new_url = f'{base_url}/cn/vl_searchbyid.php?keyword={movie.dvdid}'
- resp = request.get(url)
- html = resp2html(resp)
- if resp.history:
- if urlsplit(resp.url).netloc == urlsplit(base_url).netloc:
- # 出现301重定向且新老地址netloc相同时,通常说明搜索到了影片且只有一个结果
- new_url = resp.url
- else:
- # 重定向到了不同的netloc时,新地址并不是影片地址。这种情况下新地址中丢失了path字段,
- # 为无效地址(应该是JavLib重定向配置有问题),需要使用新的base_url抓取数据
- base_url = 'https://' + urlsplit(resp.url).netloc
- logger.warning(f"请将配置文件中的JavLib免代理地址更新为: {base_url}")
- return parse_data(movie)
- else: # 如果有多个搜索结果则不会自动跳转,此时需要程序介入选择搜索结果
- video_tags = html.xpath("//div[@class='video'][@id]/a")
- # 通常第一部影片就是我们要找的,但是以免万一还是遍历所有搜索结果
- pre_choose = []
- for tag in video_tags:
- tag_dvdid = tag.xpath("div[@class='id']/text()")[0]
- if tag_dvdid.upper() == movie.dvdid.upper():
- pre_choose.append(tag)
- pre_choose_urls = [i.get('href') for i in pre_choose]
- match_count = len(pre_choose)
- if match_count == 0:
- raise MovieNotFoundError(__name__, movie.dvdid)
- elif match_count == 1:
- new_url = pre_choose_urls[0]
- elif match_count == 2:
- no_blueray = []
- for tag in pre_choose:
- if 'ブルーレイディスク' not in tag.get('title'): # Blu-ray Disc
- no_blueray.append(tag)
- no_blueray_count = len(no_blueray)
- if no_blueray_count == 1:
- new_url = no_blueray[0].get('href')
- logger.debug(f"'{movie.dvdid}': 存在{match_count}个同番号搜索结果,已自动选择封面比例正确的一个: {new_url}")
- else:
- # 两个结果中没有谁是蓝光影片,说明影片番号重复了
- raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
- else:
- # 存在不同影片但是番号相同的情况,如MIDV-010
- raise MovieDuplicateError(__name__, movie.dvdid, match_count, pre_choose_urls)
- # 重新抓取网页
- html = request.get_html(new_url)
- container = html.xpath("/html/body/div/div[@id='rightcolumn']")[0]
- title_tag = container.xpath("div/h3/a/text()")
- title = title_tag[0]
- cover = container.xpath("//img[@id='video_jacket_img']/@src")[0]
- info = container.xpath("//div[@id='video_info']")[0]
- dvdid = info.xpath("div[@id='video_id']//td[@class='text']/text()")[0]
- publish_date = info.xpath("div[@id='video_date']//td[@class='text']/text()")[0]
- duration = info.xpath("div[@id='video_length']//span[@class='text']/text()")[0]
- director_tag = info.xpath("//span[@class='director']/a/text()")
- if director_tag:
- movie.director = director_tag[0]
- producer = info.xpath("//span[@class='maker']/a/text()")[0]
- publisher_tag = info.xpath("//span[@class='label']/a/text()")
- if publisher_tag:
- movie.publisher = publisher_tag[0]
- score_tag = info.xpath("//span[@class='score']/text()")
- if score_tag:
- movie.score = score_tag[0].strip('()')
- genre = info.xpath("//span[@class='genre']/a/text()")
- actress = info.xpath("//span[@class='star']/a/text()")
-
- movie.dvdid = dvdid
- movie.url = new_url.replace(base_url, permanent_url)
- movie.title = title.replace(dvdid, '').strip()
- if cover.startswith('//'): # 补全URL中缺少的协议段
- cover = 'https:' + cover
- movie.cover = cover
- movie.publish_date = publish_date
- movie.duration = duration
- movie.producer = producer
- movie.genre = genre
- movie.actress = actress
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- base_url = permanent_url
- movie = MovieInfo('IPX-177')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- print(e)
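
A detail of the deleted init_network_cfg() worth preserving: requests routed through a proxy tend to trip JavLibrary's IUAM protection, so the old code probed the candidate addresses directly first and only then retried through the configured proxy. A rough async equivalent of that probing loop could look like the sketch below; the five-second probe timeout comes from the removed code, while the function name and client options are assumptions.

```python
# Illustrative sketch, not the project's actual network helper.
import asyncio

import httpx


async def pick_base_url(candidates: list[str], proxy: str | None = None) -> str | None:
    """Return the first candidate URL that answers with HTTP 200, or None."""
    # Direct connections first; the proxy (if any) is only used as a second pass,
    # mirroring the behaviour of the removed init_network_cfg().
    proxy_passes = [None, proxy] if proxy else [None]
    for proxy_url in proxy_passes:
        async with httpx.AsyncClient(proxy=proxy_url, timeout=5.0, follow_redirects=True) as client:
            for url in candidates:
                try:
                    resp = await client.get(url)
                    if resp.status_code == 200:
                        return url
                except httpx.HTTPError:
                    continue
    return None


if __name__ == '__main__':
    mirrors = ['https://www.y78k.com', 'https://www.javlibrary.com']
    print(asyncio.run(pick_base_url(mirrors)))
```
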
diff --git a/javsp/web/javmenu.py b/javsp/web/javmenu.py
deleted file mode 100644
index 5296a69cd..000000000
--- a/javsp/web/javmenu.py
+++ /dev/null
@@ -1,88 +0,0 @@
-"""从JavMenu抓取数据"""
-import logging
-
-from javsp.web.base import Request, resp2html
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-
-request = Request()
-
-logger = logging.getLogger(__name__)
-base_url = 'https://mrzyx.xyz'
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- # JavMenu网页做得很不走心,将就了
- url = f'{base_url}/{movie.dvdid}'
- r = request.get(url)
- if r.history:
- # 被重定向到主页说明找不到影片资源
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- html = resp2html(r)
- container = html.xpath("//div[@class='col-md-9 px-0']")[0]
- title = container.xpath("div[@class='col-12 mb-3']/h1/strong/text()")[0]
- # 竟然还在标题里插广告,真的疯了。要不是我已经写了抓取器,才懒得维护这个破站
- title = title.replace(' | JAV目錄大全 | 每日更新', '')
- title = title.replace(' 免費在線看', '').replace(' 免費AV在線看', '')
- cover_tag = container.xpath("//div[@class='single-video']")
- if len(cover_tag) > 0:
- video_tag = cover_tag[0].find('video')
- # URL首尾竟然也有空格……
- movie.cover = video_tag.get('data-poster').strip()
- # 预览影片改为blob了,无法获取
- # movie.preview_video = video_tag.find('source').get('src').strip()
- else:
- cover_img_tag = container.xpath("//img[@class='lazy rounded']/@data-src")
- if cover_img_tag:
- movie.cover = cover_img_tag[0].strip()
- info = container.xpath("//div[@class='card-body']")[0]
- publish_date = info.xpath("div/span[contains(text(), '日期:')]")[0].getnext().text
- duration = info.xpath("div/span[contains(text(), '時長:')]")[0].getnext().text.replace('分鐘', '')
- producer = info.xpath("div/span[contains(text(), '製作:')]/following-sibling::a/span/text()")
- if producer:
- movie.producer = producer[0]
- genre_tags = info.xpath("//a[@class='genre']")
- genre, genre_id = [], []
- for tag in genre_tags:
- items = tag.get('href').split('/')
- pre_id = items[-3] + '/' + items[-1]
- genre.append(tag.text.strip())
- genre_id.append(pre_id)
- # genre的链接中含有censored字段,但是无法用来判断影片是否有码,因为完全不可靠……
- actress = info.xpath("div/span[contains(text(), '女優:')]/following-sibling::*/a/text()") or None
- magnet_table = container.xpath("//table[contains(@class, 'magnet-table')]/tbody")
- if magnet_table:
- magnet_links = magnet_table[0].xpath("tr/td/a/@href")
- # 它的FC2数据是从JavDB抓的,JavDB更换图片服务器后它也跟上了,似乎数据更新频率还可以
- movie.magnet = [i.replace('[javdb.com]','') for i in magnet_links]
- preview_pics = container.xpath("//a[@data-fancybox='gallery']/@href")
-
- if (not movie.cover) and preview_pics:
- movie.cover = preview_pics[0]
- movie.url = url
- movie.title = title.replace(movie.dvdid, '').strip()
- movie.preview_pics = preview_pics
- movie.publish_date = publish_date
- movie.duration = duration
- movie.genre = genre
- movie.genre_id = genre_id
- movie.actress = actress
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('FC2-718323')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
diff --git a/javsp/web/mgstage.py b/javsp/web/mgstage.py
deleted file mode 100644
index 4904e51db..000000000
--- a/javsp/web/mgstage.py
+++ /dev/null
@@ -1,114 +0,0 @@
-"""从蚊香社-mgstage抓取数据"""
-import re
-import logging
-
-
-from javsp.web.base import Request, resp2html
-from javsp.web.exceptions import *
-from javsp.config import Cfg
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.mgstage.com'
-# 初始化Request实例(要求携带已通过R18认证的cookies,否则会被重定向到认证页面)
-request = Request()
-request.cookies = {'adc': '1'}
-
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- url = f'{base_url}/product/product_detail/{movie.dvdid}/'
- resp = request.get(url, delay_raise=True)
- if resp.status_code == 403:
- raise SiteBlocked('mgstage不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
- # url不存在时会被重定向至主页。history非空时说明发生了重定向
- elif resp.history:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- html = resp2html(resp)
- # mgstage的文本中含有大量的空白字符('\n \t'),需要使用strip去除
- title = html.xpath("//div[@class='common_detail_cover']/h1/text()")[0].strip()
- container = html.xpath("//div[@class='detail_left']")[0]
- cover = container.xpath("//a[@id='EnlargeImage']/@href")[0]
- # 有链接的女优和仅有文本的女优匹配方法不同,因此分别匹配以后合并列表
- actress_text = container.xpath("//th[text()='出演:']/following-sibling::td/text()")
- actress_link = container.xpath("//th[text()='出演:']/following-sibling::td/a/text()")
- actress = [i.strip() for i in actress_text + actress_link]
- actress = [i for i in actress if i] # 移除空字符串
- producer = container.xpath("//th[text()='メーカー:']/following-sibling::td/a/text()")[0].strip()
- duration_str = container.xpath("//th[text()='収録時間:']/following-sibling::td/text()")[0]
- match = re.search(r'\d+', duration_str)
- if match:
- movie.duration = match.group(0)
- dvdid = container.xpath("//th[text()='品番:']/following-sibling::td/text()")[0]
- date_str = container.xpath("//th[text()='配信開始日:']/following-sibling::td/text()")[0]
- publish_date = date_str.replace('/', '-')
- serial_tag = container.xpath("//th[text()='シリーズ:']/following-sibling::td/a/text()")
- if serial_tag:
- movie.serial = serial_tag[0].strip()
- # label: 大意是某个系列策划用同样的番号,例如ABS打头的番号label是'ABSOLUTELY PERFECT',暂时用不到
- # label = container.xpath("//th[text()='レーベル:']/following-sibling::td/text()")[0].strip()
- genre_tags = container.xpath("//th[text()='ジャンル:']/following-sibling::td/a")
- genre = [i.text.strip() for i in genre_tags]
- score_str = container.xpath("//td[@class='review']/span")[0].tail.strip()
- match = re.search(r'^[\.\d]+', score_str)
- if match:
- score = float(match.group()) * 2
- movie.score = f'{score:.2f}'
- # plot可能含有嵌套格式,为了保留plot中的换行关系,手动处理plot中的各个标签
- plots = []
- plot_p_tags = container.xpath("//dl[@id='introduction']/dd/p[not(@class='more')]")
- for p in plot_p_tags:
- children = p.getchildren()
- # 没有children时表明plot不含有格式,此时简单地提取文本就可以
- if not children:
- plots.append(p.text_content())
- continue
- for child in children:
- if child.tag == 'br' and plots[-1] != '\n':
- plots.append('\n')
- else:
- if child.text:
- plots.append(child.text)
- if child.tail:
- plots.append(child.tail)
- plot = ''.join(plots).strip()
- preview_pics = container.xpath("//a[@class='sample_image']/@href")
-
- if Cfg().crawler.hardworking:
- # 预览视频是点击按钮后再加载的,不在静态网页中
- btn_url = container.xpath("//a[@class='button_sample']/@href")[0]
- video_pid = btn_url.split('/')[-1]
- req_url = f'{base_url}/sampleplayer/sampleRespons.php?pid={video_pid}'
- resp = request.get(req_url).json()
- video_url = resp.get('url')
- if video_url:
- # /sample/shirouto/siro/3093/SIRO-3093_sample.ism/request?uid=XXX&pid=XXX
- preview_video = video_url.split('.ism/')[0] + '.mp4'
- movie.preview_video = preview_video
-
- movie.dvdid = dvdid
- movie.url = url
- movie.title = title
- movie.cover = cover
- movie.actress = actress
- movie.producer = producer
- movie.publish_date = publish_date
- movie.genre = genre
- movie.plot = plot
- movie.preview_pics = preview_pics
- movie.uncensored = False # 服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('HRV-045')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
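
Two easily missed details of the deleted mgstage parser: requests must carry the age-check cookie adc=1 or the site redirects to its confirmation page, and in hardworking mode the preview video URL is not in the static page at all, it comes from the small JSON endpoint sampleplayer/sampleRespons.php keyed by the sample player's pid. Below is a hedged async sketch of just that preview-video step with httpx; the client setup, error handling and the example pid are assumptions.

```python
# Sketch only; the real async mgstage crawler in this PR may structure this differently.
import asyncio

import httpx

BASE_URL = 'https://www.mgstage.com'


async def fetch_preview_video(video_pid: str) -> str | None:
    """Ask mgstage's sample-player endpoint for the preview clip of `video_pid`."""
    async with httpx.AsyncClient(cookies={'adc': '1'}, timeout=10.0) as client:
        resp = await client.get(f'{BASE_URL}/sampleplayer/sampleRespons.php',
                                params={'pid': video_pid})
    if resp.status_code != 200:
        return None
    video_url = resp.json().get('url')
    if not video_url:
        return None
    # The endpoint returns an .ism streaming path; the removed code rewrote it to the .mp4 file
    return video_url.split('.ism/')[0] + '.mp4'


if __name__ == '__main__':
    # In the removed code the pid was taken from the sample button's URL on the detail page
    print(asyncio.run(fetch_preview_video('SIRO-3093')))
```
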
diff --git a/javsp/web/njav.py b/javsp/web/njav.py
deleted file mode 100644
index f94e943f3..000000000
--- a/javsp/web/njav.py
+++ /dev/null
@@ -1,134 +0,0 @@
-"""从NJAV抓取数据"""
-import re
-import logging
-from typing import List
-
-
-from javsp.web.base import get_html
-from javsp.web.exceptions import *
-from javsp.lib import strftime_to_minutes
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://njav.tv/ja'
-
-def search_video(movie: MovieInfo):
- id_uc = movie.dvdid
- # 抓取网页
- url = f'{base_url}/search?keyword={id_uc}'
- html = get_html(url)
- list = html.xpath("//div[@class='box-item']/div[@class='detail']/a")
- video_url = None
- for item in list:
- search_title = item.xpath("text()")[0]
- if id_uc in search_title:
- video_url = item.xpath("@href")
- break
- if id_uc.startswith("FC2-"):
- fc2id = id_uc.replace('FC2-', '')
- if "FC2" in search_title and fc2id in search_title:
- video_url = item.xpath("@href")
- break
-
- return get_list_first(video_url)
-
-def parse_data(movie: MovieInfo):
- """解析指定番号的影片数据"""
- # 抓取网页
- url = search_video(movie)
- if not url:
- raise MovieNotFoundError(__name__, movie.dvdid)
- html = get_html(url)
- container = html.xpath("//div[@class='container']/div/div[@class='col']")
- if len(container) > 0:
- container = container[0]
- else:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- title = container.xpath("//div[@class='d-flex justify-content-between align-items-start']/div/h1/text()")[0]
- thumb_pic = container.xpath("//div[@id='player']/@data-poster")
- plot = " ".join(container.xpath("//div[@class='description']/p/text()"))
- magnet = container.xpath("//div[@class='magnet']/a/@href")
- real_id = None
- publish_date = None
- duration_str = None
- uncensored = None
- preview_pics = None
- preview_video = None
- serial = None
- publisher = None
- producer = None
- genre = []
- actress = []
-
- detail_dic = {}
- for item in container.xpath("//div[@class='detail-item']/div"):
- item_title = item.xpath('span/text()')[0]
- if "タグ:" in item_title:
- genre += item.xpath("span")[1].xpath("a/text()")
- elif "ジャンル:" in item_title:
- genre += item.xpath("span")[1].xpath("a/text()")
- elif "レーベル:" in item_title:
- genre += item.xpath("span")[1].xpath("a/text()")
- elif "女優:" in item_title:
- actress = item.xpath("span")[1].xpath("a/text()")
- elif "シリーズ:" in item_title:
- serial = get_list_first(item.xpath("span")[1].xpath("a/text()"))
- elif "メーカー:" in item_title:
- producer = get_list_first(item.xpath("span")[1].xpath("a/text()"))
- elif "コード:" in item_title:
- real_id = get_list_first(item.xpath("span")[1].xpath("text()"))
- elif "公開日:" in item_title:
- publish_date = get_list_first(item.xpath("span")[1].xpath("text()"))
- elif "再生時間:" in item_title:
- duration_str = get_list_first(item.xpath("span")[1].xpath("text()"))
-
- # 清除标题里的番号字符
- keywords = [real_id, " "]
- if movie.dvdid.startswith("FC2"):
- keywords += ["FC2","PPV","-"] + [movie.dvdid.split("-")[-1]]
- for keyword in keywords:
- title = re.sub(re.escape(keyword), "", title, flags=re.I)
-
- # 判断是否无码
- uncensored_arr = magnet + [title]
- for uncensored_str in uncensored_arr:
- if 'uncensored' in uncensored_str.lower():
- uncensored = True
-
- movie.url = url
- movie.title = title
- movie.genre = genre
- movie.actress = actress
- movie.duration = str(strftime_to_minutes(duration_str))
- movie.publish_date = publish_date
- movie.publisher = publisher
- movie.producer = producer
- movie.uncensored = uncensored
- movie.preview_pics = preview_pics
- movie.preview_video = preview_video
- movie.plot = plot
- movie.serial = serial
- movie.magnet = magnet
-
- # FC2的封面是220x220的,和正常封面尺寸、比例都差太多。如果有预览图片,则使用第一张预览图作为封面
- if movie.preview_pics:
- movie.cover = preview_pics[0]
- else:
- movie.cover = get_list_first(thumb_pic)
-
-def get_list_first(list:List):
- return list[0] if list and len(list) > 0 else None
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('012023_002')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
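
One small piece of the deleted njav parser that is easy to get wrong when porting is the duration handling: the page gives a clock-style string and strftime_to_minutes() from javsp.lib, whose implementation is not shown in this diff, turns it into minutes. Assuming the usual HH:MM:SS and MM:SS formats, a self-contained version of that conversion might look like this:

```python
# Assumed behaviour of the clock-string conversion; javsp.lib's real strftime_to_minutes()
# is not part of this diff and may accept more formats than shown here.
def clock_to_minutes(text: str) -> int:
    parts = [int(p) for p in text.strip().split(':')]
    while len(parts) < 3:
        parts.insert(0, 0)  # pad "MM:SS" (or bare seconds) up to HH:MM:SS
    hours, minutes, seconds = parts
    return (hours * 3600 + minutes * 60 + seconds) // 60


if __name__ == '__main__':
    print(clock_to_minutes('1:58:30'))  # -> 118
```
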
diff --git a/javsp/web/prestige.py b/javsp/web/prestige.py
deleted file mode 100644
index f6884c658..000000000
--- a/javsp/web/prestige.py
+++ /dev/null
@@ -1,83 +0,0 @@
-"""从蚊香社-prestige抓取数据"""
-import re
-import logging
-
-
-from javsp.web.base import *
-from javsp.web.exceptions import *
-from javsp.datatype import MovieInfo
-
-
-logger = logging.getLogger(__name__)
-base_url = 'https://www.prestige-av.com'
-# prestige要求访问者携带已通过R18认证的cookies才能够获得完整数据,否则会被重定向到认证页面
-# (其他多数网站的R18认证只是在网页上遮了一层,完整数据已经传回,不影响爬虫爬取)
-cookies = {'__age_auth__': 'true'}
-
-
-def parse_data(movie: MovieInfo):
- """从网页抓取并解析指定番号的数据
- Args:
- movie (MovieInfo): 要解析的影片信息,解析后的信息直接更新到此变量内
- """
- url = f'{base_url}/goods/goods_detail.php?sku={movie.dvdid}'
- resp = request_get(url, cookies=cookies, delay_raise=True)
- if resp.status_code == 500:
- # 500错误表明prestige没有这部影片的数据,不是网络问题,因此不再重试
- raise MovieNotFoundError(__name__, movie.dvdid)
- elif resp.status_code == 403:
- raise SiteBlocked('prestige不允许从当前IP所在地区访问,请尝试更换为日本地区代理')
- resp.raise_for_status()
- html = resp2html(resp)
- container_tags = html.xpath("//section[@class='px-4 mb-4 md:px-8 md:mb-16']")
- if not container_tags:
- raise MovieNotFoundError(__name__, movie.dvdid)
-
- container = container_tags[0]
- title = container.xpath("h1/span")[0].tail.strip()
- cover = container.xpath("//div[@class='c-ratio-image mr-8']/picture/source/img/@src")[0]
- cover = cover.split('?')[0]
- actress = container.xpath("//p[text()='出演者:']/following-sibling::div/p/a/text()")
- # 移除女优名中的空格,使女优名与其他网站保持一致
- actress = [i.strip().replace(' ', '') for i in actress]
- duration_str = container.xpath("//p[text()='収録時間:']")[0].getnext().text_content()
- match = re.search(r'\d+', duration_str)
- if match:
- movie.duration = match.group(0)
- date_url = container.xpath("//p[text()='発売日:']/following-sibling::div/a/@href")[0]
- publish_date = date_url.split('?date=')[-1]
- producer = container.xpath("//p[text()='メーカー:']/following-sibling::div/a/text()")[0].strip()
- dvdid = container.xpath("//p[text()='品番:']/following-sibling::div/p/text()")[0]
- genre_tags = container.xpath("//p[text()='ジャンル:']/following-sibling::div/a")
- genre = [tag.text.strip() for tag in genre_tags]
- serial = container.xpath("//p[text()='レーベル:']/following-sibling::div/a/text()")[0].strip()
- plot = container.xpath("//h2[text()='商品紹介']/following-sibling::p")[0].text.strip()
- preview_pics = container.xpath("//h2[text()='サンプル画像']/following-sibling::div/div/picture/source/img/@src")
- preview_pics = [i.split('?')[0] for i in preview_pics]
-
- # prestige改版后已经无法获取高清封面,此前已经获取的高清封面地址也已失效
- movie.url = url
- movie.dvdid = dvdid
- movie.title = title
- movie.cover = cover
- movie.actress = actress
- movie.publish_date = publish_date
- movie.producer = producer
- movie.genre = genre
- movie.serial = serial
- movie.plot = plot
- movie.preview_pics = preview_pics
- movie.uncensored = False # prestige服务器在日本且面向日本国内公开发售,不会包含无码片
-
-
-if __name__ == "__main__":
- import pretty_errors
- pretty_errors.configure(display_link=True)
- logger.root.handlers[1].level = logging.DEBUG
-
- movie = MovieInfo('ABP-647')
- try:
- parse_data(movie)
- print(movie)
- except CrawlerError as e:
- logger.error(e, exc_info=1)
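
The prestige module above is mostly plain parsing, but its status-code contract matters when porting: the site answers 500 when it simply has no data for a SKU (so retrying is pointless), 403 when the client's region is blocked, and it only serves full pages when the __age_auth__ cookie is set. A minimal httpx sketch of that contract follows; the exception classes are local stand-ins for the project's crawler exceptions, and everything else is assumed.

```python
# Sketch of prestige's response contract; not the actual replacement crawler.
import asyncio

import httpx

BASE_URL = 'https://www.prestige-av.com'


class MovieNotFound(Exception):
    pass


class SiteBlocked(Exception):
    pass


async def fetch_detail_page(sku: str) -> str:
    async with httpx.AsyncClient(cookies={'__age_auth__': 'true'}, timeout=10.0) as client:
        resp = await client.get(f'{BASE_URL}/goods/goods_detail.php', params={'sku': sku})
    if resp.status_code == 500:
        # prestige answers 500 when it has no record for this SKU: do not retry
        raise MovieNotFound(sku)
    if resp.status_code == 403:
        raise SiteBlocked('prestige rejects this region; try a Japanese proxy')
    resp.raise_for_status()
    return resp.text


if __name__ == '__main__':
    print(len(asyncio.run(fetch_detail_page('ABP-647'))))
```
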
diff --git a/javsp/web/proxyfree.py b/javsp/web/proxyfree.py
deleted file mode 100644
index 89c1e63a4..000000000
--- a/javsp/web/proxyfree.py
+++ /dev/null
@@ -1,75 +0,0 @@
-"""获取各个网站的免代理地址"""
-import re
-import sys
-
-from javsp.web.base import is_connectable, get_html, get_resp_text, request_get
-
-
-def get_proxy_free_url(site_name: str, prefer_url=None) -> str:
- """获取指定网站的免代理地址
- Args:
- site_name (str): 站点名称
- prefer_url (str, optional): 优先测试此url是否可用
- Returns:
- str: 指定站点的免代理地址(失败时为空字符串)
- """
- if prefer_url and is_connectable(prefer_url, timeout=5):
- return prefer_url
- # 当prefer_url不可用时,尝试自动获取指定网站的免代理地址
- site_name = site_name.lower()
- func_name = f'_get_{site_name}_urls'
- get_funcs = [i for i in dir(sys.modules[__name__]) if i.startswith('_get_')]
- if func_name in get_funcs:
- get_urls = getattr(sys.modules[__name__], func_name)
- try:
- urls = get_urls()
- return _choose_one(urls)
- except:
- return ''
- else:
- raise Exception("Dont't know how to get proxy-free url for " + site_name)
-
-
-def _choose_one(urls) -> str:
- for url in urls:
- if is_connectable(url, timeout=5):
- return url
- return ''
-
-
-def _get_avsox_urls() -> list:
- html = get_html('https://tellme.pw/avsox')
- urls = html.xpath('//h4/strong/a/@href')
- return urls
-
-
-def _get_javbus_urls() -> list:
- html = get_html('https://www.javbus.one/')
- text = html.text_content()
- urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A)
- return urls
-
-
-def _get_javlib_urls() -> list:
- html = get_html('https://github.com/javlibcom')
- text = html.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content()
- match = re.search(r'[\w\.]+', text, re.A)
- if match:
- domain = f'https://www.{match.group(0)}.com'
- return [domain]
-
-
-def _get_javdb_urls() -> list:
- html = get_html('https://jav524.app')
- js_links = html.xpath("//script[@src]/@src")
- for link in js_links:
- if '/js/index' in link:
- text = get_resp_text(request_get(link))
- match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A)
- if match:
- return [match.group(1)]
-
-
-if __name__ == "__main__":
- print('javdb:\t', _get_javdb_urls())
- print('javlib:\t', _get_javlib_urls())
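
The removed proxyfree helper split the work in two: per-site scrapers such as _get_javdb_urls() produced candidate addresses, and _choose_one() returned the first candidate that answered within five seconds. The unit-test changes at the end of this diff show that the replacement is async and keyed by the CrawlerID enum, but its connectivity check is not visible here, so the following is only a guess at what the "first reachable candidate" half could look like with httpx:

```python
# Hypothetical connectivity check; the real javsp.crawlers.proxyfree may differ.
import asyncio

import httpx


async def first_connectable(urls: list[str], timeout: float = 5.0) -> str | None:
    """Probe candidate proxy-free addresses concurrently and return the first usable one."""
    async def probe(url: str) -> bool:
        try:
            async with httpx.AsyncClient(timeout=timeout, follow_redirects=True) as client:
                resp = await client.head(url)
                return resp.status_code < 500
        except httpx.HTTPError:
            return False

    results = await asyncio.gather(*(probe(u) for u in urls))
    for url, ok in zip(urls, results):
        if ok:  # keep the caller's preference order even though probes ran concurrently
            return url
    return None


if __name__ == '__main__':
    print(asyncio.run(first_connectable(['https://javdb368.com', 'https://javdb.com'])))
```
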
diff --git a/poetry.lock b/poetry.lock
index 1c92293a3..f9b1b8d77 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,5 +1,21 @@
# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand.
+[[package]]
+name = "aiofiles"
+version = "24.1.0"
+description = "File support for asyncio."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "aiofiles-24.1.0-py3-none-any.whl", hash = "sha256:b4ec55f4195e3eb5d7abd1bf7e061763e864dd4954231fb8539a0ef8bb8260e5"},
+ {file = "aiofiles-24.1.0.tar.gz", hash = "sha256:22a075c9e5a3810f0c2e48f3008c94d68c65d763b9b03857924c99e57355166c"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "annotated-types"
version = "0.7.0"
@@ -16,6 +32,33 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
+[[package]]
+name = "anyio"
+version = "4.6.0"
+description = "High level compatibility layer for multiple asynchronous event loop implementations"
+optional = false
+python-versions = ">=3.9"
+files = [
+ {file = "anyio-4.6.0-py3-none-any.whl", hash = "sha256:c7d2e9d63e31599eeb636c8c5c03a7e108d73b345f064f1c19fdc87b79036a9a"},
+ {file = "anyio-4.6.0.tar.gz", hash = "sha256:137b4559cbb034c477165047febb6ff83f390fc3b20bf181c1fc0a728cb8beeb"},
+]
+
+[package.dependencies]
+exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""}
+idna = ">=2.8"
+sniffio = ">=1.1"
+typing-extensions = {version = ">=4.1", markers = "python_version < \"3.11\""}
+
+[package.extras]
+doc = ["Sphinx (>=7.4,<8.0)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme"]
+test = ["anyio[trio]", "coverage[toml] (>=7)", "exceptiongroup (>=1.2.0)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.21.0b1)"]
+trio = ["trio (>=0.26.1)"]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "certifi"
version = "2024.8.30"
@@ -116,131 +159,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "charset-normalizer"
-version = "3.3.2"
-description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
-optional = false
-python-versions = ">=3.7.0"
-files = [
- {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-win32.whl", hash = "sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73"},
- {file = "charset_normalizer-3.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-win32.whl", hash = "sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab"},
- {file = "charset_normalizer-3.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-win32.whl", hash = "sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7"},
- {file = "charset_normalizer-3.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-win32.whl", hash = "sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4"},
- {file = "charset_normalizer-3.3.2-cp37-cp37m-win_amd64.whl", hash = "sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-win32.whl", hash = "sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25"},
- {file = "charset_normalizer-3.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-win32.whl", hash = "sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f"},
- {file = "charset_normalizer-3.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d"},
- {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"},
-]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
-[[package]]
-name = "cloudscraper"
-version = "1.2.71"
-description = "A Python module to bypass Cloudflare's anti-bot page."
-optional = false
-python-versions = "*"
-files = [
- {file = "cloudscraper-1.2.71-py2.py3-none-any.whl", hash = "sha256:76f50ca529ed2279e220837befdec892626f9511708e200d48d5bb76ded679b0"},
- {file = "cloudscraper-1.2.71.tar.gz", hash = "sha256:429c6e8aa6916d5bad5c8a5eac50f3ea53c9ac22616f6cb21b18dcc71517d0d3"},
-]
-
-[package.dependencies]
-pyparsing = ">=2.4.7"
-requests = ">=2.9.2"
-requests-toolbelt = ">=0.9.1"
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "colorama"
version = "0.4.4"
@@ -570,6 +488,79 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
+[[package]]
+name = "h11"
+version = "0.14.0"
+description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761"},
+ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
+[[package]]
+name = "httpcore"
+version = "1.0.5"
+description = "A minimal low-level HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "httpcore-1.0.5-py3-none-any.whl", hash = "sha256:421f18bac248b25d310f3cacd198d55b8e6125c107797b609ff9b7a6ba7991b5"},
+ {file = "httpcore-1.0.5.tar.gz", hash = "sha256:34a38e2f9291467ee3b44e89dd52615370e152954ba21721378a87b2960f7a61"},
+]
+
+[package.dependencies]
+certifi = "*"
+h11 = ">=0.13,<0.15"
+
+[package.extras]
+asyncio = ["anyio (>=4.0,<5.0)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+trio = ["trio (>=0.22.0,<0.26.0)"]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
+[[package]]
+name = "httpx"
+version = "0.27.2"
+description = "The next generation HTTP client."
+optional = false
+python-versions = ">=3.8"
+files = [
+ {file = "httpx-0.27.2-py3-none-any.whl", hash = "sha256:7bb2708e112d8fdd7829cd4243970f0c223274051cb35ee80c03301ee29a3df0"},
+ {file = "httpx-0.27.2.tar.gz", hash = "sha256:f7c2be1d2f3c3c3160d441802406b206c2b76f5947b11115e6df10c6c65e66c2"},
+]
+
+[package.dependencies]
+anyio = "*"
+certifi = "*"
+httpcore = "==1.*"
+idna = "*"
+sniffio = "*"
+socksio = {version = "==1.*", optional = true, markers = "extra == \"socks\""}
+
+[package.extras]
+brotli = ["brotli", "brotlicffi"]
+cli = ["click (==8.*)", "pygments (==2.*)", "rich (>=10,<14)"]
+http2 = ["h2 (>=3,<5)"]
+socks = ["socksio (==1.*)"]
+zstd = ["zstandard (>=0.18.0)"]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "idna"
version = "3.10"
@@ -1428,25 +1419,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "pyparsing"
-version = "3.1.4"
-description = "pyparsing module - Classes and methods to define and execute parsing grammars"
-optional = false
-python-versions = ">=3.6.8"
-files = [
- {file = "pyparsing-3.1.4-py3-none-any.whl", hash = "sha256:a6a7ee4235a3f944aa1fa2249307708f893fe5717dc603503c6c7969c070fb7c"},
- {file = "pyparsing-3.1.4.tar.gz", hash = "sha256:f86ec8d1a83f11977c9a6ea7598e8c27fc5cddfa5b07ea2241edbbde1d7bc032"},
-]
-
-[package.extras]
-diagrams = ["jinja2", "railroad-diagrams"]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "pytest"
version = "8.3.3"
@@ -1623,51 +1595,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "requests"
-version = "2.31.0"
-description = "Python HTTP for Humans."
-optional = false
-python-versions = ">=3.7"
-files = [
- {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"},
- {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"},
-]
-
-[package.dependencies]
-certifi = ">=2017.4.17"
-charset-normalizer = ">=2,<4"
-idna = ">=2.5,<4"
-urllib3 = ">=1.21.1,<3"
-
-[package.extras]
-socks = ["PySocks (>=1.5.6,!=1.5.7)"]
-use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
-[[package]]
-name = "requests-toolbelt"
-version = "1.0.0"
-description = "A utility belt for advanced users of python-requests"
-optional = false
-python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
-files = [
- {file = "requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6"},
- {file = "requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06"},
-]
-
-[package.dependencies]
-requests = ">=2.0.1,<3.0.0"
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "setuptools"
version = "75.1.0"
@@ -1748,6 +1675,38 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
+[[package]]
+name = "sniffio"
+version = "1.3.1"
+description = "Sniff out which async library your code is running under"
+optional = false
+python-versions = ">=3.7"
+files = [
+ {file = "sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2"},
+ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
+[[package]]
+name = "socksio"
+version = "1.0.0"
+description = "Sans-I/O implementation of SOCKS4, SOCKS4A, and SOCKS5."
+optional = false
+python-versions = ">=3.6"
+files = [
+ {file = "socksio-1.0.0-py3-none-any.whl", hash = "sha256:95dc1f15f9b34e8d7b16f06d74b8ccf48f609af32ab33c608d08761c5dcbb1f3"},
+ {file = "socksio-1.0.0.tar.gz", hash = "sha256:f88beb3da5b5c38b9890469de67d0cb0f9d494b78b106ca1845f96c10b91c4ac"},
+]
+
+[package.source]
+type = "legacy"
+url = "https://pypi.tuna.tsinghua.edu.cn/simple"
+reference = "mirrors"
+
[[package]]
name = "time-machine"
version = "2.15.0"
@@ -1992,28 +1951,6 @@ type = "legacy"
url = "https://pypi.tuna.tsinghua.edu.cn/simple"
reference = "mirrors"
-[[package]]
-name = "urllib3"
-version = "2.2.3"
-description = "HTTP library with thread-safe connection pooling, file post, and more."
-optional = false
-python-versions = ">=3.8"
-files = [
- {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"},
- {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"},
-]
-
-[package.extras]
-brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"]
-h2 = ["h2 (>=4,<5)"]
-socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
-zstd = ["zstandard (>=0.18.0)"]
-
-[package.source]
-type = "legacy"
-url = "https://pypi.tuna.tsinghua.edu.cn/simple"
-reference = "mirrors"
-
[[package]]
name = "zipp"
version = "3.20.2"
@@ -2041,4 +1978,4 @@ reference = "mirrors"
[metadata]
lock-version = "2.0"
python-versions = "<3.13,>=3.10"
-content-hash = "056b2f7a21b0286a04a5ecadb809f6472c636348fe07976ac42c9c47c620f04c"
+content-hash = "3c98b4c2562b1cc5d88474d6962ab34e60be1be488d840c691c0d0e1095d7285"
diff --git a/pyproject.toml b/pyproject.toml
index a5e1b4d10..a74d2bc1b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,11 +13,9 @@ format = "v{base}.{distance}"
[tool.poetry.dependencies]
python = "<3.13,>=3.10"
-cloudscraper = "1.2.71"
colorama = "0.4.4"
pillow = "10.2.0"
pretty-errors = "1.2.19"
-requests = "2.31.0"
tqdm = "4.59.0"
# https://stackoverflow.com/questions/446209/possible-values-from-sys-platform
pywin32 = {version = "^306", markers = "sys_platform == 'win32'"}
@@ -29,6 +27,8 @@ confz = "^2.0.1"
pydantic-extra-types = "^2.9.0"
pendulum = "^3.0.0"
slimeface = "^2024.9.27"
+httpx = {extras = ["socks"], version = "^0.27.2"}
+aiofiles = "^24.1.0"
[tool.poetry.scripts]
javsp = "javsp.__main__:entry"
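
The dependency swap above (requests and cloudscraper removed, httpx with the socks extra and aiofiles added) is what enables the async crawlers and SOCKS5 proxy support elsewhere in this PR. The sketch below only illustrates how the two new libraries combine for a proxied, streamed file download; it is not the project's actual download helper, and the proxy address and file name are placeholders.

```python
# Sketch: streaming a download through a SOCKS5 proxy with the two new dependencies.
import asyncio

import aiofiles
import httpx


async def download(url: str, dest: str, proxy: str | None = None) -> None:
    # httpx[socks] pulls in socksio, which is what makes the socks5:// scheme usable here
    async with httpx.AsyncClient(proxy=proxy, follow_redirects=True, timeout=30.0) as client:
        async with client.stream('GET', url) as resp:
            resp.raise_for_status()
            async with aiofiles.open(dest, 'wb') as f:
                async for chunk in resp.aiter_bytes():
                    await f.write(chunk)


if __name__ == '__main__':
    asyncio.run(download('https://httpbin.org/image/jpeg', 'cover.jpg',
                         proxy='socks5://127.0.0.1:1080'))
```
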
diff --git a/tools/config_migration.py b/tools/config_migration.py
index 95adc45d6..f08f9ed67 100644
--- a/tools/config_migration.py
+++ b/tools/config_migration.py
@@ -76,13 +76,16 @@ def fix_pat(p):
# 设置代理服务器地址,支持 http, socks5/socks5h 代理,比如'http://127.0.0.1:1080'
# null表示禁用代理
proxy_server: {'null' if proxy_disabled else f"'{cfg['Network']['proxy']}'"}
- # 各个站点的免代理地址。地址失效时软件会自动尝试获取新地址,你也可以手动设置
- proxy_free:
-{'\n'.join([f" {id}: '{url}'" for id, url in dict(cfg['ProxyFree']).items()])}
# 网络问题导致抓取数据失败时的重试次数,通常3次就差不多了
- retry: {cfg['Network']['retry']}
+ retries: {cfg['Network']['retry']}
# https://en.wikipedia.org/wiki/ISO_8601#Durations
timeout: PT{cfg['Network']['timeout']}S
+ # 下列地址不走代理服务器(由原配置中的免代理地址迁移而来)
+ unproxied: [{
+ ', '.join(dict(cfg['ProxyFree']).values())
+}]
+ fallback:
+{'\n'.join([f" {id}: ['{url}']" for id, url in dict(cfg['ProxyFree']).items()])}
################################
crawler:
@@ -100,8 +103,6 @@ def fix_pat(p):
hardworking: {yes_to_true(cfg['Crawler']['hardworking_mode'])}
# 使用网页番号作为最终番号(启用时会对番号大小写等进行更正)
respect_site_avid: {yes_to_true(cfg['Crawler']['respect_site_avid'])}
- # fc2fan已关站。如果你有镜像,请设置本地镜像文件夹的路径,此文件夹内要有类似'FC2-12345.html'的网页文件
- fc2fan_local_path: '{cfg['Crawler']['fc2fan_local_path']}'
# 刮削一部电影后的等待时间(设置为0禁用此功能)
# https://en.wikipedia.org/wiki/ISO_8601#Durations
sleep_after_scraping: PT{cfg['Crawler']['sleep_after_scraping']}S
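
The migration template above maps the old [Network] and [ProxyFree] settings onto the new schema: retry becomes retries, and every old proxy-free address ends up both in the unproxied list and as the single entry of that site's fallback list. The same mapping, reduced to plain dictionaries instead of the configparser object the real script works with, looks roughly like this:

```python
# Minimal illustration of the old -> new network-section mapping performed above.
def migrate_network(old_network: dict, old_proxyfree: dict) -> dict:
    return {
        'proxy_server': old_network.get('proxy') or None,
        'retries': int(old_network['retry']),
        'timeout': f"PT{old_network['timeout']}S",
        'unproxied': list(old_proxyfree.values()),
        'fallback': {site: [url] for site, url in old_proxyfree.items()},
    }


if __name__ == '__main__':
    print(migrate_network(
        {'proxy': '', 'retry': '3', 'timeout': '10'},
        {'javdb': 'https://javdb368.com', 'javlib': 'https://www.y78k.com'},
    ))
```
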
diff --git a/unittest/test_proxyfree.py b/unittest/test_proxyfree.py
index 1537d93ad..65151a9d4 100644
--- a/unittest/test_proxyfree.py
+++ b/unittest/test_proxyfree.py
@@ -1,18 +1,25 @@
-import os
-import sys
-
-sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
-from javsp.web.proxyfree import *
+import asyncio
+import tracemalloc
+from javsp.crawlers.proxyfree import get_proxy_free_url
+from javsp.config import CrawlerID
def test_get_url():
- assert get_proxy_free_url('javlib') != ''
- assert get_proxy_free_url('javdb') != ''
+ async def wrap():
+ assert await get_proxy_free_url(CrawlerID.javlib) is not None
+ assert await get_proxy_free_url(CrawlerID.javdb) is not None
+ asyncio.run(wrap())
def test_get_url_with_prefer():
- prefer_url = 'https://www.baidu.com'
- assert prefer_url == get_proxy_free_url('javlib', prefer_url)
+ async def wrap():
+ prefer_url = 'https://www.baidu.com'
+ assert prefer_url == await get_proxy_free_url(CrawlerID.javlib, prefer_url)
+ asyncio.run(wrap())
if __name__ == "__main__":
- print(get_proxy_free_url('javlib'))
+ async def aentry():
+ print(await get_proxy_free_url(CrawlerID.javlib))
+
+ tracemalloc.start()
+ asyncio.run(aentry(), debug=True)