Refactor the network request interface with async/await #389

Closed · wants to merge 7 commits
24 changes: 15 additions & 9 deletions config.yml
@@ -25,16 +25,24 @@ network:
  # Proxy server address; http and socks5/socks5h proxies are supported, e.g. 'http://127.0.0.1:1080'
  # null disables the proxy
  proxy_server: null
  # Proxy-free address of each site. When an address stops working, the software will try to fetch a new one automatically; you can also set one manually
  proxy_free:
    avsox: 'https://avsox.click'
    javbus: 'https://www.seedmm.help'
    javdb: 'https://javdb368.com'
    javlib: 'https://www.y78k.com'
  # Number of retries when scraping fails due to network problems; 3 is usually enough
  retry: 3
  retries: 3
  # https://en.wikipedia.org/wiki/ISO_8601#Durations
  timeout: PT10S
  # Do not use the proxy (if one is enabled) for the addresses in this list
  unproxied: [
    'https://www.seedmm.help',
    'https://javdb368.com',
    'https://www.y78k.com',
    'https://www.javbus.one',
    'https://www.tellme.pw',
  ]
  # Alternative addresses for each site.
  # JavSP tries every server in the list in order; if none of them works, the default main site address is used
  fallback:
    javbus: ['https://www.seedmm.help']
    javdb: ['https://javdb368.com']
    javlib: ['https://www.y78k.com']
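The fallback mechanism replaces the old proxy_free map: JavSP now walks a per-site list instead of holding a single address. A minimal sketch of that resolution order, assuming httpx (the resolve_fallback name and signature below are illustrative, not the PR's actual javsp/network/utils implementation):

# Hypothetical sketch of the fallback semantics described above.
from httpx import AsyncClient

async def resolve_fallback(candidates: list[str], default: str) -> str:
    async with AsyncClient(timeout=5.0) as client:
        for url in candidates:            # try the servers in the configured order
            try:
                resp = await client.head(url)
                if resp.status_code < 500:
                    return url            # first reachable server wins
            except Exception:
                continue                  # unreachable, try the next one
    return default                        # none worked: fall back to the main site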

################################
crawler:
@@ -52,8 +60,6 @@ crawler:
  hardworking: true
  # Use the movie ID found on the web page as the final ID (when enabled, letter case etc. of the ID will be corrected)
  respect_site_avid: true
  # fc2fan has shut down. If you have a mirror, set the path of the local mirror folder here; it should contain HTML files such as 'FC2-12345.html'
  fc2fan_local_path: null
  # Wait time after scraping one movie (set to 0 to disable this feature)
  # https://en.wikipedia.org/wiki/ISO_8601#Durations
  sleep_after_scraping: PT1S
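Both duration values use ISO 8601 notation, as the linked Wikipedia article describes. A quick sketch of what they evaluate to, assuming the pendulum library that backs the Duration type in javsp/config.py:

# Assumes pendulum is installed; pendulum.parse() understands ISO 8601 durations.
import pendulum

timeout = pendulum.parse('PT10S')                # 10 seconds
pause = pendulum.parse('PT1S')                   # 1 second
print(timeout.in_seconds(), pause.in_seconds())  # -> 10 1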
210 changes: 82 additions & 128 deletions javsp/__main__.py

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions javsp/config.py
@@ -39,9 +39,10 @@ class CrawlerID(str, Enum):

class Network(BaseConfig):
    proxy_server: Url | None
    retry: NonNegativeInt = 3
    retries: NonNegativeInt = 3
    timeout: Duration
    proxy_free: Dict[CrawlerID, Url]
    unproxied: List[Url]
    fallback: Dict[CrawlerID, List[str]]

class CrawlerSelect(BaseConfig):
    def items(self) -> List[tuple[str, list[CrawlerID]]]:
@@ -109,7 +110,6 @@ class Crawler(BaseConfig):
    required_keys: list[MovieInfoField]
    hardworking: bool
    respect_site_avid: bool
    fc2fan_local_path: Path | None
    sleep_after_scraping: Duration
    use_javdb_cover: UseJavDBCover
    normalize_actress_name: bool
30 changes: 30 additions & 0 deletions javsp/crawlers/all.py
@@ -0,0 +1,30 @@
from typing import Dict
from javsp.config import CrawlerID
from javsp.crawlers.interface import Crawler
from javsp.crawlers.sites import \
    airav, arzon, arzon_iv, avsox, avwiki, dl_getchu, fanza, fc2, fc2ppvdb, \
    gyutto, jav321, javbus, javdb, javlib, javmenu, mgstage, njav, prestige

__all__ = ['crawlers']

crawlers: Dict[CrawlerID, type[Crawler]] = {
    CrawlerID.airav:     airav.AiravCrawler,
    CrawlerID.arzon:     arzon.ArzonCrawler,
    CrawlerID.arzon_iv:  arzon_iv.ArzonIvCrawler,
    CrawlerID.avsox:     avsox.AvsoxCrawler,
    CrawlerID.avwiki:    avwiki.AvWikiCrawler,
    CrawlerID.dl_getchu: dl_getchu.DlGetchuCrawler,
    CrawlerID.fanza:     fanza.FanzaCrawler,
    CrawlerID.fc2:       fc2.Fc2Crawler,
    CrawlerID.fc2ppvdb:  fc2ppvdb.Fc2PpvDbCrawler,
    CrawlerID.gyutto:    gyutto.GyuttoCrawler,
    CrawlerID.jav321:    jav321.Jav321Crawler,
    CrawlerID.javbus:    javbus.JavbusCrawler,
    CrawlerID.javdb:     javdb.JavDbCrawler,
    CrawlerID.javlib:    javlib.JavLibCrawler,
    CrawlerID.javmenu:   javmenu.JavMenuCrawler,
    CrawlerID.mgstage:   mgstage.MgstageCrawler,
    CrawlerID.njav:      njav.NjavCrawler,
    CrawlerID.prestige:  prestige.PrestigeCrawler,
}
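The call site in javsp/__main__.py is not rendered in this diff, so here is a hedged sketch of how the registry might be driven (the scrape helper below is illustrative, not the PR's actual entry point):

# Hypothetical driver for the crawlers registry above.
import asyncio
from javsp.config import CrawlerID
from javsp.crawlers.all import crawlers
from javsp.datatype import MovieInfo

async def scrape(avid: str, site: CrawlerID) -> MovieInfo:
    crawler = await crawlers[site].create()   # async factory defined by the Crawler ABC
    movie = MovieInfo(avid)
    await crawler.crawl_and_fill(movie)
    return movie

if __name__ == '__main__':
    print(asyncio.run(scrape('012717_472', CrawlerID.airav)))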
File renamed without changes.
21 changes: 21 additions & 0 deletions javsp/crawlers/interface.py
@@ -0,0 +1,21 @@
from httpx import AsyncClient
from javsp.config import CrawlerID
from javsp.datatype import MovieInfo
from abc import ABC, abstractmethod
from typing import Self


class Crawler(ABC):
    base_url: str
    client: AsyncClient
    id: CrawlerID

    @classmethod
    @abstractmethod
    async def create(cls) -> Self:
        pass

    @abstractmethod
    async def crawl_and_fill(self, movie: MovieInfo) -> None:
        pass
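create() is an async classmethod because __init__ cannot await, while constructing a crawler may require network I/O (e.g. resolving a fallback base URL). A minimal sketch of the pattern with a hypothetical subclass, reusing the imports above:

# Hypothetical subclass illustrating the async-factory pattern.
class ExampleCrawler(Crawler):
    id = CrawlerID.avsox  # any registered ID; chosen arbitrarily here

    @classmethod
    async def create(cls) -> 'ExampleCrawler':
        self = cls()
        self.base_url = 'https://example.com'        # may come from awaited lookups
        self.client = AsyncClient(base_url=self.base_url)
        return self

    async def crawl_and_fill(self, movie: MovieInfo) -> None:
        resp = await self.client.get(f'/api/{movie.dvdid}')
        movie.title = resp.json().get('title')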
98 changes: 98 additions & 0 deletions javsp/crawlers/proxyfree.py
@@ -0,0 +1,98 @@
"""获取各个网站的免代理地址"""
from collections.abc import Callable, Coroutine
import re
from typing import Any, Dict

from pydantic_core import Url
from pydantic_extra_types.pendulum_dt import Duration
from lxml import html

from javsp.config import CrawlerID
from javsp.network.utils import test_connect, choose_one_connectable
from javsp.network.client import get_client


async def _get_avsox_urls() -> list[str]:
    link = 'https://tellme.pw/avsox'
    client = get_client(Url(link))
    resp = await client.get(link)
    tree = html.fromstring(resp.text)
    urls = tree.xpath('//h4/strong/a/@href')
    return urls


async def _get_javbus_urls() -> list[str]:
    link = 'https://www.javbus.one/'
    client = get_client(Url(link))
    resp = await client.get(link)
    text = resp.text
    urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A)
    return urls


async def _get_javlib_urls() -> list[str]:
    link = 'https://github.com/javlibcom'
    client = get_client(Url(link))
    resp = await client.get(link)
    tree = html.fromstring(resp.text)
    text = tree.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content()
    match = re.search(r'[\w\.]+', text, re.A)
    if match:
        domain = f'https://www.{match.group(0)}.com'
        return [domain]
    return []


async def _get_javdb_urls() -> list[str]:
    root_link = 'https://jav524.app'
    client = get_client(Url(root_link))
    resp = await client.get(root_link)
    tree = html.fromstring(resp.text)
    js_links = tree.xpath("//script[@src]/@src")
    for link in js_links:
        if '/js/index' in link:
            link = root_link + link
            resp = await client.get(link)
            text = resp.text
            match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A)
            if match:
                return [match.group(1)]
    return []


proxy_free_fns: Dict[CrawlerID, Callable[[], Coroutine[Any, Any, list[str]]]] = {
    CrawlerID.avsox:  _get_avsox_urls,
    CrawlerID.javdb:  _get_javdb_urls,
    CrawlerID.javbus: _get_javbus_urls,
    CrawlerID.javlib: _get_javlib_urls,
}

async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None = None) -> str | None:
    """Get the proxy-free address of the given site

    Args:
        site_name (CrawlerID): site name
        prefer_url (str, optional): test this URL first and use it if it is reachable

    Returns:
        str | None: proxy-free address of the site (None on failure)
    """
    if prefer_url and await test_connect(prefer_url, Duration(seconds=5)):
        return prefer_url

    if site_name in proxy_free_fns:
        try:
            urls = await proxy_free_fns[site_name]()
            return await choose_one_connectable(urls)
        except Exception:
            return None
    else:
        raise Exception("Don't know how to get proxy-free url for " + site_name)



if __name__ == "__main__":

    async def test_main():
        print('javdb:\t', await _get_javdb_urls())
        print('javlib:\t', await _get_javlib_urls())

    import asyncio
    asyncio.run(test_main())
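test_connect and choose_one_connectable are imported from javsp/network/utils, which this diff excerpt does not show. A rough sketch of the behavior the module above relies on, with signatures inferred from the call sites (treat them as assumptions):

# Assumed behavior of the helpers imported from javsp.network.utils.
from httpx import AsyncClient
from pendulum import Duration

async def test_connect(url: str, timeout: Duration) -> bool:
    """Return True if url answers within the given timeout."""
    try:
        async with AsyncClient(timeout=timeout.total_seconds()) as client:
            resp = await client.get(url)
            return resp.status_code < 500
    except Exception:
        return False

async def choose_one_connectable(urls: list[str]) -> str | None:
    """Return the first reachable URL, or None if none respond."""
    for url in urls:
        if await test_connect(url, Duration(seconds=5)):
            return url
    return None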
124 changes: 124 additions & 0 deletions javsp/crawlers/sites/airav.py
@@ -0,0 +1,124 @@
"""从airav抓取数据"""
import re
from html import unescape

from javsp.crawlers.exceptions import MovieNotFoundError
from javsp.network.client import get_client
from javsp.network.utils import resolve_site_fallback
from javsp.config import Cfg, CrawlerID
from javsp.datatype import MovieInfo
from javsp.crawlers.interface import Crawler


class AiravCrawler(Crawler):
    id = CrawlerID.airav

    @classmethod
    async def create(cls):
        self = cls()
        url = await resolve_site_fallback(self.id, 'https://www.airav.wiki')
        self.base_url = str(url)
        self.client = get_client(url)
        self.client.headers['Accept-Language'] = 'zh-TW,zh;q=0.9'
        return self

    async def search_movie(self, dvdid: str):
        """Find the site-internal ID of a movie by searching for its AV ID"""
        # Some movies' site IDs are not simply the AV ID (e.g. 012717-360), so we may have to search for them
        page = 0
        count = 1
        result = []
        while len(result) < count:
            url = f'{self.base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}'
            response = await self.client.get(url)
            resp = response.json()
            # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"}
            if resp['result']:
                result.extend(resp['result'])
                count = resp['count']
                page += 1
            else:  # empty result: stop looping
                break
        # If nothing was found, bail out right away
        if not result:
            raise MovieNotFoundError(__name__, dvdid)
        # Sort so that the more likely candidates come first (e.g. '1pondo_012717_472' and '_1pondo_012717_472' for '012717_472')
        result.sort(key=lambda x: x['barcode'])
        # Pick the most likely ID among all search results and return its barcode
        target = dvdid.replace('-', '_')
        for item in result:
            # {'vid': '', 'slug': '', 'name': '', 'url': '', 'view': '', 'img_url': '', 'barcode': ''}
            barcode = item['barcode'].replace('-', '_')
            if target in barcode:
                return item['barcode']
        raise MovieNotFoundError(__name__, dvdid, result)



    async def crawl_and_fill(self, movie: MovieInfo) -> None:
        """Parse the data of the movie with the given ID"""
        # airav also serves Simplified Chinese, but we scrape Traditional Chinese so that actress names etc. stay consistent with other sites
        url = f'{self.base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
        response = await self.client.get(url)
        resp_json = response.json()
        # Only fall back to searching when the ID is purely numeric; otherwise we might match the wrong movie
        if resp_json['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid):
            barcode = await self.search_movie(movie.dvdid)
            if barcode:
                url = f'{self.base_url}/api/video/barcode/{barcode}?lng=zh-TW'
                response = await self.client.get(url)
                resp_json = response.json()

        if resp_json['count'] == 0:
            raise MovieNotFoundError(__name__, movie.dvdid, resp_json)

        # Extract the fields we need from the API response
        # TODO: the data contains more information (e.g. Chinese/Japanese actress name pairs) that may help future features
        data = resp_json['result']
        dvdid = data['barcode']
        movie.dvdid = dvdid
        movie.url = self.base_url + '/video/' + dvdid
        # plot and title may contain HTML escape sequences and need to be unescaped
        movie.plot = unescape(data['description']) or None
        movie.cover = data['img_url']
        # airav organizes genres as search keywords, without specific genre_ids
        movie.genre = [i['name'] for i in data['tags']]
        movie.title = unescape(data['name'])
        movie.actress = [i['name'] for i in data['actors']]
        movie.publish_date = data['publish_date']
        movie.preview_pics = data['images'] or []
        if data['factories']:
            movie.producer = data['factories'][0]['name']

        if Cfg().crawler.hardworking:
            # Note that this uses the fetched dvdid, not the movie.dvdid passed in (e.g. '1pondo_012717_472' vs. '012717_472')
            video_url = f"{self.base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
            response = await self.client.get(video_url)
            resp = response.json()
            # On failure the response looks like {'msg': 'fail', 'status': 'fail'}
            if 'data' in resp:
                # Besides url there are url_cdn, url_hlx and url_hls_cdn fields; the last two are m3u8. For now url is used as the preview video address
                # TODO: the legacy-format preview clip of some movies (e.g. 080719-976) turns out to be broken
                movie.preview_video = resp['data'].get('url')

        # Some movies on airav are tagged e.g. '馬賽克破壞版' (mosaic-destroyed edition); for those the title, plot and genre are no longer accurate
        for keyword in ('馬賽克破壞版', '馬賽克破解版', '無碼流出版'):
            if movie.title and keyword in movie.title:
                movie.title = None
                movie.genre = []
            if movie.plot and keyword in movie.plot:
                movie.plot = None
                movie.genre = []
            if not any([movie.title, movie.plot, movie.genre]):
                break

if __name__ == "__main__":

    async def test_main():
        crawler = await AiravCrawler.create()
        movie = MovieInfo("DSAD-938")
        await crawler.crawl_and_fill(movie)
        print(movie)

    import asyncio
    asyncio.run(test_main())
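The substring match in search_movie depends on normalizing '-' to '_' on both sides before comparing; a quick illustration with made-up values:

# Example of the ID normalization used by search_movie above.
target = '012717-472'.replace('-', '_')           # '012717_472'
for barcode in ('1pondo_012717_472', '_1pondo_012717_472'):
    print(target in barcode.replace('-', '_'))    # True, True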