refactor network with aiohttp
glyh committed Sep 29, 2024
1 parent 5ed7f36 commit 9729542
Showing 29 changed files with 794 additions and 348 deletions.
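Every crawler below repeats the same call-site migration: httpx's AsyncClient exposes resp.text, resp.content and resp.json() synchronously and names the status code status_code, while aiohttp's ClientSession turns the body accessors into coroutines and names the field status. A minimal sketch of the mapping, assuming a plain aiohttp session (the function name and URL are illustrative, not from the commit):

import asyncio
import aiohttp

async def fetch_demo() -> None:
    # One ClientSession, reused for every request to the site.
    async with aiohttp.ClientSession() as session:
        resp = await session.get('https://example.com')
        status = resp.status            # httpx: resp.status_code
        body_text = await resp.text()   # httpx: resp.text (sync property)
        raw_bytes = await resp.read()   # httpx: resp.content (sync property)
        # parsed = await resp.json()    # httpx: resp.json() (sync method)
        print(status, len(body_text), len(raw_bytes))

asyncio.run(fetch_demo())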
3 changes: 3 additions & 0 deletions javsp/__main__.py
@@ -11,6 +11,7 @@
 from pydantic_extra_types.pendulum_dt import Duration
 from typing import Any, Coroutine, Dict, List
 from javsp.crawlers.all import crawlers
+from javsp.network.client import clear_clients

 sys.stdout.reconfigure(encoding='utf-8')

@@ -549,6 +550,8 @@ async def aentry():
     logger.info(f'扫描影片文件:共找到 {movie_count} 部影片')
     await RunNormalMode(recognized + recognize_fail)

+    await clear_clients()
+
     sys.exit(0)

 def entry():
2 changes: 2 additions & 0 deletions javsp/chromium.py
@@ -32,6 +32,8 @@ def decrypt(self, encrypted_value):

 def get_browsers_cookies():
     """Fetch the JavDB cookies of every Chromium-based browser on this system"""
+    if not sys.platform.startswith('win32'):  # systems other than Windows are not supported
+        return []
     # Not supported: Opera, 360 Secure/Speed and Sogou use non-standard user directories or data formats; QQ Browser blocks the site
     user_data_dirs = {
         'Chrome': '/Google/Chrome/User Data',
4 changes: 2 additions & 2 deletions javsp/crawlers/interface.py
@@ -1,13 +1,13 @@
-from httpx import AsyncClient
 from javsp.config import CrawlerID
 from javsp.datatype import MovieInfo
 from abc import ABC, abstractmethod
 from typing import Self
+from aiohttp import ClientSession


 class Crawler(ABC):
     base_url: str
-    client: AsyncClient
+    client: ClientSession
     id: CrawlerID
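The new javsp.network.client module (get_session, clear_clients) is among the 19 changed files not shown in this excerpt. Judging only from the call sites in this diff, it hands out a cached ClientSession per site and closes them all at shutdown. The sketch below is a hypothetical reconstruction under that assumption, not the commit's actual code; the URL type and caching key in particular are guesses:

# Hypothetical reconstruction of javsp/network/client.py (not shown in this excerpt).
from aiohttp import ClientSession
from yarl import URL  # assumption: the project passes its own Url type; yarl stands in here

_sessions: dict[str, ClientSession] = {}

def get_session(url: URL) -> ClientSession:
    # Cache one session per origin; must be called while an event loop is
    # running, which holds for the crawlers' async create() methods.
    key = f'{url.scheme}://{url.host}'
    if key not in _sessions:
        _sessions[key] = ClientSession()
    return _sessions[key]

async def clear_clients() -> None:
    # Close every cached session once, before the program exits.
    for session in _sessions.values():
        await session.close()
    _sessions.clear()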
30 changes: 15 additions & 15 deletions javsp/crawlers/proxyfree.py
@@ -9,32 +9,32 @@

 from javsp.config import CrawlerID
 from javsp.network.utils import test_connect, choose_one_connectable
-from javsp.network.client import get_client
+from javsp.network.client import get_session


 async def _get_avsox_urls() -> list[str]:
     link = 'https://tellme.pw/avsox'
-    client = get_client(Url(link))
-    resp = await client.get(link)
-    tree = html.fromstring(resp.text)
+    s = get_session(Url(link))
+    resp = await s.get(link)
+    tree = html.fromstring(await resp.text())
     urls = tree.xpath('//h4/strong/a/@href')
     return urls


 async def _get_javbus_urls() -> list[str]:
     link = 'https://www.javbus.one/'
-    client = get_client(Url(link))
-    resp = await client.get(link)
-    text = resp.text
+    s = get_session(Url(link))
+    resp = await s.get(link)
+    text = await resp.text()
     urls = re.findall(r'防屏蔽地址:(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})', text, re.I | re.A)
     return urls


 async def _get_javlib_urls() -> list[str]:
     link = 'https://github.com/javlibcom'
-    client = get_client(Url(link))
-    resp = await client.get(link)
-    tree = html.fromstring(resp.text)
+    s = get_session(Url(link))
+    resp = await s.get(link)
+    tree = html.fromstring(await resp.text())
     text = tree.xpath("//div[@class='p-note user-profile-bio mb-3 js-user-profile-bio f4']")[0].text_content()
     match = re.search(r'[\w\.]+', text, re.A)
     if match:
@@ -45,15 +45,15 @@ async def _get_javlib_urls() -> list[str]:

 async def _get_javdb_urls() -> list[str]:
     root_link = 'https://jav524.app'
-    client = get_client(Url(root_link))
-    resp = await client.get(root_link)
-    tree = html.fromstring(resp.text)
+    s = get_session(Url(root_link))
+    resp = await s.get(root_link)
+    tree = html.fromstring(await resp.text())
     js_links = tree.xpath("//script[@src]/@src")
     for link in js_links:
         if '/js/index' in link:
            link = root_link + link
-           resp = await client.get(link)
-           text = resp.text
+           resp = await s.get(link)
+           text = await resp.text()
            match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A)
            if match:
                return [match.group(1)]
27 changes: 16 additions & 11 deletions javsp/crawlers/sites/airav.py
@@ -1,9 +1,10 @@
 """Fetch data from airav"""
 import re
 from html import unescape
+from typing import Dict

 from javsp.crawlers.exceptions import MovieNotFoundError
-from javsp.network.client import get_client
+from javsp.network.client import get_session
 from javsp.network.utils import resolve_site_fallback
 from javsp.config import Cfg, CrawlerID
 from javsp.datatype import MovieInfo
@@ -13,13 +14,15 @@
 class AiravCrawler(Crawler):
     id = CrawlerID.airav

+    headers: Dict[str, str]
+
     @classmethod
     async def create(cls):
         self = cls()
         url = await resolve_site_fallback(self.id, 'https://www.airav.wiki')
         self.base_url = str(url)
-        self.client = get_client(url)
-        self.client.headers['Accept-Language'] = 'zh-TW,zh;q=0.9'
+        self.client = get_session(url)
+        self.headers = {'Accept-Language': 'zh-TW,zh;q=0.9'}
         return self

     async def search_movie(self, dvdid: str):
@@ -30,8 +33,8 @@ async def search_movie(self, dvdid: str):
         result = []
         while len(result) < count:
             url = f'{self.base_url}/api/video/list?lang=zh-TW&lng=zh-TW&search={dvdid}&page={page}'
-            response = await self.client.get(url)
-            resp = response.json()
+            response = await self.client.get(url, headers=self.headers)
+            resp = await response.json()
             # {"offset": 2460, "count": 12345, "result": [...], "status": "ok"}
             if resp['result']:
                 result.extend(resp['result'])
@@ -59,15 +62,15 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:
         """Parse the movie data for the given ID"""
         # airav also serves Simplified Chinese, but fetch the Traditional Chinese data to keep actress names etc. consistent with other sites
         url = f'{self.base_url}/api/video/barcode/{movie.dvdid}?lng=zh-TW'
-        response = await self.client.get(url)
-        resp_json = response.json()
+        response = await self.client.get(url, headers=self.headers)
+        resp_json = await response.json()
         # Only fall back to searching when the ID is purely numeric; otherwise the wrong movie may be matched
         if resp_json['count'] == 0 and re.match(r'\d{6}[-_]\d{2,3}', movie.dvdid):
             barcode = await self.search_movie(movie.dvdid)
             if barcode:
                 url = f'{self.base_url}/api/video/barcode/{barcode}?lng=zh-TW'
-                response = await self.client.get(url)
-                resp_json = response.json()
+                response = await self.client.get(url, headers=self.headers)
+                resp_json = await response.json()

         if resp_json['count'] == 0:
             raise MovieNotFoundError(__name__, movie.dvdid, resp_json)
@@ -93,8 +96,8 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:
         if Cfg().crawler.hardworking:
             # Note: this uses the dvdid fetched above, not the movie.dvdid passed in (e.g. '1pondo_012717_472' vs. '012717_472')
             video_url = f"{self.base_url}/api/video/getVideoMedia?barcode={dvdid}&vid={data['vid']}"
-            response = await self.client.get(video_url)
-            resp = response.json()
+            response = await self.client.get(video_url, headers=self.headers)
+            resp = await response.json()
             # On failure the result looks like {'msg': 'fail', 'status': 'fail'}
             if 'data' in resp:
                 # Besides url there are also url_cdn, url_hlx and url_hls_cdn fields; the latter two are m3u8. Currently url is used as the preview video address
@@ -113,12 +116,14 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:
             break

 if __name__ == "__main__":
+    from javsp.network.client import clear_clients

     async def test_main():
         crawler = await AiravCrawler.create()
         movie = MovieInfo("DSAD-938")
         await crawler.crawl_and_fill(movie)
         print(movie)
+        await clear_clients()

     import asyncio
     asyncio.run(test_main())
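Besides awaiting response.json(), the airav change replaces httpx's mutable client-wide headers (client.headers['Accept-Language'] = ...) with an explicit dict passed on every request. aiohttp would also accept the same headers as session defaults; a short sketch of both options (function name and URL illustrative):

import asyncio
import aiohttp

async def language_header_demo() -> None:
    headers = {'Accept-Language': 'zh-TW,zh;q=0.9'}
    # What the commit does: keep the dict on the crawler and pass it per request.
    async with aiohttp.ClientSession() as session:
        resp = await session.get('https://example.com', headers=headers)
        print(resp.status)
    # Alternative: bake the same headers into the session as defaults.
    async with aiohttp.ClientSession(headers=headers) as session:
        resp = await session.get('https://example.com')
        print(resp.status)

asyncio.run(language_header_demo())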
12 changes: 7 additions & 5 deletions javsp/crawlers/sites/arzon.py
@@ -2,7 +2,7 @@
 import re

 from javsp.network.utils import resolve_site_fallback
-from javsp.network.client import get_client
+from javsp.network.client import get_session
 from javsp.crawlers.interface import Crawler
 from javsp.config import CrawlerID
 from javsp.crawlers.exceptions import *
@@ -17,7 +17,7 @@ async def create(cls):
         self = cls()
         url = await resolve_site_fallback(self.id, "https://www.arzon.jp")
         self.base_url = str(url)
-        self.client = get_client(url)
+        self.client = get_session(url)
         # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
         skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1"
         await self.client.get(skip_verify_url)
@@ -30,18 +30,18 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:
         # url = f'{base_url}/imagelist.html?q={full_id}'

         r = await self.client.get(url)
-        if r.status_code == 404:
+        if r.status == 404:
             raise MovieNotFoundError(__name__, movie.dvdid)
         # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
-        data = html.fromstring(r.content)
+        data = html.fromstring(await r.read())

         urls = data.xpath("//h2/a/@href")
         if len(urls) == 0:
             raise MovieNotFoundError(__name__, movie.dvdid)

         item_url = self.base_url + urls[0]
         e = await self.client.get(item_url)
-        item = html.fromstring(e.content)
+        item = html.fromstring(await e.read())

         title = item.xpath("//div[@class='detail_title_new2']//h1/text()")[0]
         cover = item.xpath("//td[@align='center']//a/img/@src")[0]
@@ -91,13 +91,15 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:
     movie.preview_pics = preview_pics

 if __name__ == "__main__":
+    from javsp.network.client import clear_clients

     async def test_main():
         crawler = await ArzonCrawler.create()
         movie = MovieInfo("CSCT-011")
         try:
             await crawler.crawl_and_fill(movie)
             print(movie)
+            await clear_clients()
         except Exception as e:
             print(repr(e))
10 changes: 5 additions & 5 deletions javsp/crawlers/sites/arzon_iv.py
@@ -3,7 +3,7 @@


 from javsp.network.utils import resolve_site_fallback
-from javsp.network.client import get_client
+from javsp.network.client import get_session
 from javsp.crawlers.interface import Crawler
 from javsp.config import CrawlerID
 from javsp.crawlers.exceptions import *
@@ -18,7 +18,7 @@ async def create(cls):
         self = cls()
         url = await resolve_site_fallback(self.id, "https://www.arzon.jp")
         self.base_url = str(url)
-        self.client = get_client(url)
+        self.client = get_session(url)
         # https://www.arzon.jp/index.php?action=adult_customer_agecheck&agecheck=1&redirect=https%3A%2F%2Fwww.arzon.jp%2F
         skip_verify_url = f"{self.base_url}/index.php?action=adult_customer_agecheck&agecheck=1"
         await self.client.get(skip_verify_url)
@@ -31,18 +31,18 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:
         # url = f'{base_url}/imagelist.html?q={full_id}'

         r = await self.client.get(url)
-        if r.status_code == 404:
+        if r.status == 404:
             raise MovieNotFoundError(__name__, movie.dvdid)
         # https://stackoverflow.com/questions/15830421/xml-unicode-strings-with-encoding-declaration-are-not-supported
-        data = html.fromstring(r.content)
+        data = html.fromstring(await r.read())

         urls = data.xpath("//h2/a/@href")
         if len(urls) == 0:
             raise MovieNotFoundError(__name__, movie.dvdid)

         item_url = self.base_url + urls[0]
         e = await self.client.get(item_url)
-        item = html.fromstring(e.content)
+        item = html.fromstring(await e.read())

         title = item.xpath("//div[@class='detail_title_new']//h1/text()")[0]
         cover = item.xpath("//td[@align='center']//a/img/@src")[0]
10 changes: 4 additions & 6 deletions javsp/crawlers/sites/avsox.py
@@ -3,7 +3,7 @@
 from javsp.crawlers.exceptions import MovieNotFoundError
 from javsp.datatype import MovieInfo
 from javsp.network.utils import resolve_site_fallback
-from javsp.network.client import get_client
+from javsp.network.client import get_session
 from javsp.crawlers.interface import Crawler
 from javsp.config import CrawlerID
 from lxml import html
@@ -16,15 +16,15 @@ async def create(cls):
         self = cls()
         url = await resolve_site_fallback(self.id, "https://avsox.click/")
         self.base_url = str(url)
-        self.client = get_client(url)
+        self.client = get_session(url)
         return self

     async def crawl_and_fill(self, movie: MovieInfo) -> None:
         full_id: str = movie.dvdid
         if full_id.startswith('FC2-'):
             full_id = full_id.replace('FC2-', 'FC2-PPV-')
         resp = await self.client.get(f'{self.base_url}tw/search/{full_id}')
-        tree = html.fromstring(resp.text)
+        tree = html.fromstring(await resp.text())
         tree.make_links_absolute(str(resp.url), resolve_base_href=True)
         ids = tree.xpath("//div[@class='photo-info']/span/date[1]/text()")
         urls = tree.xpath("//a[contains(@class, 'movie-box')]/@href")
@@ -37,9 +37,7 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:

         # Extract the movie info
         resp = await self.client.get(url)
-        # with open('file.html', 'wb') as f:
-        #     f.write(resp.content)
-        tree = html.fromstring(resp.text)
+        tree = html.fromstring(await resp.text())
         container = tree.xpath("/html/body/div[@class='container']")[0]
         title = container.xpath("h3/text()")[0]
         cover = container.xpath("//a[@class='bigImage']/@href")[0]
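One detail that carries over unchanged: tree.make_links_absolute(str(resp.url), ...) keeps its str() call because aiohttp's resp.url is a yarl.URL object, not the str that lxml expects. A one-line check of that assumption (URL illustrative):

import asyncio
import aiohttp

async def url_type_demo() -> None:
    async with aiohttp.ClientSession() as session:
        resp = await session.get('https://example.com')
        print(type(resp.url))  # <class 'yarl.URL'>, so lxml needs str(resp.url)

asyncio.run(url_type_demo())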
8 changes: 4 additions & 4 deletions javsp/crawlers/sites/avwiki.py
@@ -4,7 +4,7 @@
 from javsp.datatype import MovieInfo
 from javsp.crawlers.interface import Crawler
 from javsp.network.utils import resolve_site_fallback
-from javsp.network.client import get_client
+from javsp.network.client import get_session
 from javsp.config import CrawlerID
 from lxml import html

@@ -16,7 +16,7 @@ async def create(cls):
         self = cls()
         url = await resolve_site_fallback(self.id, 'https://av-wiki.net')
         self.base_url = str(url)
-        self.client = get_client(url)
+        self.client = get_session(url)
         return self

     async def crawl_and_fill(self, movie: MovieInfo) -> None:
@@ -27,9 +27,9 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:
         movie.url = url = f'{self.base_url}/{movie.dvdid}'

         resp = await self.client.get(url)
-        if resp.status_code == 404:
+        if resp.status == 404:
             raise MovieNotFoundError(__name__, movie.dvdid)
-        tree = html.fromstring(resp.content)
+        tree = html.fromstring(await resp.text())

         cover_tag = tree.xpath("//header/div/a[@class='image-link-border']/img")
         if cover_tag:
8 changes: 4 additions & 4 deletions javsp/crawlers/sites/dl_getchu.py
@@ -5,7 +5,7 @@
 from javsp.config import CrawlerID
 from javsp.crawlers.exceptions import MovieNotFoundError
 from javsp.crawlers.interface import Crawler
-from javsp.network.client import get_client
+from javsp.network.client import get_session
 from javsp.network.utils import resolve_site_fallback
 from javsp.crawlers.exceptions import *
 from javsp.datatype import MovieInfo
@@ -55,7 +55,7 @@ async def create(cls):
         self = cls()
         url = await resolve_site_fallback(self.id, 'https://dl.getchu.com')
         self.base_url = str(url)
-        self.client = get_client(url)
+        self.client = get_session(url)
         return self

     async def crawl_and_fill(self, movie: MovieInfo) -> None:
@@ -68,9 +68,9 @@ async def crawl_and_fill(self, movie: MovieInfo) -> None:
         # Fetch the page
         url = f'{self.base_url}/i/item{getchu_id}'
         r = await self.client.get(url)
-        if r.status_code == 404:
+        if r.status == 404:
             raise MovieNotFoundError(__name__, movie.dvdid)
-        tree = html.fromstring(r.text)
+        tree = html.fromstring((await r.read()).decode(encoding='euc_jp', errors='ignore'))
         container = tree.xpath("//form[@action='https://dl.getchu.com/cart/']/div/table[3]")
         if len(container) > 0:
             container = container[0]
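dl.getchu is the one place the refactor avoids await r.text(): the site serves EUC-JP, and aiohttp decodes text() from the Content-Type charset (falling back to UTF-8 in recent versions) rather than sniffing the page like httpx's r.text did, so the commit reads raw bytes and decodes them by hand. Pinning the encoding in text() would be equivalent; a sketch assuming that behaviour of the site:

import asyncio
import aiohttp

async def eucjp_demo() -> None:
    async with aiohttp.ClientSession() as session:
        resp = await session.get('https://dl.getchu.com')  # assumption: page is EUC-JP
        # What the commit does: decode the raw bytes by hand.
        manual = (await resp.read()).decode('euc_jp', errors='ignore')
        # Equivalent alternative: pin the encoding for aiohttp's decoder.
        pinned = await resp.text(encoding='euc_jp', errors='ignore')
        print(manual == pinned)

asyncio.run(eucjp_demo())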