
Commit

fix asyncio proxyfree test, use uvloop for main loop
glyh committed Sep 28, 2024
1 parent f72a375 commit 264c7b7
Showing 7 changed files with 118 additions and 51 deletions.
9 changes: 4 additions & 5 deletions javsp/__main__.py
@@ -2,17 +2,16 @@
import re
import sys
import json
import asyncio
import time
import asyncio
import logging
import uvloop
from PIL import Image
from lxml.etree import Comment
from pydantic import ValidationError
from pydantic_core import Url
from pydantic_extra_types.pendulum_dt import Duration
import threading
from typing import Any, Coroutine, Dict, List
from javsp.crawlers.interface import Crawler
from javsp.crawlers.all import crawlers

sys.stdout.reconfigure(encoding='utf-8')
@@ -102,7 +101,7 @@ async def wrapper(id: CrawlerID, movie: MovieInfo) -> None:
co_pool.append(wrapper(crawler_id, info))

# Wait for all coroutines to finish
asyncio.gather(*co_pool)
await asyncio.gather(*co_pool)

# Update the movie type determination based on the crawl results
if movie.data_src == 'cid' and movie.dvdid:
@@ -555,7 +554,7 @@ async def aentry():
sys.exit(0)

def entry():
asyncio.run(aentry(), debug=True)
uvloop.run(aentry(), debug=True)

if __name__ == "__main__":
entry()
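The change from asyncio.gather(*co_pool) to await asyncio.gather(*co_pool) above fixes a genuine bug: calling gather() only builds the aggregate future, and if it is never awaited the crawler coroutines are not guaranteed to have finished before the surrounding code moves on. A minimal standalone illustration of why the await matters (not project code; the fetch coroutine is made up):

import asyncio

async def fetch(name: str) -> str:
    await asyncio.sleep(0.1)  # stand-in for a network request
    return f"{name}: done"

async def main() -> None:
    tasks = [fetch(f"crawler-{i}") for i in range(3)]
    # Awaiting gather() is what actually waits for every coroutine to finish
    # and collects the results in order.
    results = await asyncio.gather(*tasks)
    print(results)

asyncio.run(main())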
28 changes: 12 additions & 16 deletions javsp/crawlers/proxyfree.py
@@ -1,19 +1,18 @@
"""Get the proxy-free URLs of the various sites"""
from collections.abc import Callable, Coroutine
import re
import sys
from typing import Any, Dict

from pydantic_core import Url
from pydantic_extra_types.pendulum_dt import Duration
from lxml import html

from javsp.config import CrawlerID
from javsp.network.utils import test_connect
from javsp.network.utils import test_connect, choose_one_connectable
from javsp.network.client import get_client


async def _get_avsox_urls() -> list:
async def _get_avsox_urls() -> list[str]:
link = 'https://tellme.pw/avsox'
client = get_client(Url(link))
resp = await client.get(link)
@@ -22,7 +21,7 @@ async def _get_avsox_urls() -> list:
return urls


async def _get_javbus_urls() -> list:
async def _get_javbus_urls() -> list[str]:
link = 'https://www.javbus.one/'
client = get_client(Url(link))
resp = await client.get(link)
@@ -31,7 +30,7 @@ async def _get_javbus_urls() -> list:
return urls


async def _get_javlib_urls() -> list:
async def _get_javlib_urls() -> list[str]:
link = 'https://github.com/javlibcom'
client = get_client(Url(link))
resp = await client.get(link)
@@ -41,9 +40,10 @@ async def _get_javlib_urls() -> list:
if match:
domain = f'https://www.{match.group(0)}.com'
return [domain]
return []


async def _get_javdb_urls() -> list:
async def _get_javdb_urls() -> list[str]:
root_link = 'https://jav524.app'
client = get_client(Url(root_link))
resp = await client.get(root_link)
@@ -57,6 +57,7 @@ async def _get_javdb_urls() -> list:
match = re.search(r'\$officialUrl\s*=\s*"(https://(?:[\d\w][-\d\w]{1,61}[\d\w]\.){1,2}[a-z]{2,})"', text, flags=re.I | re.A)
if match:
return [match.group(1)]
return []

proxy_free_fns: Dict[CrawlerID, Callable[[], Coroutine[Any, Any, list[str]]]]= {
CrawlerID.avsox: _get_avsox_urls,
@@ -65,29 +66,24 @@ async def _get_javdb_urls() -> list:
CrawlerID.javlib: _get_javlib_urls,
}

def _choose_one(urls: list[str]) -> str:
for url in urls:
if test_connect(url, Duration(seconds=5)):
return url
return ''

async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None=None) -> str:
async def get_proxy_free_url(site_name: CrawlerID, prefer_url: str | None = None) -> str | None:
"""Get the proxy-free URL of the given site
Args:
site_name (str): site name
prefer_url (str, optional): test the availability of this url first
Returns:
str | None: the proxy-free URL of the given site (None on failure)
"""
if prefer_url and test_connect(prefer_url, Duration(seconds=5)):
if prefer_url and await test_connect(prefer_url, Duration(seconds=5)):
return prefer_url

if site_name in proxy_free_fns:
try:
urls = await proxy_free_fns[site_name]()
return _choose_one(urls)
print(f"I got {urls}")
return await choose_one_connectable(urls)
except:
return ''
return None
else:
raise Exception("Don't know how to get proxy-free url for " + site_name)

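After this refactor get_proxy_free_url is a coroutine and returns None rather than an empty string when no mirror responds, so callers need to await it. A rough usage sketch, assuming the javsp package is importable in the current environment:

import asyncio
from javsp.config import CrawlerID
from javsp.crawlers.proxyfree import get_proxy_free_url

async def main() -> None:
    # Ask for a proxy-free javlib mirror; None means nothing was reachable.
    url = await get_proxy_free_url(CrawlerID.javlib)
    print(url if url is not None else "no proxy-free mirror reachable")

asyncio.run(main())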
6 changes: 4 additions & 2 deletions javsp/network/client.py
@@ -7,7 +7,9 @@

from javsp.config import Cfg

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
default_headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}

def get_proxy(unproxied: bool):
if Cfg().network.proxy_server is None or unproxied:
@@ -33,7 +35,7 @@ def get_client(url: Url) -> AsyncClient:
client = AsyncClient(
transport=transport,
# copy() is required; otherwise every module's modifications to headers would point to the headers variable defined in this module, and only the last modification would take effect
headers=headers.copy(),
headers=default_headers.copy(),
timeout=Cfg().network.timeout.total_seconds(),
follow_redirects=True,
)
47 changes: 30 additions & 17 deletions javsp/network/utils.py
@@ -12,6 +12,8 @@
from javsp.config import Cfg, CrawlerID
from javsp.network.client import get_client

import asyncio

class DownloadInfo(NamedTuple):
size: ByteSize
elapsed: timedelta
@@ -53,18 +55,10 @@ async def url_download(url: Url, target_path: str, desc: str | None = None) -> D

return DownloadInfo(ByteSize(response.num_bytes_downloaded), response.elapsed)

# def resp2html(resp: Response) -> lxml.html.HtmlElement:
#
# """将request返回的response转换为经lxml解析后的document"""
#
# html = lxml.html.fromstring(resp.text)
# html.make_links_absolute(str(resp.url), resolve_base_href=True)
# return html
#
async def test_connect(url_str: str, timeout: Duration) -> bool:
"""Test the connection to the given url; URL mapping is not applied, but the proxy is used"""
try:

print(f"Attempting to connect {url_str}")
client = get_client(Url(url_str))
response = \
await client.get(
@@ -76,16 +70,35 @@ async def test_connect(url_str: str, timeout: Duration) -> bool:
except:
return False

async def choose_one_connectable(urls: list[str]) -> str | None:
print(urls)
co_connectables: list[Coroutine[Any, Any, bool]] = []
for url in urls:
co_connectables.append(test_connect(url, Duration(seconds=5)))

connectables = await asyncio.gather(*co_connectables)
for i, connectable in enumerate(connectables):
if connectable:
return urls[i]
return None

async def resolve_site_fallback(cr_id: CrawlerID, default: str) -> Url:
if cr_id not in Cfg().network.fallback:
return Url(default)

tasks: list[tuple[str, Coroutine[Any, Any, bool]]] = []
for fallback in Cfg().network.fallback[cr_id]:
tasks.append((fallback, test_connect(fallback, Duration(seconds=3))))

for (fallback, task) in tasks:
if await task:
return Url(fallback)
fallbacks = Cfg().network.fallback[cr_id]
chosen = await choose_one_connectable(fallbacks)
if chosen is None:
return Url(default)
else:
return Url(chosen)


if __name__ == '__main__':
# async def aentry():
# print(await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com']))

async def aentry():
print(await test_connect("https://www.y78k.com/", timeout=3))

return Url(default)
asyncio.run(aentry())
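choose_one_connectable fires a test_connect probe for every candidate concurrently through asyncio.gather and returns the first candidate in list order whose probe succeeded, which is what resolve_site_fallback now delegates to instead of awaiting the probes one by one. A short standalone sketch mirroring the commented-out example in the diff (the dead hostname is deliberate):

import asyncio
from javsp.network.utils import choose_one_connectable

async def main() -> None:
    # Both URLs are probed at the same time; the first reachable one is returned.
    chosen = await choose_one_connectable(['http://iandown.what', 'http://www.baidu.com'])
    print(chosen)  # 'http://www.baidu.com' when online, None if nothing answers

asyncio.run(main())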
51 changes: 50 additions & 1 deletion poetry.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -29,6 +29,7 @@ pendulum = "^3.0.0"
slimeface = "^2024.9.27"
httpx = {extras = ["socks"], version = "^0.27.2"}
aiofiles = "^24.1.0"
uvloop = "^0.20.0"

[tool.poetry.scripts]
javsp = "javsp.__main__:entry"
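The new uvloop dependency backs the uvloop.run(...) calls introduced above: uvloop ships a run() helper that mirrors asyncio.run() (including the debug flag) but drives the coroutine on its libuv-based event loop. A minimal sketch of the switch, independent of this repository:

import asyncio
import uvloop

async def main() -> None:
    await asyncio.sleep(0.1)
    print("running on uvloop")

# Same call shape as asyncio.run(main(), debug=True), but on uvloop's loop.
uvloop.run(main(), debug=True)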
27 changes: 17 additions & 10 deletions unittest/test_proxyfree.py
@@ -1,18 +1,25 @@
import os
import sys

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from javsp.web.proxyfree import *
import uvloop
import tracemalloc

from javsp.crawlers.proxyfree import get_proxy_free_url
from javsp.config import CrawlerID

def test_get_url():
assert get_proxy_free_url('javlib') != ''
assert get_proxy_free_url('javdb') != ''
async def wrap():
assert await get_proxy_free_url(CrawlerID.javlib) != None
assert await get_proxy_free_url(CrawlerID.javdb) != None
uvloop.run(wrap())


def test_get_url_with_prefer():
prefer_url = 'https://www.baidu.com'
assert prefer_url == get_proxy_free_url('javlib', prefer_url)
async def wrap():
prefer_url = 'https://www.baidu.com'
assert prefer_url == await get_proxy_free_url(CrawlerID.javlib, prefer_url)
uvloop.run(wrap())

if __name__ == "__main__":
print(get_proxy_free_url('javlib'))
async def aentry():
print(await get_proxy_free_url(CrawlerID.javlib))

tracemalloc.start()
uvloop.run(aentry(), debug=True)
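The rewritten tests keep synchronous signatures so plain pytest can collect them without an asyncio plugin; each test just drives its async body through uvloop.run. The same wrapper pattern in isolation, with a made-up coroutine standing in for the real checks:

import uvloop

async def _probe() -> bool:
    # Placeholder for an awaitable check such as a network probe.
    return True

def test_probe():
    async def wrap():
        assert await _probe()
    uvloop.run(wrap())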
