From c652b606d514702ca3e1c6ac260b2d7784019362 Mon Sep 17 00:00:00 2001 From: masklinn Date: Mon, 15 Jul 2024 21:20:24 +0200 Subject: [PATCH] regex-based POC Uses ua-parser/uap-rust#3 --- pyproject.toml | 7 +++- src/ua_parser/__main__.py | 17 +++++---- src/ua_parser/regex.py | 76 +++++++++++++++++++++++++++++++++++++++ tests/test_core.py | 13 +++++++ tox.ini | 1 + 5 files changed, 107 insertions(+), 7 deletions(-) create mode 100644 src/ua_parser/regex.py diff --git a/pyproject.toml b/pyproject.toml index 920fcd0..7faae00 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ version = "1.0.0a1" readme = "README.rst" requires-python = ">=3.8" dependencies = [] -optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] } license = {text = "Apache 2.0"} urls = {repository = "https://github.com/ua-parser/uap-python"} @@ -39,10 +38,16 @@ classifiers = [ "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy" ] +[project.optional-dependencies] +yaml = ["PyYaml"] +re2 = ["google-re2"] +regex = ["ua-parser-rs"] + [tool.ruff.lint] select = ["F", "E", "W", "I", "RET", "RUF", "PT"] ignore = [ diff --git a/src/ua_parser/__main__.py b/src/ua_parser/__main__.py index d4ff29b..c461a28 100644 --- a/src/ua_parser/__main__.py +++ b/src/ua_parser/__main__.py @@ -39,11 +39,13 @@ from .caching import Cache, Local from .loaders import load_builtins, load_yaml from .re2 import Resolver as Re2Resolver +from .regex import Resolver as RegexResolver from .user_agent_parser import Parse CACHEABLE = { "basic": True, "re2": True, + "regex": True, "legacy": False, } @@ -178,6 +180,8 @@ def get_parser( r = BasicResolver(rules) elif parser == "re2": r = Re2Resolver(rules) + elif parser == "regex": + r = RegexResolver(rules) else: sys.exit(f"unknown parser {parser!r}") @@ -327,6 +331,7 @@ def run_threaded(args: argparse.Namespace) -> None: ("locking-lru", CachingResolver(basic, caching.Lru(CACHESIZE))), ("local-lru", CachingResolver(basic, Local(lambda: caching.Lru(CACHESIZE)))), ("re2", Re2Resolver(load_builtins())), + ("regex", RegexResolver(load_builtins())), ] for name, resolver in resolvers: print(f"{name:11}: ", end="", flush=True) @@ -436,14 +441,14 @@ def __call__( bench.add_argument( "--bases", nargs="+", - choices=["basic", "re2", "legacy"], - default=["basic", "re2", "legacy"], + choices=["basic", "re2", "regex", "legacy"], + default=["basic", "re2", "regex", "legacy"], help="""Base resolvers to benchmark. `basic` is a linear search through the regexes file, `re2` is a prefiltered regex set - implemented in C++, `legacy` is the legacy API (essentially a - basic resolver with a clearing cache of fixed 200 entries, but - less layered so usually slightly faster than an equivalent - basic-based resolver).""", + implemented in C++, `regex` is a prefiltered regex set implemented + in Rust, `legacy` is the legacy API (essentially a basic resolver + with a clearing cache of fixed 200 entries, but less layered so + usually slightly faster than an equivalent basic-based resolver).""", ) bench.add_argument( "--caches", diff --git a/src/ua_parser/regex.py b/src/ua_parser/regex.py new file mode 100644 index 0000000..b67a35f --- /dev/null +++ b/src/ua_parser/regex.py @@ -0,0 +1,76 @@ +__all__ = ["Resolver"] + +from operator import attrgetter + +import ua_parser_rs # type: ignore + +from .core import ( + Device, + Domain, + Matchers, + OS, + PartialResult, + UserAgent, +) + + +class Resolver: + ua: ua_parser_rs.UserAgentExtractor + os: ua_parser_rs.OSExtractor + de: ua_parser_rs.DeviceExtractor + + def __init__(self, matchers: Matchers) -> None: + ua, os, de = matchers + self.ua = ua_parser_rs.UserAgentExtractor( + map( + attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"), + ua, + ) + ) + self.os = ua_parser_rs.OSExtractor( + map( + attrgetter("regex", "family", "major", "minor", "patch", "patch_minor"), + os, + ) + ) + self.de = ua_parser_rs.DeviceExtractor( + map( + attrgetter("regex", "regex_flag", "family", "brand", "model"), + de, + ) + ) + + def __call__(self, ua: str, domains: Domain, /) -> PartialResult: + user_agent = os = device = None + if Domain.USER_AGENT in domains: + if m := self.ua.extract(ua): + user_agent = UserAgent( + m.family, + m.major, + m.minor, + m.patch, + m.patch_minor, + ) + if Domain.OS in domains: + if m := self.os.extract(ua): + os = OS( + m.os, + m.major, + m.minor, + m.patch, + m.patch_minor, + ) + if Domain.DEVICE in domains: + if m := self.de.extract(ua): + device = Device( + m.family, + m.brand, + m.model, + ) + return PartialResult( + domains=domains, + string=ua, + user_agent=user_agent, + os=os, + device=device, + ) diff --git a/tests/test_core.py b/tests/test_core.py index 4c80126..2666ed3 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -53,6 +53,19 @@ else: PARSERS.append(pytest.param(Parser(re2.Resolver(load_builtins())), id="re2")) +try: + from ua_parser import regex +except ImportError: + PARSERS.append( + pytest.param( + None, + id="regex", + marks=pytest.mark.skip(reason="regex parser not available"), + ) + ) +else: + PARSERS.append(pytest.param(Parser(regex.Resolver(load_builtins())), id="regex")) + UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)} diff --git a/tox.ini b/tox.ini index bb4af08..4301c19 100644 --- a/tox.ini +++ b/tox.ini @@ -20,6 +20,7 @@ deps = pytest pyyaml google-re2 + ua-parser-rs commands = pytest -Werror --doctest-glob="*.rst" {posargs}