From 727a88c0f0e34ead2b56a3c44d948b24cf159ca5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Sat, 12 Oct 2024 02:20:37 +0100 Subject: [PATCH 01/26] Apply dist_thresh to Genius and Google backends This commit introduces a distance threshold mechanism for the Genius and Google backends. - Create a new `SearchBackend` base class with a method `check_match` that performs checking. - Start using undocumented `dist_thresh` configuration option for good, and mention it in the docs. This controls the maximum allowable distance for matching artist and title names. These changes aim to improve the accuracy of lyrics matching, especially when there are slight variations in artist or title names, see #4791. --- beetsplug/lyrics.py | 119 +++++++++++++++++++++--------------- docs/changelog.rst | 8 +++ docs/plugins/lyrics.rst | 6 ++ test/plugins/test_lyrics.py | 40 +++++++++++- 4 files changed, 122 insertions(+), 51 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 70edaa0011..e779b14d6e 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -16,10 +16,10 @@ from __future__ import annotations -import difflib import errno import itertools import json +import math import os.path import re import struct @@ -30,7 +30,7 @@ from functools import cached_property, partial, total_ordering from http import HTTPStatus from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator -from urllib.parse import quote, urlencode +from urllib.parse import quote, urlencode, urlparse import requests from typing_extensions import TypedDict @@ -38,6 +38,7 @@ import beets from beets import plugins, ui +from beets.autotag.hooks import string_dist if TYPE_CHECKING: from beets.importer import ImportTask @@ -485,15 +486,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None: return lyrics -class Genius(Backend): +class SearchBackend(Backend): + REQUIRES_BS = True + + @cached_property + def dist_thresh(self) -> float: + return self.config["dist_thresh"].get(float) + + def check_match( + self, target_artist: str, target_title: str, artist: str, title: str + ) -> bool: + """Check if the given artist and title are 'good enough' match.""" + max_dist = max( + string_dist(target_artist, artist), + string_dist(target_title, title), + ) + + if (max_dist := round(max_dist, 2)) <= self.dist_thresh: + return True + + if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4): + # log out the candidate that did not make it but was close. + # This may show a matching candidate with some noise in the name + self._log.debug( + "({}, {}) does not match ({}, {}) but dist was close: {:.2f}", + artist, + title, + target_artist, + target_title, + max_dist, + ) + + return False + + +class Genius(SearchBackend): """Fetch lyrics from Genius via genius-api. Simply adapted from bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/ """ - REQUIRES_BS = True - base_url = "https://api.genius.com" def __init__(self, config, log): @@ -516,19 +549,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None: self._log.debug("Genius API request returned invalid JSON") return None - # find a matching artist in the json + check = partial(self.check_match, artist, title) for hit in json["response"]["hits"]: - hit_artist = hit["result"]["primary_artist"]["name"] - - if slug(hit_artist) == slug(artist): - html = self.fetch_url(hit["result"]["url"]) + result = hit["result"] + if check(result["primary_artist"]["name"], result["title"]): + html = self.fetch_url(result["url"]) if not html: return None return self._scrape_lyrics_from_html(html) - self._log.debug( - "Genius failed to find a matching artist for '{0}'", artist - ) return None def _search(self, artist, title): @@ -724,10 +753,9 @@ def is_text_notcode(text): return None -class Google(Backend): +class Google(SearchBackend): """Fetch lyrics from Google search results.""" - REQUIRES_BS = True SEARCH_URL = "https://www.googleapis.com/customsearch/v1" def is_lyrics(self, text, artist=None): @@ -775,21 +803,20 @@ def slugify(self, text): BY_TRANS = ["by", "par", "de", "von"] LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"] - def is_page_candidate(self, url_link, url_title, title, artist): + def is_page_candidate( + self, artist: str, title: str, url_link: str, url_title: str + ) -> bool: """Return True if the URL title makes it a good candidate to be a page that contains lyrics of title by artist. """ - title = self.slugify(title.lower()) - artist = self.slugify(artist.lower()) - sitename = re.search( - "//([^/]+)/.*", self.slugify(url_link.lower()) - ).group(1) - url_title = self.slugify(url_title.lower()) - - # Check if URL title contains song title (exact match) - if url_title.find(title) != -1: + title_slug = self.slugify(title.lower()) + url_title_slug = self.slugify(url_title.lower()) + if title_slug in url_title_slug: return True + artist = self.slugify(artist.lower()) + sitename = urlparse(url_link).netloc + # or try extracting song title from URL title and check if # they are close enough tokens = ( @@ -798,12 +825,9 @@ def is_page_candidate(self, url_link, url_title, title, artist): + self.LYRICS_TRANS ) tokens = [re.escape(t) for t in tokens] - song_title = re.sub("(%s)" % "|".join(tokens), "", url_title) + song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug) - song_title = song_title.strip("_|") - typo_ratio = 0.9 - ratio = difflib.SequenceMatcher(None, song_title, title).ratio() - return ratio >= typo_ratio + return self.check_match(artist, title_slug, artist, song_title) def fetch(self, artist: str, title: str, *_) -> str | None: params = { @@ -825,24 +849,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None: self._log.debug("google backend error: {0}", reason) return None - if "items" in data.keys(): - for item in data["items"]: - url_link = item["link"] - url_title = item.get("title", "") - if not self.is_page_candidate( - url_link, url_title, title, artist - ): - continue - html = self.fetch_url(url_link) - if not html: - continue - lyrics = scrape_lyrics_from_html(html) - if not lyrics: - continue - - if self.is_lyrics(lyrics, artist): - self._log.debug("got lyrics from {0}", item["displayLink"]) - return lyrics + check_candidate = partial(self.is_page_candidate, artist, title) + for item in data.get("items", []): + url_link = item["link"] + if not check_candidate(url_link, item.get("title", "")): + continue + html = self.fetch_url(url_link) + if not html: + continue + lyrics = scrape_lyrics_from_html(html) + if not lyrics: + continue + + if self.is_lyrics(lyrics, artist): + self._log.debug("got lyrics from {0}", item["displayLink"]) + return lyrics return None @@ -866,6 +887,7 @@ def __init__(self): "bing_client_secret": None, "bing_lang_from": [], "bing_lang_to": None, + "dist_thresh": 0.11, "google_API_key": None, "google_engine_ID": "009217259823014548361:lndtuqkycfu", "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W" @@ -877,7 +899,6 @@ def __init__(self): # Musixmatch is disabled by default as they are currently blocking # requests with the beets user agent. "sources": [s for s in self.SOURCES if s != "musixmatch"], - "dist_thresh": 0.1, } ) self.config["bing_client_secret"].redact = True diff --git a/docs/changelog.rst b/docs/changelog.rst index 6ce13a8144..2576eefec9 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -11,6 +11,11 @@ been dropped. New features: +* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to + control the maximum allowed distance between the lyrics search result and the + tagged item's artist and title. This is useful for preventing false positives + when fetching lyrics. + Bug fixes: * :doc:`plugins/lyrics`: LRCLib will fallback to plain lyrics if synced lyrics @@ -55,6 +60,9 @@ Bug fixes: ``lrclib`` over other sources since it returns reliable results quicker than others. :bug:`5102` +* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able + to match lyrics when there is a slight variation in the artist name. + :bug:`4791` For packagers: diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index d1f434d70f..d080b1f940 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -42,6 +42,12 @@ configuration file. The available options are: Default: ``[]`` - **bing_lang_to**: Language to translate lyrics into. Default: None. +- **dist_thresh**: The maximum distance between the artist and title + combination of the music file and lyrics candidate to consider them a match. + Lower values will make the plugin more strict, higher values will make it + more lenient. This does not apply to the ``lrclib`` backend as it matches + durations. + Default: ``0.11``. - **fallback**: By default, the file will be left unchanged when no lyrics are found. Use the empty string ``''`` to reset the lyrics in such a case. Default: None. diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 47c9837701..8123996985 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -161,6 +161,42 @@ def test_slug(self, text, expected): assert lyrics.slug(text) == expected +class TestSearchBackend: + @pytest.fixture + def backend(self, dist_thresh): + plugin = lyrics.LyricsPlugin() + plugin.config.set({"dist_thresh": dist_thresh}) + return lyrics.SearchBackend(plugin.config, plugin._log) + + @pytest.mark.parametrize( + "dist_thresh, target_artist, artist, should_match", + [ + (0.11, "Target Artist", "Target Artist", True), + (0.11, "Target Artist", "Target Artis", True), + (0.11, "Target Artist", "Target Arti", False), + (0.11, "Psychonaut", "Psychonaut (BEL)", True), + (0.11, "beets song", "beats song", True), + (0.10, "beets song", "beats song", False), + ( + 0.11, + "Lucid Dreams (Forget Me)", + "Lucid Dreams (Remix) ft. Lil Uzi Vert", + False, + ), + ( + 0.12, + "Lucid Dreams (Forget Me)", + "Lucid Dreams (Remix) ft. Lil Uzi Vert", + True, + ), + ], + ) + def test_check_match(self, backend, target_artist, artist, should_match): + assert ( + backend.check_match(target_artist, "", artist, "") == should_match + ) + + @pytest.fixture(scope="module") def lyrics_root_dir(pytestconfig: pytest.Config): return pytestconfig.rootpath / "test" / "rsrc" / "lyrics" @@ -275,10 +311,10 @@ def test_is_page_candidate( self, backend, lyrics_html, url_title, artist, should_be_candidate ): result = backend.is_page_candidate( + artist, + self.TITLE, "http://www.example.com/lyrics/beetssong", url_title, - self.TITLE, - artist, ) assert bool(result) == should_be_candidate From 9f0a6ffa598680e37d571d4d9f43814aefb220b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Tue, 27 Aug 2024 13:43:31 +0100 Subject: [PATCH 02/26] Make lyrics plugin documentation slightly more clear --- beetsplug/lyrics.py | 6 +- docs/plugins/lyrics.rst | 130 ++++++++++++++++++++-------------------- 2 files changed, 70 insertions(+), 66 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index e779b14d6e..48299bb80a 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -890,8 +890,10 @@ def __init__(self): "dist_thresh": 0.11, "google_API_key": None, "google_engine_ID": "009217259823014548361:lndtuqkycfu", - "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W" - "76V-uFL5jks5dNvcGCdarqFjDhP9c", + "genius_api_key": ( + "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W" + "76V-uFL5jks5dNvcGCdarqFjDhP9c" + ), "fallback": None, "force": False, "local": False, diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst index d080b1f940..f034cf47a1 100644 --- a/docs/plugins/lyrics.rst +++ b/docs/plugins/lyrics.rst @@ -2,25 +2,27 @@ Lyrics Plugin ============= The ``lyrics`` plugin fetches and stores song lyrics from databases on the Web. -Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_, `LRCLIB`_ -and, optionally, the Google custom search API. +Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_, +`LRCLIB`_ and, optionally, the Google Custom Search API. .. _Genius.com: https://genius.com/ .. _Tekstowo.pl: https://www.tekstowo.pl/ .. _LRCLIB: https://lrclib.net/ -Fetch Lyrics During Import --------------------------- +Install +------- -To automatically fetch lyrics for songs you import, first enable it in your -configuration (see :ref:`using-plugins`). Then, install ``beets`` with -``lyrics`` extra +Firstly, enable ``lyrics`` plugin in your configuration (see +:ref:`using-plugins`). Then, install ``beets`` with ``lyrics`` extra .. code-block:: bash pip install "beets[lyrics]" +Fetch Lyrics During Import +-------------------------- + When importing new files, beets will now fetch lyrics for files that don't already have them. The lyrics will be stored in the beets database. If the ``import.write`` config option is on, then the lyrics will also be written to @@ -29,52 +31,52 @@ the files' tags. Configuration ------------- -To configure the plugin, make a ``lyrics:`` section in your -configuration file. The available options are: +To configure the plugin, make a ``lyrics:`` section in your configuration file. +Default configuration: + +.. code-block:: yaml + + lyrics: + auto: yes + bing_client_secret: null + bing_lang_from: [] + bing_lang_to: null + dist_thresh: 0.11 + fallback: null + force: no + google_API_key: null + google_engine_ID: 009217259823014548361:lndtuqkycfu + sources: [lrclib, google, genius, tekstowo] + synced: no + +The available options are: - **auto**: Fetch lyrics automatically during import. - Default: ``yes``. - **bing_client_secret**: Your Bing Translation application password - (to :ref:`lyrics-translation`) + (see :ref:`lyrics-translation`) - **bing_lang_from**: By default all lyrics with a language other than ``bing_lang_to`` are translated. Use a list of lang codes to restrict the set of source languages to translate. - Default: ``[]`` - **bing_lang_to**: Language to translate lyrics into. - Default: None. - **dist_thresh**: The maximum distance between the artist and title combination of the music file and lyrics candidate to consider them a match. Lower values will make the plugin more strict, higher values will make it more lenient. This does not apply to the ``lrclib`` backend as it matches durations. - Default: ``0.11``. - **fallback**: By default, the file will be left unchanged when no lyrics are found. Use the empty string ``''`` to reset the lyrics in such a case. - Default: None. - **force**: By default, beets won't fetch lyrics if the files already have ones. To instead always fetch lyrics, set the ``force`` option to ``yes``. - Default: ``no``. - **google_API_key**: Your Google API key (to enable the Google Custom Search backend). - Default: None. - **google_engine_ID**: The custom search engine to use. Default: The `beets custom search engine`_, which gathers an updated list of sources known to be scrapeable. - **sources**: List of sources to search for lyrics. An asterisk ``*`` expands - to all available sources. - Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The - ``google`` source will be automatically deactivated if no ``google_API_key`` - is setup. - The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if - BeautifulSoup is installed. -- **synced**: Prefer synced lyrics over plain lyrics if a source offers them. Currently `lrclib` is the only source that provides them. Default: `no`. - -Here's an example of ``config.yaml``:: - - lyrics: - fallback: '' - google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab - google_engine_ID: 009217259823014548361:lndtuqkycfu + to all available sources. The ``google`` source will be automatically + deactivated if no ``google_API_key`` is setup. +- **synced**: Prefer synced lyrics over plain lyrics if a source offers them. + Currently ``lrclib`` is the only source that provides them. .. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu @@ -89,74 +91,74 @@ by that band, and ``beet lyrics`` will get lyrics for my entire library. The lyrics will be added to the beets database and, if ``import.write`` is on, embedded into files' metadata. -The ``-p`` option to the ``lyrics`` command makes it print lyrics out to the -console so you can view the fetched (or previously-stored) lyrics. +The ``-p, --print`` option to the ``lyrics`` command makes it print lyrics out +to the console so you can view the fetched (or previously-stored) lyrics. -The ``-f`` option forces the command to fetch lyrics, even for tracks that -already have lyrics. Inversely, the ``-l`` option restricts operations -to lyrics that are locally available, which show lyrics faster without using -the network at all. +The ``-f, --force`` option forces the command to fetch lyrics, even for tracks +that already have lyrics. + +Inversely, the ``-l, --local`` option restricts operations to lyrics that are +locally available, which show lyrics faster without using the network at all. Rendering Lyrics into Other Formats ----------------------------------- -The ``-r directory`` option renders all lyrics as `reStructuredText`_ (ReST) -documents in ``directory`` (by default, the current directory). That -directory, in turn, can be parsed by tools like `Sphinx`_ to generate HTML, -ePUB, or PDF documents. +The ``-r directory, --write-rest directory`` option renders all lyrics as +`reStructuredText`_ (ReST) documents in ``directory`` (by default, the current +directory). That directory, in turn, can be parsed by tools like `Sphinx`_ to +generate HTML, ePUB, or PDF documents. -A minimal ``conf.py`` and ``index.rst`` files are created the first time the +Minimal ``conf.py`` and ``index.rst`` files are created the first time the command is run. They are not overwritten on subsequent runs, so you can safely modify these files to customize the output. -.. _Sphinx: https://www.sphinx-doc.org/ -.. _reStructuredText: http://docutils.sourceforge.net/rst.html +Sphinx supports various `builders`_, see a few suggestions: -Sphinx supports various `builders -`_, but here are a -few suggestions. - * Build an HTML version:: +.. admonition:: Build an HTML version - sphinx-build -b html . _build/html + :: - * Build an ePUB3 formatted file, usable on ebook readers:: + sphinx-build -b html . _build/html - sphinx-build -b epub3 . _build/epub +.. admonition:: Build an ePUB3 formatted file, usable on ebook readers - * Build a PDF file, which incidentally also builds a LaTeX file:: + :: - sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf + sphinx-build -b epub3 . _build/epub -.. _activate-google-custom-search: +.. admonition:: Build a PDF file, which incidentally also builds a LaTeX file + + :: + + sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf + + +.. _Sphinx: https://www.sphinx-doc.org/ +.. _reStructuredText: http://docutils.sourceforge.net/rst.html +.. _builders: https://www.sphinx-doc.org/en/stable/builders.html Activate Google Custom Search ------------------------------ You need to `register for a Google API key`_. Set the ``google_API_key`` configuration option to your key. + Then add ``google`` to the list of sources in your configuration (or use default list, which includes it as long as you have an API key). If you use default ``google_engine_ID``, we recommend limiting the sources to ``google`` as the other sources are already included in the Google results. -.. _register for a Google API key: https://console.developers.google.com/ - Optionally, you can `define a custom search engine`_. Get your search engine's token and use it for your ``google_engine_ID`` configuration option. By default, beets use a list of sources known to be scrapeable. -.. _define a custom search engine: https://www.google.com/cse/all - Note that the Google custom search API is limited to 100 queries per day. After that, the lyrics plugin will fall back on other declared data sources. -.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/ - -Activate Genius and Tekstowo.pl Lyrics --------------------------------------- +.. _register for a Google API key: https://console.developers.google.com/ +.. _define a custom search engine: https://www.google.com/cse/all -These backends are enabled by default. .. _lyrics-translation: @@ -167,6 +169,6 @@ You need to register for a Microsoft Azure Marketplace free account and to the `Microsoft Translator API`_. Follow the four steps process, specifically at step 3 enter ``beets`` as *Client ID* and copy/paste the generated *Client secret* into your ``bing_client_secret`` configuration, alongside -``bing_lang_to`` target `language code`. +``bing_lang_to`` target ``language code``. .. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup From db345ca3f29306dcf331ec2d7f25dee52ffadcf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Wed, 4 Sep 2024 04:15:46 +0100 Subject: [PATCH 03/26] Centralize requests setup with requests.Session Improve requests performance with requests.Session which uses connection pooling for repeated requests to the same host. Additionally, this centralizes request configuration, making sure that we use the same timeout and provide beets user agent for all requests. --- beetsplug/lyrics.py | 68 +++++++++++++++++---------------------------- setup.cfg | 4 +-- 2 files changed, 28 insertions(+), 44 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 48299bb80a..96943e538d 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -16,6 +16,7 @@ from __future__ import annotations +import atexit import errno import itertools import json @@ -24,13 +25,12 @@ import re import struct import unicodedata -import warnings from contextlib import suppress from dataclasses import dataclass from functools import cached_property, partial, total_ordering from http import HTTPStatus from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator -from urllib.parse import quote, urlencode, urlparse +from urllib.parse import quote, urlparse import requests from typing_extensions import TypedDict @@ -106,6 +106,22 @@ class NotFoundError(requests.exceptions.HTTPError): pass +class TimeoutSession(requests.Session): + def request(self, *args, **kwargs): + kwargs.setdefault("timeout", 10) + return super().request(*args, **kwargs) + + +r_session = TimeoutSession() +r_session.headers.update({"User-Agent": USER_AGENT}) + + +@atexit.register +def close_session(): + """Close the requests session on shut down.""" + r_session.close() + + # Utilities. @@ -246,21 +262,7 @@ def fetch_url(self, url, **kwargs): is unreachable. """ try: - # Disable the InsecureRequestWarning that comes from using - # `verify=false`. - # https://github.com/kennethreitz/requests/issues/2214 - # We're not overly worried about the NSA MITMing our lyrics scraper - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - r = requests.get( - url, - verify=False, - headers={ - "User-Agent": USER_AGENT, - }, - timeout=10, - **kwargs, - ) + r = r_session.get(url) except requests.RequestException as exc: self._log.debug("lyrics request failed: {0}", exc) return @@ -368,9 +370,7 @@ def warn(self, message: str, *args) -> None: def fetch_json(self, *args, **kwargs): """Wrap the request method to raise an exception on HTTP errors.""" - kwargs.setdefault("timeout", 10) - kwargs.setdefault("headers", {"User-Agent": USER_AGENT}) - r = requests.get(*args, **kwargs) + r = r_session.get(*args, **kwargs) if r.status_code == HTTPStatus.NOT_FOUND: raise NotFoundError("HTTP Error: Not Found", response=r) r.raise_for_status() @@ -532,10 +532,7 @@ class Genius(SearchBackend): def __init__(self, config, log): super().__init__(config, log) self.api_key = config["genius_api_key"].as_str() - self.headers = { - "Authorization": "Bearer %s" % self.api_key, - "User-Agent": USER_AGENT, - } + self.headers = {"Authorization": f"Bearer {self.api_key}"} def fetch(self, artist: str, title: str, *_) -> str | None: """Fetch lyrics from genius.com @@ -570,18 +567,13 @@ def _search(self, artist, title): search_url = self.base_url + "/search" data = {"q": title + " " + artist.lower()} try: - response = requests.get( - search_url, - params=data, - headers=self.headers, - timeout=10, - ) + r = r_session.get(search_url, params=data, headers=self.headers) except requests.RequestException as exc: self._log.debug("Genius API request failed: {0}", exc) return None try: - return response.json() + return r.json() except ValueError: return None @@ -976,13 +968,7 @@ def get_bing_access_token(self): } oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13" - oauth_token = json.loads( - requests.post( - oauth_url, - data=urlencode(params), - timeout=10, - ).content - ) + oauth_token = r_session.post(oauth_url, params=params).json() if "access_token" in oauth_token: return "Bearer " + oauth_token["access_token"] else: @@ -1199,10 +1185,8 @@ def append_translation(self, text, to_lang): "https://api.microsofttranslator.com/v2/Http.svc/" "Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang) ) - r = requests.get( - url, - headers={"Authorization ": self.bing_auth_token}, - timeout=10, + r = r_session.get( + url, headers={"Authorization": self.bing_auth_token} ) if r.status_code != 200: self._log.debug( diff --git a/setup.cfg b/setup.cfg index 15ca23f658..8e3d7e3b82 100644 --- a/setup.cfg +++ b/setup.cfg @@ -21,8 +21,8 @@ omit = beets/test/* precision = 2 skip_empty = true show_missing = true -exclude_lines = - pragma: no cover +exclude_also = + @atexit.register if TYPE_CHECKING if typing.TYPE_CHECKING raise AssertionError From eaa54441f84f37ad77ba0037b501fb52721089c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Sun, 13 Oct 2024 16:14:15 +0100 Subject: [PATCH 04/26] Centralise request error handling --- beetsplug/lyrics.py | 228 +++++++++++++++--------------------- test/plugins/test_lyrics.py | 62 ++++++---- 2 files changed, 136 insertions(+), 154 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 96943e538d..240c2c2c01 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -19,13 +19,12 @@ import atexit import errno import itertools -import json import math import os.path import re import struct import unicodedata -from contextlib import suppress +from contextlib import contextmanager, suppress from dataclasses import dataclass from functools import cached_property, partial, total_ordering from http import HTTPStatus @@ -108,8 +107,14 @@ class NotFoundError(requests.exceptions.HTTPError): class TimeoutSession(requests.Session): def request(self, *args, **kwargs): + """Wrap the request method to raise an exception on HTTP errors.""" kwargs.setdefault("timeout", 10) - return super().request(*args, **kwargs) + r = super().request(*args, **kwargs) + if r.status_code == HTTPStatus.NOT_FOUND: + raise NotFoundError("HTTP Error: Not Found", response=r) + r.raise_for_status() + + return r r_session = TimeoutSession() @@ -250,28 +255,36 @@ def try_parse_html(html, **kwargs): return None -class Backend: +class RequestHandler: + _log: beets.logging.Logger + + def fetch_text(self, url: str, **kwargs) -> str: + """Return text / HTML data from the given URL.""" + self._log.debug("Fetching HTML from {}", url) + return r_session.get(url, **kwargs).text + + def fetch_json(self, url: str, **kwargs): + """Return JSON data from the given URL.""" + self._log.debug("Fetching JSON from {}", url) + return r_session.get(url, **kwargs).json() + + @contextmanager + def handle_request(self) -> Iterator[None]: + try: + yield + except requests.JSONDecodeError: + self._log.warning("Could not decode response JSON data") + except requests.RequestException as exc: + self._log.warning("Request error: {}", exc) + + +class Backend(RequestHandler): REQUIRES_BS = False def __init__(self, config, log): self._log = log self.config = config - def fetch_url(self, url, **kwargs): - """Retrieve the content at a given URL, or return None if the source - is unreachable. - """ - try: - r = r_session.get(url) - except requests.RequestException as exc: - self._log.debug("lyrics request failed: {0}", exc) - return - if r.status_code == requests.codes.ok: - return r.text - else: - self._log.debug("failed to fetch: {0} ({1})", url, r.status_code) - return None - def fetch( self, artist: str, title: str, album: str, length: int ) -> str | None: @@ -368,15 +381,6 @@ def warn(self, message: str, *args) -> None: """Log a warning message with the class name.""" self._log.warning(f"{self.__class__.__name__}: {message}", *args) - def fetch_json(self, *args, **kwargs): - """Wrap the request method to raise an exception on HTTP errors.""" - r = r_session.get(*args, **kwargs) - if r.status_code == HTTPStatus.NOT_FOUND: - raise NotFoundError("HTTP Error: Not Found", response=r) - r.raise_for_status() - - return r.json() - def fetch_candidates( self, artist: str, title: str, album: str, length: int ) -> Iterator[list[LRCLibItem]]: @@ -414,13 +418,7 @@ def fetch( if item := self.pick_best_match(candidates): return item.get_text(self.config["synced"]) except StopIteration: - pass - except requests.JSONDecodeError: - self.warn("Could not decode response JSON data") - except requests.RequestException as exc: - self.warn("Request error: {}", exc) - - return None + return None class DirectBackend(Backend): @@ -460,9 +458,7 @@ def encode(cls, text: str) -> str: def fetch(self, artist: str, title: str, *_) -> str | None: url = self.build_url(artist, title) - html = self.fetch_url(url) - if not html: - return None + html = self.fetch_text(url) if "We detected that your IP is blocked" in html: self._log.warning( "we are blocked at MusixMatch: url %s failed" % url @@ -528,6 +524,7 @@ class Genius(SearchBackend): """ base_url = "https://api.genius.com" + search_url = f"{base_url}/search" def __init__(self, config, log): super().__init__(config, log) @@ -550,10 +547,9 @@ def fetch(self, artist: str, title: str, *_) -> str | None: for hit in json["response"]["hits"]: result = hit["result"] if check(result["primary_artist"]["name"], result["title"]): - html = self.fetch_url(result["url"]) - if not html: - return None - return self._scrape_lyrics_from_html(html) + return self._scrape_lyrics_from_html( + self.fetch_text(result["url"]) + ) return None @@ -564,29 +560,20 @@ def _search(self, artist, title): :returns: json response """ - search_url = self.base_url + "/search" - data = {"q": title + " " + artist.lower()} - try: - r = r_session.get(search_url, params=data, headers=self.headers) - except requests.RequestException as exc: - self._log.debug("Genius API request failed: {0}", exc) - return None - - try: - return r.json() - except ValueError: - return None + return self.fetch_json( + self.search_url, + params={"q": f"{title} {artist.lower()}"}, + headers=self.headers, + ) def replace_br(self, lyrics_div): for br in lyrics_div.find_all("br"): br.replace_with("\n") - def _scrape_lyrics_from_html(self, html): + def _scrape_lyrics_from_html(self, html: str) -> str | None: """Scrape lyrics from a given genius.com html""" soup = try_parse_html(html) - if not soup: - return # Remove script tags that they put in the middle of the lyrics. [h.extract() for h in soup("script")] @@ -657,10 +644,12 @@ def encode(cls, text: str) -> str: return cls.non_alpha_to_underscore(unidecode(text.lower())) def fetch(self, artist: str, title: str, *_) -> str | None: - if html := self.fetch_url(self.build_url(artist, title)): - return self.extract_lyrics(html) - - return None + # We are expecting to receive a 404 since we are guessing the URL. + # Thus suppress the error so that it does not end up in the logs. + with suppress(NotFoundError): + return self.extract_lyrics( + self.fetch_text(self.build_url(artist, title)) + ) def extract_lyrics(self, html: str) -> str | None: html = _scrape_strip_cruft(html) @@ -714,7 +703,7 @@ def _scrape_merge_paragraphs(html): return re.sub(r"
\s*
", "\n", html) -def scrape_lyrics_from_html(html): +def scrape_lyrics_from_html(html: str) -> str | None: """Scrape lyrics from a URL. If no lyrics can be found, return None instead. """ @@ -734,8 +723,6 @@ def is_text_notcode(text): # extract all long text blocks that are not code soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode)) - if not soup: - return None # Get the longest text element (if any). strings = sorted(soup.stripped_strings, key=len, reverse=True) @@ -828,39 +815,26 @@ def fetch(self, artist: str, title: str, *_) -> str | None: "q": f"{artist} {title}", } - data = self.fetch_url(self.SEARCH_URL, params=params) - if not data: - self._log.debug("google backend returned no data") - return None - try: - data = json.loads(data) - except ValueError as exc: - self._log.debug("google backend returned malformed JSON: {}", exc) - if "error" in data: - reason = data["error"]["errors"][0]["reason"] - self._log.debug("google backend error: {0}", reason) - return None - check_candidate = partial(self.is_page_candidate, artist, title) - for item in data.get("items", []): + for item in self.fetch_json(self.SEARCH_URL, params=params).get( + "items", [] + ): url_link = item["link"] if not check_candidate(url_link, item.get("title", "")): continue - html = self.fetch_url(url_link) - if not html: - continue - lyrics = scrape_lyrics_from_html(html) - if not lyrics: - continue + with self.handle_request(): + lyrics = scrape_lyrics_from_html(self.fetch_text(url_link)) + if not lyrics: + continue - if self.is_lyrics(lyrics, artist): - self._log.debug("got lyrics from {0}", item["displayLink"]) - return lyrics + if self.is_lyrics(lyrics, artist): + self._log.debug("got lyrics from {0}", item["displayLink"]) + return lyrics return None -class LyricsPlugin(plugins.BeetsPlugin): +class LyricsPlugin(RequestHandler, plugins.BeetsPlugin): SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"] SOURCE_BACKENDS = { "google": Google, @@ -931,7 +905,6 @@ def __init__(self): self.config["bing_lang_from"] = [ x.lower() for x in self.config["bing_lang_from"].as_str_seq() ] - self.bing_auth_token = None if not HAS_LANGDETECT and self.config["bing_client_secret"].get(): self._log.warning( @@ -959,7 +932,8 @@ def sanitize_bs_sources(self, sources): return enabled_sources - def get_bing_access_token(self): + @cached_property + def bing_access_token(self) -> str | None: params = { "client_id": "beets", "client_secret": self.config["bing_client_secret"], @@ -968,14 +942,11 @@ def get_bing_access_token(self): } oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13" - oauth_token = r_session.post(oauth_url, params=params).json() - if "access_token" in oauth_token: - return "Bearer " + oauth_token["access_token"] - else: - self._log.warning( - "Could not get Bing Translate API access token." - ' Check your "bing_client_secret" password' - ) + with self.handle_request(): + r = r_session.post(oauth_url, params=params) + return r.json()["access_token"] + + return None def commands(self): cmd = ui.Subcommand("lyrics", help="fetch song lyrics") @@ -1164,44 +1135,39 @@ def get_lyrics(self, artist: str, title: str, *args) -> str | None: None if no lyrics were found. """ for backend in self.backends: - lyrics = backend.fetch(artist, title, *args) - if lyrics: - self._log.debug( - "got lyrics from backend: {0}", backend.__class__.__name__ - ) - return _scrape_strip_cruft(lyrics, True) + with backend.handle_request(): + if lyrics := backend.fetch(artist, title, *args): + self._log.debug( + "got lyrics from backend: {0}", + backend.__class__.__name__, + ) + return _scrape_strip_cruft(lyrics, True) return None def append_translation(self, text, to_lang): from xml.etree import ElementTree - if not self.bing_auth_token: - self.bing_auth_token = self.get_bing_access_token() - if self.bing_auth_token: - # Extract unique lines to limit API request size per song - text_lines = set(text.split("\n")) - url = ( - "https://api.microsofttranslator.com/v2/Http.svc/" - "Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang) + if not (token := self.bing_access_token): + self._log.warning( + "Could not get Bing Translate API access token. " + "Check your 'bing_client_secret' password." ) - r = r_session.get( - url, headers={"Authorization": self.bing_auth_token} + return text + + # Extract unique lines to limit API request size per song + lines = text.split("\n") + unique_lines = set(lines) + url = "https://api.microsofttranslator.com/v2/Http.svc/Translate" + with self.handle_request(): + text = self.fetch_text( + url, + headers={"Authorization": f"Bearer {token}"}, + params={"text": "|".join(unique_lines), "to": to_lang}, ) - if r.status_code != 200: - self._log.debug( - "translation API error {}: {}", r.status_code, r.text - ) - if "token has expired" in r.text: - self.bing_auth_token = None - return self.append_translation(text, to_lang) - return text - lines_translated = ElementTree.fromstring( - r.text.encode("utf-8") - ).text - # Use a translation mapping dict to build resulting lyrics - translations = dict(zip(text_lines, lines_translated.split("|"))) - result = "" - for line in text.split("\n"): - result += "{} / {}\n".format(line, translations[line]) - return result + if translated := ElementTree.fromstring(text.encode("utf-8")).text: + # Use a translation mapping dict to build resulting lyrics + translations = dict(zip(unique_lines, translated.split("|"))) + return "".join(f"{ln} / {translations[ln]}\n" for ln in lines) + + return text diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 8123996985..b77054b52f 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -202,7 +202,7 @@ def lyrics_root_dir(pytestconfig: pytest.Config): return pytestconfig.rootpath / "test" / "rsrc" / "lyrics" -class LyricsBackendTest(PluginMixin): +class LyricsPluginMixin(PluginMixin): plugin = "lyrics" @pytest.fixture @@ -218,6 +218,42 @@ def lyrics_plugin(self, backend_name, plugin_config): return lyrics.LyricsPlugin() + +class TestLyricsPlugin(LyricsPluginMixin): + @pytest.fixture + def backend_name(self): + """Return lyrics configuration to test.""" + return "lrclib" + + @pytest.mark.parametrize( + "request_kwargs, expected_log_match", + [ + ( + {"status_code": HTTPStatus.BAD_GATEWAY}, + r"lyrics: Request error: 502", + ), + ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"), + ], + ) + def test_error_handling( + self, + requests_mock, + lyrics_plugin, + caplog, + request_kwargs, + expected_log_match, + ): + """Errors are logged with the plugin name.""" + requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs) + + assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None + assert caplog.messages + last_log = caplog.messages[-1] + assert last_log + assert re.search(expected_log_match, last_log, re.I) + + +class LyricsBackendTest(LyricsPluginMixin): @pytest.fixture def backend(self, lyrics_plugin): """Return a lyrics backend instance.""" @@ -399,13 +435,9 @@ def backend_name(self): return "lrclib" @pytest.fixture - def request_kwargs(self, response_data): - return {"json": response_data} - - @pytest.fixture - def fetch_lyrics(self, backend, requests_mock, request_kwargs): + def fetch_lyrics(self, backend, requests_mock, response_data): requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND) - requests_mock.get(backend.SEARCH_URL, **request_kwargs) + requests_mock.get(backend.SEARCH_URL, json=response_data) return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION) @@ -478,19 +510,3 @@ def test_synced_config_option(self, fetch_lyrics, expected_lyrics): @pytest.mark.parametrize("plugin_config", [{"synced": True}]) def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics): assert fetch_lyrics() == expected_lyrics - - @pytest.mark.parametrize( - "request_kwargs, expected_log_match", - [ - ( - {"status_code": HTTPStatus.BAD_GATEWAY}, - r"LRCLib: Request error: 502", - ), - ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"), - ], - ) - def test_error(self, caplog, fetch_lyrics, expected_log_match): - assert fetch_lyrics() is None - assert caplog.messages - assert (last_log := caplog.messages[-1]) - assert re.search(expected_log_match, last_log, re.I) From 65557de0f25f3b01b8f6aee5b210d67270555957 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Fri, 6 Sep 2024 07:35:24 +0100 Subject: [PATCH 05/26] Include class name in the log messages --- beetsplug/lyrics.py | 113 +++++++++++++++++++----------------- test/plugins/test_lyrics.py | 6 +- 2 files changed, 64 insertions(+), 55 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 240c2c2c01..06352ba994 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -28,8 +28,8 @@ from dataclasses import dataclass from functools import cached_property, partial, total_ordering from http import HTTPStatus -from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator -from urllib.parse import quote, urlparse +from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator +from urllib.parse import quote, urlencode, urlparse import requests from typing_extensions import TypedDict @@ -58,6 +58,8 @@ except ImportError: HAS_LANGDETECT = False +JSONDict = dict[str, Any] + DIV_RE = re.compile(r"<(/?)div>?", re.I) COMMENT_RE = re.compile(r"", re.S) TAG_RE = re.compile(r"<[^>]*>") @@ -258,14 +260,37 @@ def try_parse_html(html, **kwargs): class RequestHandler: _log: beets.logging.Logger - def fetch_text(self, url: str, **kwargs) -> str: + def debug(self, message: str, *args) -> None: + """Log a debug message with the class name.""" + self._log.debug(f"{self.__class__.__name__}: {message}", *args) + + def info(self, message: str, *args) -> None: + """Log an info message with the class name.""" + self._log.info(f"{self.__class__.__name__}: {message}", *args) + + def warn(self, message: str, *args) -> None: + """Log warning with the class name.""" + self._log.warning(f"{self.__class__.__name__}: {message}", *args) + + @staticmethod + def format_url(url: str, params: JSONDict | None) -> str: + if not params: + return url + + return f"{url}?{urlencode(params)}" + + def fetch_text( + self, url: str, params: JSONDict | None = None, **kwargs + ) -> str: """Return text / HTML data from the given URL.""" - self._log.debug("Fetching HTML from {}", url) + url = self.format_url(url, params) + self.debug("Fetching HTML from {}", url) return r_session.get(url, **kwargs).text - def fetch_json(self, url: str, **kwargs): + def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs): """Return JSON data from the given URL.""" - self._log.debug("Fetching JSON from {}", url) + url = self.format_url(url, params) + self.debug("Fetching JSON from {}", url) return r_session.get(url, **kwargs).json() @contextmanager @@ -273,9 +298,9 @@ def handle_request(self) -> Iterator[None]: try: yield except requests.JSONDecodeError: - self._log.warning("Could not decode response JSON data") + self.warn("Could not decode response JSON data") except requests.RequestException as exc: - self._log.warning("Request error: {}", exc) + self.warn("Request error: {}", exc) class Backend(RequestHandler): @@ -377,10 +402,6 @@ class LRCLib(Backend): GET_URL = f"{BASE_URL}/get" SEARCH_URL = f"{BASE_URL}/search" - def warn(self, message: str, *args) -> None: - """Log a warning message with the class name.""" - self._log.warning(f"{self.__class__.__name__}: {message}", *args) - def fetch_candidates( self, artist: str, title: str, album: str, length: int ) -> Iterator[list[LRCLibItem]]: @@ -412,13 +433,12 @@ def fetch( """Fetch lyrics text for the given song data.""" evaluate_item = partial(LRCLyrics.make, target_duration=length) - try: - for group in self.fetch_candidates(artist, title, album, length): - candidates = [evaluate_item(item) for item in group] - if item := self.pick_best_match(candidates): - return item.get_text(self.config["synced"]) - except StopIteration: - return None + for group in self.fetch_candidates(artist, title, album, length): + candidates = [evaluate_item(item) for item in group] + if item := self.pick_best_match(candidates): + return item.get_text(self.config["synced"]) + + return None class DirectBackend(Backend): @@ -460,9 +480,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None: html = self.fetch_text(url) if "We detected that your IP is blocked" in html: - self._log.warning( - "we are blocked at MusixMatch: url %s failed" % url - ) + self.warn("Failed: Blocked IP address") return None html_parts = html.split('

str | None: then attempt to scrape that url for the lyrics. """ json = self._search(artist, title) - if not json: - self._log.debug("Genius API request returned invalid JSON") - return None check = partial(self.check_match, artist, title) for hit in json["response"]["hits"]: @@ -585,7 +600,7 @@ def _scrape_lyrics_from_html(self, html: str) -> str | None: lyrics_divs = soup.find_all("div", {"data-lyrics-container": True}) if not lyrics_divs: - self._log.debug("Received unusual song page html") + self.debug("Received unusual song page html") return self._try_extracting_lyrics_from_non_data_lyrics_container( soup ) @@ -608,10 +623,10 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup): class_=re.compile("LyricsPlaceholder__Message"), string="This song is an instrumental", ): - self._log.debug("Detected instrumental") + self.debug("Detected instrumental") return INSTRUMENTAL_LYRICS else: - self._log.debug("Couldn't scrape page using known layouts") + self.debug("Couldn't scrape page using known layouts") return None lyrics_div = verse_div.parent @@ -651,6 +666,8 @@ def fetch(self, artist: str, title: str, *_) -> str | None: self.fetch_text(self.build_url(artist, title)) ) + return None + def extract_lyrics(self, html: str) -> str | None: html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) @@ -744,7 +761,7 @@ def is_lyrics(self, text, artist=None): bad_triggers_occ = [] nb_lines = text.count("\n") if nb_lines <= 1: - self._log.debug("Ignoring too short lyrics '{0}'", text) + self.debug("Ignoring too short lyrics '{}'", text) return False elif nb_lines < 5: bad_triggers_occ.append("too_short") @@ -763,7 +780,7 @@ def is_lyrics(self, text, artist=None): ) if bad_triggers_occ: - self._log.debug("Bad triggers detected: {0}", bad_triggers_occ) + self.debug("Bad triggers detected: {}", bad_triggers_occ) return len(bad_triggers_occ) < 2 def slugify(self, text): @@ -776,7 +793,7 @@ def slugify(self, text): text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore") text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8"))) except UnicodeDecodeError: - self._log.exception("Failing to normalize '{0}'", text) + self.debug("Failed to normalize '{}'", text) return text BY_TRANS = ["by", "par", "de", "von"] @@ -828,7 +845,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None: continue if self.is_lyrics(lyrics, artist): - self._log.debug("got lyrics from {0}", item["displayLink"]) + self.debug("Got lyrics from {}", item["displayLink"]) return lyrics return None @@ -897,9 +914,7 @@ def __init__(self): # configuration includes `google`. This way, the source # is silent by default but can be enabled just by # setting an API key. - self._log.debug( - "Disabling google source: " "no API key configured." - ) + self.debug("Disabling google source: " "no API key configured.") sources.remove("google") self.config["bing_lang_from"] = [ @@ -907,15 +922,14 @@ def __init__(self): ] if not HAS_LANGDETECT and self.config["bing_client_secret"].get(): - self._log.warning( - "To use bing translations, you need to " - "install the langdetect module. See the " - "documentation for further details." + self.warn( + "To use bing translations, you need to install the langdetect " + "module. See the documentation for further details." ) self.backends = [ - self.SOURCE_BACKENDS[source](self.config, self._log) - for source in sources + self.SOURCE_BACKENDS[s](self.config, self._log.getChild(s)) + for s in sources ] def sanitize_bs_sources(self, sources): @@ -946,8 +960,6 @@ def bing_access_token(self) -> str | None: r = r_session.post(oauth_url, params=params) return r.json()["access_token"] - return None - def commands(self): cmd = ui.Subcommand("lyrics", help="fetch song lyrics") cmd.parser.add_option( @@ -1092,7 +1104,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None: """ # Skip if the item already has lyrics. if not force and item.lyrics: - self._log.info("lyrics already present: {0}", item) + self.info("Lyrics already present: {}", item) return lyrics_matches = [] @@ -1108,7 +1120,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None: lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches)) if lyrics: - self._log.info("fetched lyrics: {0}", item) + self.info("Lyrics found: {}", item) if HAS_LANGDETECT and self.config["bing_client_secret"].get(): lang_from = langdetect.detect(lyrics) if self.config["bing_lang_to"].get() != lang_from and ( @@ -1119,7 +1131,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None: lyrics, self.config["bing_lang_to"] ) else: - self._log.info("lyrics not found: {0}", item) + self.info("Lyrics not found: {}", item) fallback = self.config["fallback"].get() if fallback: lyrics = fallback @@ -1134,13 +1146,10 @@ def get_lyrics(self, artist: str, title: str, *args) -> str | None: """Fetch lyrics, trying each source in turn. Return a string or None if no lyrics were found. """ + self.info("Fetching lyrics for {} - {}", artist, title) for backend in self.backends: with backend.handle_request(): if lyrics := backend.fetch(artist, title, *args): - self._log.debug( - "got lyrics from backend: {0}", - backend.__class__.__name__, - ) return _scrape_strip_cruft(lyrics, True) return None @@ -1149,7 +1158,7 @@ def append_translation(self, text, to_lang): from xml.etree import ElementTree if not (token := self.bing_access_token): - self._log.warning( + self.warn( "Could not get Bing Translate API access token. " "Check your 'bing_client_secret' password." ) diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index b77054b52f..817eb4af92 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -230,9 +230,9 @@ def backend_name(self): [ ( {"status_code": HTTPStatus.BAD_GATEWAY}, - r"lyrics: Request error: 502", + r"LRCLib: Request error: 502", ), - ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"), + ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"), ], ) def test_error_handling( @@ -243,7 +243,7 @@ def test_error_handling( request_kwargs, expected_log_match, ): - """Errors are logged with the plugin name.""" + """Errors are logged with the backend name.""" requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs) assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None From bf00c26494e693d4f1e686c25c4eb198aeacd0e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Wed, 11 Sep 2024 07:51:43 +0100 Subject: [PATCH 06/26] Leave a single chef in the kitchen --- beetsplug/lyrics.py | 79 ++++++++++--------------------------- test/plugins/test_lyrics.py | 6 +-- 2 files changed, 23 insertions(+), 62 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 06352ba994..168b00821d 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -44,8 +44,7 @@ from beets.library import Item try: - import bs4 - from bs4 import SoupStrainer + from bs4 import BeautifulSoup HAS_BEAUTIFUL_SOUP = True except ImportError: @@ -246,17 +245,6 @@ def slug(text): return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-") -if HAS_BEAUTIFUL_SOUP: - - def try_parse_html(html, **kwargs): - return bs4.BeautifulSoup(html, "html.parser", **kwargs) - -else: - - def try_parse_html(html, **kwargs): - return None - - class RequestHandler: _log: beets.logging.Logger @@ -562,9 +550,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None: for hit in json["response"]["hits"]: result = hit["result"] if check(result["primary_artist"]["name"], result["title"]): - return self._scrape_lyrics_from_html( - self.fetch_text(result["url"]) - ) + return self.scrape_lyrics(self.fetch_text(hit["result"]["url"])) return None @@ -581,17 +567,9 @@ def _search(self, artist, title): headers=self.headers, ) - def replace_br(self, lyrics_div): - for br in lyrics_div.find_all("br"): - br.replace_with("\n") - - def _scrape_lyrics_from_html(self, html: str) -> str | None: + def scrape_lyrics(self, html: str) -> str | None: """Scrape lyrics from a given genius.com html""" - - soup = try_parse_html(html) - - # Remove script tags that they put in the middle of the lyrics. - [h.extract() for h in soup("script")] + soup = get_soup(html) # Most of the time, the page contains a div with class="lyrics" where # all of the lyrics can be found already correctly formatted @@ -606,7 +584,6 @@ def _scrape_lyrics_from_html(self, html: str) -> str | None: ) lyrics = "" for lyrics_div in lyrics_divs: - self.replace_br(lyrics_div) lyrics += lyrics_div.get_text() + "\n\n" while lyrics[-1] == "\n": lyrics = lyrics[:-1] @@ -630,7 +607,6 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup): return None lyrics_div = verse_div.parent - self.replace_br(lyrics_div) ads = lyrics_div.find_all( "div", class_=re.compile("InreadAd__Container") @@ -662,17 +638,14 @@ def fetch(self, artist: str, title: str, *_) -> str | None: # We are expecting to receive a 404 since we are guessing the URL. # Thus suppress the error so that it does not end up in the logs. with suppress(NotFoundError): - return self.extract_lyrics( + return self.scrape_lyrics( self.fetch_text(self.build_url(artist, title)) ) return None - def extract_lyrics(self, html: str) -> str | None: - html = _scrape_strip_cruft(html) - html = _scrape_merge_paragraphs(html) - - soup = try_parse_html(html) + def scrape_lyrics(self, html: str) -> str | None: + soup = get_soup(html) if lyrics_div := soup.select_one("div.song-text > div.inner-text"): return lyrics_div.get_text() @@ -720,33 +693,11 @@ def _scrape_merge_paragraphs(html): return re.sub(r"

\s*
", "\n", html) -def scrape_lyrics_from_html(html: str) -> str | None: - """Scrape lyrics from a URL. If no lyrics can be found, return None - instead. - """ - - def is_text_notcode(text): - if not text: - return False - length = len(text) - return ( - length > 20 - and text.count(" ") > length / 25 - and (text.find("{") == -1 or text.find(";") == -1) - ) - +def get_soup(html: str) -> BeautifulSoup: html = _scrape_strip_cruft(html) html = _scrape_merge_paragraphs(html) - # extract all long text blocks that are not code - soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode)) - - # Get the longest text element (if any). - strings = sorted(soup.stripped_strings, key=len, reverse=True) - if strings: - return strings[0] - else: - return None + return BeautifulSoup(html, "html.parser") class Google(SearchBackend): @@ -754,6 +705,16 @@ class Google(SearchBackend): SEARCH_URL = "https://www.googleapis.com/customsearch/v1" + @staticmethod + def scrape_lyrics(html: str) -> str | None: + soup = get_soup(html) + + # Get the longest text element (if any). + strings = sorted(soup.stripped_strings, key=len, reverse=True) + if strings: + return strings[0] + return None + def is_lyrics(self, text, artist=None): """Determine whether the text seems to be valid lyrics.""" if not text: @@ -840,7 +801,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None: if not check_candidate(url_link, item.get("title", "")): continue with self.handle_request(): - lyrics = scrape_lyrics_from_html(self.fetch_text(url_link)) + lyrics = self.scrape_lyrics(self.fetch_text(url_link)) if not lyrics: continue diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py index 817eb4af92..d412d318bc 100644 --- a/test/plugins/test_lyrics.py +++ b/test/plugins/test_lyrics.py @@ -328,7 +328,7 @@ def file_name(self): def test_mocked_source_ok(self, backend, lyrics_html): """Test that lyrics of the mocked page are correctly scraped""" - result = lyrics.scrape_lyrics_from_html(lyrics_html).lower() + result = backend.scrape_lyrics(lyrics_html).lower() assert result assert backend.is_lyrics(result) @@ -390,7 +390,7 @@ def backend_name(self): ], ) # fmt: skip def test_scrape(self, backend, lyrics_html, expected_line_count): - result = backend._scrape_lyrics_from_html(lyrics_html) or "" + result = backend.scrape_lyrics(lyrics_html) or "" assert len(result.splitlines()) == expected_line_count @@ -411,7 +411,7 @@ def backend_name(self): ], ) def test_scrape(self, backend, lyrics_html, expecting_lyrics): - assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics + assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics LYRICS_DURATION = 950 From 469db00ee23711660880ece870f6d09a755b2786 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= Date: Sat, 19 Oct 2024 03:30:41 +0100 Subject: [PATCH 07/26] Do not try to strip cruft from the parsed lyrics text. Having removed it I fuond that only the Genius lyrics changed: it had en extra new line. Thus I defined a function 'collapse_newlines' which now gets called for the Genius lyrics. --- beetsplug/lyrics.py | 24 +++++++++++------------- test/plugins/test_lyrics.py | 5 ++--- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py index 168b00821d..13b786f4fb 100644 --- a/beetsplug/lyrics.py +++ b/beetsplug/lyrics.py @@ -59,9 +59,6 @@ JSONDict = dict[str, Any] -DIV_RE = re.compile(r"<(/?)div>?", re.I) -COMMENT_RE = re.compile(r"", re.S) -TAG_RE = re.compile(r"<[^>]*>") BREAK_RE = re.compile(r"\n?\s*]*)*>\s*\n?", re.I) USER_AGENT = f"beets/{beets.__version__}" INSTRUMENTAL_LYRICS = "[Instrumental]" @@ -549,8 +546,11 @@ def fetch(self, artist: str, title: str, *_) -> str | None: check = partial(self.check_match, artist, title) for hit in json["response"]["hits"]: result = hit["result"] - if check(result["primary_artist"]["name"], result["title"]): - return self.scrape_lyrics(self.fetch_text(hit["result"]["url"])) + url = hit["result"]["url"] + if check(result["primary_artist"]["name"], result["title"]) and ( + lyrics := self.scrape_lyrics(self.fetch_text(url)) + ): + return collapse_newlines(lyrics) return None @@ -667,7 +667,10 @@ def remove_credits(text): return text -def _scrape_strip_cruft(html, plain_text_out=False): +collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n") + + +def _scrape_strip_cruft(html: str) -> str: """Clean up HTML""" html = unescape(html) @@ -679,13 +682,8 @@ def _scrape_strip_cruft(html, plain_text_out=False): html = re.sub("