From 727a88c0f0e34ead2b56a3c44d948b24cf159ca5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sat, 12 Oct 2024 02:20:37 +0100
Subject: [PATCH 01/26] Apply dist_thresh to Genius and Google backends
This commit introduces a distance threshold mechanism for the Genius and
Google backends.
- Create a new `SearchBackend` base class with a method `check_match`
that performs checking.
- Start using undocumented `dist_thresh` configuration option for good,
and mention it in the docs. This controls the maximum allowable
distance for matching artist and title names.
These changes aim to improve the accuracy of lyrics matching, especially
when there are slight variations in artist or title names, see #4791.
---
beetsplug/lyrics.py | 119 +++++++++++++++++++++---------------
docs/changelog.rst | 8 +++
docs/plugins/lyrics.rst | 6 ++
test/plugins/test_lyrics.py | 40 +++++++++++-
4 files changed, 122 insertions(+), 51 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 70edaa0011..e779b14d6e 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,10 +16,10 @@
from __future__ import annotations
-import difflib
import errno
import itertools
import json
+import math
import os.path
import re
import struct
@@ -30,7 +30,7 @@
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode
+from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
@@ -38,6 +38,7 @@
import beets
from beets import plugins, ui
+from beets.autotag.hooks import string_dist
if TYPE_CHECKING:
from beets.importer import ImportTask
@@ -485,15 +486,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
return lyrics
-class Genius(Backend):
+class SearchBackend(Backend):
+ REQUIRES_BS = True
+
+ @cached_property
+ def dist_thresh(self) -> float:
+ return self.config["dist_thresh"].get(float)
+
+ def check_match(
+ self, target_artist: str, target_title: str, artist: str, title: str
+ ) -> bool:
+ """Check if the given artist and title are 'good enough' match."""
+ max_dist = max(
+ string_dist(target_artist, artist),
+ string_dist(target_title, title),
+ )
+
+ if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
+ return True
+
+ if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4):
+ # log out the candidate that did not make it but was close.
+ # This may show a matching candidate with some noise in the name
+ self._log.debug(
+ "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
+ artist,
+ title,
+ target_artist,
+ target_title,
+ max_dist,
+ )
+
+ return False
+
+
+class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
Simply adapted from
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
- REQUIRES_BS = True
-
base_url = "https://api.genius.com"
def __init__(self, config, log):
@@ -516,19 +549,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
self._log.debug("Genius API request returned invalid JSON")
return None
- # find a matching artist in the json
+ check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
- hit_artist = hit["result"]["primary_artist"]["name"]
-
- if slug(hit_artist) == slug(artist):
- html = self.fetch_url(hit["result"]["url"])
+ result = hit["result"]
+ if check(result["primary_artist"]["name"], result["title"]):
+ html = self.fetch_url(result["url"])
if not html:
return None
return self._scrape_lyrics_from_html(html)
- self._log.debug(
- "Genius failed to find a matching artist for '{0}'", artist
- )
return None
def _search(self, artist, title):
@@ -724,10 +753,9 @@ def is_text_notcode(text):
return None
-class Google(Backend):
+class Google(SearchBackend):
"""Fetch lyrics from Google search results."""
- REQUIRES_BS = True
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
def is_lyrics(self, text, artist=None):
@@ -775,21 +803,20 @@ def slugify(self, text):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
- def is_page_candidate(self, url_link, url_title, title, artist):
+ def is_page_candidate(
+ self, artist: str, title: str, url_link: str, url_title: str
+ ) -> bool:
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
- title = self.slugify(title.lower())
- artist = self.slugify(artist.lower())
- sitename = re.search(
- "//([^/]+)/.*", self.slugify(url_link.lower())
- ).group(1)
- url_title = self.slugify(url_title.lower())
-
- # Check if URL title contains song title (exact match)
- if url_title.find(title) != -1:
+ title_slug = self.slugify(title.lower())
+ url_title_slug = self.slugify(url_title.lower())
+ if title_slug in url_title_slug:
return True
+ artist = self.slugify(artist.lower())
+ sitename = urlparse(url_link).netloc
+
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
@@ -798,12 +825,9 @@ def is_page_candidate(self, url_link, url_title, title, artist):
+ self.LYRICS_TRANS
)
tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
+ song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
- song_title = song_title.strip("_|")
- typo_ratio = 0.9
- ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
- return ratio >= typo_ratio
+ return self.check_match(artist, title_slug, artist, song_title)
def fetch(self, artist: str, title: str, *_) -> str | None:
params = {
@@ -825,24 +849,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
self._log.debug("google backend error: {0}", reason)
return None
- if "items" in data.keys():
- for item in data["items"]:
- url_link = item["link"]
- url_title = item.get("title", "")
- if not self.is_page_candidate(
- url_link, url_title, title, artist
- ):
- continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
-
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ check_candidate = partial(self.is_page_candidate, artist, title)
+ for item in data.get("items", []):
+ url_link = item["link"]
+ if not check_candidate(url_link, item.get("title", "")):
+ continue
+ html = self.fetch_url(url_link)
+ if not html:
+ continue
+ lyrics = scrape_lyrics_from_html(html)
+ if not lyrics:
+ continue
+
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug("got lyrics from {0}", item["displayLink"])
+ return lyrics
return None
@@ -866,6 +887,7 @@ def __init__(self):
"bing_client_secret": None,
"bing_lang_from": [],
"bing_lang_to": None,
+ "dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -877,7 +899,6 @@ def __init__(self):
# Musixmatch is disabled by default as they are currently blocking
# requests with the beets user agent.
"sources": [s for s in self.SOURCES if s != "musixmatch"],
- "dist_thresh": 0.1,
}
)
self.config["bing_client_secret"].redact = True
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 6ce13a8144..2576eefec9 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -11,6 +11,11 @@ been dropped.
New features:
+* :doc:`plugins/lyrics`: Add new configuration option ``dist_thresh`` to
+ control the maximum allowed distance between the lyrics search result and the
+ tagged item's artist and title. This is useful for preventing false positives
+ when fetching lyrics.
+
Bug fixes:
* :doc:`plugins/lyrics`: LRCLib will fallback to plain lyrics if synced lyrics
@@ -55,6 +60,9 @@ Bug fixes:
``lrclib`` over other sources since it returns reliable results quicker than
others.
:bug:`5102`
+* :doc:`plugins/lyrics`: Fix the issue with ``genius`` backend not being able
+ to match lyrics when there is a slight variation in the artist name.
+ :bug:`4791`
For packagers:
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d1f434d70f..d080b1f940 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -42,6 +42,12 @@ configuration file. The available options are:
Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+ combination of the music file and lyrics candidate to consider them a match.
+ Lower values will make the plugin more strict, higher values will make it
+ more lenient. This does not apply to the ``lrclib`` backend as it matches
+ durations.
+ Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
Default: None.
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 47c9837701..8123996985 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -161,6 +161,42 @@ def test_slug(self, text, expected):
assert lyrics.slug(text) == expected
+class TestSearchBackend:
+ @pytest.fixture
+ def backend(self, dist_thresh):
+ plugin = lyrics.LyricsPlugin()
+ plugin.config.set({"dist_thresh": dist_thresh})
+ return lyrics.SearchBackend(plugin.config, plugin._log)
+
+ @pytest.mark.parametrize(
+ "dist_thresh, target_artist, artist, should_match",
+ [
+ (0.11, "Target Artist", "Target Artist", True),
+ (0.11, "Target Artist", "Target Artis", True),
+ (0.11, "Target Artist", "Target Arti", False),
+ (0.11, "Psychonaut", "Psychonaut (BEL)", True),
+ (0.11, "beets song", "beats song", True),
+ (0.10, "beets song", "beats song", False),
+ (
+ 0.11,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ False,
+ ),
+ (
+ 0.12,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ True,
+ ),
+ ],
+ )
+ def test_check_match(self, backend, target_artist, artist, should_match):
+ assert (
+ backend.check_match(target_artist, "", artist, "") == should_match
+ )
+
+
@pytest.fixture(scope="module")
def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
@@ -275,10 +311,10 @@ def test_is_page_candidate(
self, backend, lyrics_html, url_title, artist, should_be_candidate
):
result = backend.is_page_candidate(
+ artist,
+ self.TITLE,
"http://www.example.com/lyrics/beetssong",
url_title,
- self.TITLE,
- artist,
)
assert bool(result) == should_be_candidate
From 9f0a6ffa598680e37d571d4d9f43814aefb220b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Tue, 27 Aug 2024 13:43:31 +0100
Subject: [PATCH 02/26] Make lyrics plugin documentation slightly more clear
---
beetsplug/lyrics.py | 6 +-
docs/plugins/lyrics.rst | 130 ++++++++++++++++++++--------------------
2 files changed, 70 insertions(+), 66 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index e779b14d6e..48299bb80a 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -890,8 +890,10 @@ def __init__(self):
"dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
- "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
- "76V-uFL5jks5dNvcGCdarqFjDhP9c",
+ "genius_api_key": (
+ "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
+ "76V-uFL5jks5dNvcGCdarqFjDhP9c"
+ ),
"fallback": None,
"force": False,
"local": False,
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d080b1f940..f034cf47a1 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -2,25 +2,27 @@ Lyrics Plugin
=============
The ``lyrics`` plugin fetches and stores song lyrics from databases on the Web.
-Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_, `LRCLIB`_
-and, optionally, the Google custom search API.
+Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_,
+`LRCLIB`_ and, optionally, the Google Custom Search API.
.. _Genius.com: https://genius.com/
.. _Tekstowo.pl: https://www.tekstowo.pl/
.. _LRCLIB: https://lrclib.net/
-Fetch Lyrics During Import
---------------------------
+Install
+-------
-To automatically fetch lyrics for songs you import, first enable it in your
-configuration (see :ref:`using-plugins`). Then, install ``beets`` with
-``lyrics`` extra
+Firstly, enable ``lyrics`` plugin in your configuration (see
+:ref:`using-plugins`). Then, install ``beets`` with ``lyrics`` extra
.. code-block:: bash
pip install "beets[lyrics]"
+Fetch Lyrics During Import
+--------------------------
+
When importing new files, beets will now fetch lyrics for files that don't
already have them. The lyrics will be stored in the beets database. If the
``import.write`` config option is on, then the lyrics will also be written to
@@ -29,52 +31,52 @@ the files' tags.
Configuration
-------------
-To configure the plugin, make a ``lyrics:`` section in your
-configuration file. The available options are:
+To configure the plugin, make a ``lyrics:`` section in your configuration file.
+Default configuration:
+
+.. code-block:: yaml
+
+ lyrics:
+ auto: yes
+ bing_client_secret: null
+ bing_lang_from: []
+ bing_lang_to: null
+ dist_thresh: 0.11
+ fallback: null
+ force: no
+ google_API_key: null
+ google_engine_ID: 009217259823014548361:lndtuqkycfu
+ sources: [lrclib, google, genius, tekstowo]
+ synced: no
+
+The available options are:
- **auto**: Fetch lyrics automatically during import.
- Default: ``yes``.
- **bing_client_secret**: Your Bing Translation application password
- (to :ref:`lyrics-translation`)
+ (see :ref:`lyrics-translation`)
- **bing_lang_from**: By default all lyrics with a language other than
``bing_lang_to`` are translated. Use a list of lang codes to restrict the set
of source languages to translate.
- Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
- Default: None.
- **dist_thresh**: The maximum distance between the artist and title
combination of the music file and lyrics candidate to consider them a match.
Lower values will make the plugin more strict, higher values will make it
more lenient. This does not apply to the ``lrclib`` backend as it matches
durations.
- Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
- Default: None.
- **force**: By default, beets won't fetch lyrics if the files already have
ones. To instead always fetch lyrics, set the ``force`` option to ``yes``.
- Default: ``no``.
- **google_API_key**: Your Google API key (to enable the Google Custom Search
backend).
- Default: None.
- **google_engine_ID**: The custom search engine to use.
Default: The `beets custom search engine`_, which gathers an updated list of
sources known to be scrapeable.
- **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
- to all available sources.
- Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
- ``google`` source will be automatically deactivated if no ``google_API_key``
- is setup.
- The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if
- BeautifulSoup is installed.
-- **synced**: Prefer synced lyrics over plain lyrics if a source offers them. Currently `lrclib` is the only source that provides them. Default: `no`.
-
-Here's an example of ``config.yaml``::
-
- lyrics:
- fallback: ''
- google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
- google_engine_ID: 009217259823014548361:lndtuqkycfu
+ to all available sources. The ``google`` source will be automatically
+ deactivated if no ``google_API_key`` is setup.
+- **synced**: Prefer synced lyrics over plain lyrics if a source offers them.
+ Currently ``lrclib`` is the only source that provides them.
.. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu
@@ -89,74 +91,74 @@ by that band, and ``beet lyrics`` will get lyrics for my entire library. The
lyrics will be added to the beets database and, if ``import.write`` is on,
embedded into files' metadata.
-The ``-p`` option to the ``lyrics`` command makes it print lyrics out to the
-console so you can view the fetched (or previously-stored) lyrics.
+The ``-p, --print`` option to the ``lyrics`` command makes it print lyrics out
+to the console so you can view the fetched (or previously-stored) lyrics.
-The ``-f`` option forces the command to fetch lyrics, even for tracks that
-already have lyrics. Inversely, the ``-l`` option restricts operations
-to lyrics that are locally available, which show lyrics faster without using
-the network at all.
+The ``-f, --force`` option forces the command to fetch lyrics, even for tracks
+that already have lyrics.
+
+Inversely, the ``-l, --local`` option restricts operations to lyrics that are
+locally available, which show lyrics faster without using the network at all.
Rendering Lyrics into Other Formats
-----------------------------------
-The ``-r directory`` option renders all lyrics as `reStructuredText`_ (ReST)
-documents in ``directory`` (by default, the current directory). That
-directory, in turn, can be parsed by tools like `Sphinx`_ to generate HTML,
-ePUB, or PDF documents.
+The ``-r directory, --write-rest directory`` option renders all lyrics as
+`reStructuredText`_ (ReST) documents in ``directory`` (by default, the current
+directory). That directory, in turn, can be parsed by tools like `Sphinx`_ to
+generate HTML, ePUB, or PDF documents.
-A minimal ``conf.py`` and ``index.rst`` files are created the first time the
+Minimal ``conf.py`` and ``index.rst`` files are created the first time the
command is run. They are not overwritten on subsequent runs, so you can safely
modify these files to customize the output.
-.. _Sphinx: https://www.sphinx-doc.org/
-.. _reStructuredText: http://docutils.sourceforge.net/rst.html
+Sphinx supports various `builders`_, see a few suggestions:
-Sphinx supports various `builders
-`_, but here are a
-few suggestions.
- * Build an HTML version::
+.. admonition:: Build an HTML version
- sphinx-build -b html . _build/html
+ ::
- * Build an ePUB3 formatted file, usable on ebook readers::
+ sphinx-build -b html . _build/html
- sphinx-build -b epub3 . _build/epub
+.. admonition:: Build an ePUB3 formatted file, usable on ebook readers
- * Build a PDF file, which incidentally also builds a LaTeX file::
+ ::
- sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+ sphinx-build -b epub3 . _build/epub
-.. _activate-google-custom-search:
+.. admonition:: Build a PDF file, which incidentally also builds a LaTeX file
+
+ ::
+
+ sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+
+
+.. _Sphinx: https://www.sphinx-doc.org/
+.. _reStructuredText: http://docutils.sourceforge.net/rst.html
+.. _builders: https://www.sphinx-doc.org/en/stable/builders.html
Activate Google Custom Search
------------------------------
You need to `register for a Google API key`_. Set the ``google_API_key``
configuration option to your key.
+
Then add ``google`` to the list of sources in your configuration (or use
default list, which includes it as long as you have an API key).
If you use default ``google_engine_ID``, we recommend limiting the sources to
``google`` as the other sources are already included in the Google results.
-.. _register for a Google API key: https://console.developers.google.com/
-
Optionally, you can `define a custom search engine`_. Get your search engine's
token and use it for your ``google_engine_ID`` configuration option. By
default, beets use a list of sources known to be scrapeable.
-.. _define a custom search engine: https://www.google.com/cse/all
-
Note that the Google custom search API is limited to 100 queries per day.
After that, the lyrics plugin will fall back on other declared data sources.
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
-Activate Genius and Tekstowo.pl Lyrics
---------------------------------------
+.. _register for a Google API key: https://console.developers.google.com/
+.. _define a custom search engine: https://www.google.com/cse/all
-These backends are enabled by default.
.. _lyrics-translation:
@@ -167,6 +169,6 @@ You need to register for a Microsoft Azure Marketplace free account and
to the `Microsoft Translator API`_. Follow the four steps process, specifically
at step 3 enter ``beets`` as *Client ID* and copy/paste the generated
*Client secret* into your ``bing_client_secret`` configuration, alongside
-``bing_lang_to`` target `language code`.
+``bing_lang_to`` target ``language code``.
.. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup
From db345ca3f29306dcf331ec2d7f25dee52ffadcf4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 4 Sep 2024 04:15:46 +0100
Subject: [PATCH 03/26] Centralize requests setup with requests.Session
Improve requests performance with requests.Session which uses connection
pooling for repeated requests to the same host.
Additionally, this centralizes request configuration, making sure that
we use the same timeout and provide beets user agent for all requests.
---
beetsplug/lyrics.py | 68 +++++++++++++++++----------------------------
setup.cfg | 4 +--
2 files changed, 28 insertions(+), 44 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 48299bb80a..96943e538d 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,6 +16,7 @@
from __future__ import annotations
+import atexit
import errno
import itertools
import json
@@ -24,13 +25,12 @@
import re
import struct
import unicodedata
-import warnings
from contextlib import suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlparse
import requests
from typing_extensions import TypedDict
@@ -106,6 +106,22 @@ class NotFoundError(requests.exceptions.HTTPError):
pass
+class TimeoutSession(requests.Session):
+ def request(self, *args, **kwargs):
+ kwargs.setdefault("timeout", 10)
+ return super().request(*args, **kwargs)
+
+
+r_session = TimeoutSession()
+r_session.headers.update({"User-Agent": USER_AGENT})
+
+
+@atexit.register
+def close_session():
+ """Close the requests session on shut down."""
+ r_session.close()
+
+
# Utilities.
@@ -246,21 +262,7 @@ def fetch_url(self, url, **kwargs):
is unreachable.
"""
try:
- # Disable the InsecureRequestWarning that comes from using
- # `verify=false`.
- # https://github.com/kennethreitz/requests/issues/2214
- # We're not overly worried about the NSA MITMing our lyrics scraper
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- r = requests.get(
- url,
- verify=False,
- headers={
- "User-Agent": USER_AGENT,
- },
- timeout=10,
- **kwargs,
- )
+ r = r_session.get(url)
except requests.RequestException as exc:
self._log.debug("lyrics request failed: {0}", exc)
return
@@ -368,9 +370,7 @@ def warn(self, message: str, *args) -> None:
def fetch_json(self, *args, **kwargs):
"""Wrap the request method to raise an exception on HTTP errors."""
- kwargs.setdefault("timeout", 10)
- kwargs.setdefault("headers", {"User-Agent": USER_AGENT})
- r = requests.get(*args, **kwargs)
+ r = r_session.get(*args, **kwargs)
if r.status_code == HTTPStatus.NOT_FOUND:
raise NotFoundError("HTTP Error: Not Found", response=r)
r.raise_for_status()
@@ -532,10 +532,7 @@ class Genius(SearchBackend):
def __init__(self, config, log):
super().__init__(config, log)
self.api_key = config["genius_api_key"].as_str()
- self.headers = {
- "Authorization": "Bearer %s" % self.api_key,
- "User-Agent": USER_AGENT,
- }
+ self.headers = {"Authorization": f"Bearer {self.api_key}"}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@@ -570,18 +567,13 @@ def _search(self, artist, title):
search_url = self.base_url + "/search"
data = {"q": title + " " + artist.lower()}
try:
- response = requests.get(
- search_url,
- params=data,
- headers=self.headers,
- timeout=10,
- )
+ r = r_session.get(search_url, params=data, headers=self.headers)
except requests.RequestException as exc:
self._log.debug("Genius API request failed: {0}", exc)
return None
try:
- return response.json()
+ return r.json()
except ValueError:
return None
@@ -976,13 +968,7 @@ def get_bing_access_token(self):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = json.loads(
- requests.post(
- oauth_url,
- data=urlencode(params),
- timeout=10,
- ).content
- )
+ oauth_token = r_session.post(oauth_url, params=params).json()
if "access_token" in oauth_token:
return "Bearer " + oauth_token["access_token"]
else:
@@ -1199,10 +1185,8 @@ def append_translation(self, text, to_lang):
"https://api.microsofttranslator.com/v2/Http.svc/"
"Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
)
- r = requests.get(
- url,
- headers={"Authorization ": self.bing_auth_token},
- timeout=10,
+ r = r_session.get(
+ url, headers={"Authorization": self.bing_auth_token}
)
if r.status_code != 200:
self._log.debug(
diff --git a/setup.cfg b/setup.cfg
index 15ca23f658..8e3d7e3b82 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,8 +21,8 @@ omit = beets/test/*
precision = 2
skip_empty = true
show_missing = true
-exclude_lines =
- pragma: no cover
+exclude_also =
+ @atexit.register
if TYPE_CHECKING
if typing.TYPE_CHECKING
raise AssertionError
From eaa54441f84f37ad77ba0037b501fb52721089c5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 16:14:15 +0100
Subject: [PATCH 04/26] Centralise request error handling
---
beetsplug/lyrics.py | 228 +++++++++++++++---------------------
test/plugins/test_lyrics.py | 62 ++++++----
2 files changed, 136 insertions(+), 154 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 96943e538d..240c2c2c01 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -19,13 +19,12 @@
import atexit
import errno
import itertools
-import json
import math
import os.path
import re
import struct
import unicodedata
-from contextlib import suppress
+from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
@@ -108,8 +107,14 @@ class NotFoundError(requests.exceptions.HTTPError):
class TimeoutSession(requests.Session):
def request(self, *args, **kwargs):
+ """Wrap the request method to raise an exception on HTTP errors."""
kwargs.setdefault("timeout", 10)
- return super().request(*args, **kwargs)
+ r = super().request(*args, **kwargs)
+ if r.status_code == HTTPStatus.NOT_FOUND:
+ raise NotFoundError("HTTP Error: Not Found", response=r)
+ r.raise_for_status()
+
+ return r
r_session = TimeoutSession()
@@ -250,28 +255,36 @@ def try_parse_html(html, **kwargs):
return None
-class Backend:
+class RequestHandler:
+ _log: beets.logging.Logger
+
+ def fetch_text(self, url: str, **kwargs) -> str:
+ """Return text / HTML data from the given URL."""
+ self._log.debug("Fetching HTML from {}", url)
+ return r_session.get(url, **kwargs).text
+
+ def fetch_json(self, url: str, **kwargs):
+ """Return JSON data from the given URL."""
+ self._log.debug("Fetching JSON from {}", url)
+ return r_session.get(url, **kwargs).json()
+
+ @contextmanager
+ def handle_request(self) -> Iterator[None]:
+ try:
+ yield
+ except requests.JSONDecodeError:
+ self._log.warning("Could not decode response JSON data")
+ except requests.RequestException as exc:
+ self._log.warning("Request error: {}", exc)
+
+
+class Backend(RequestHandler):
REQUIRES_BS = False
def __init__(self, config, log):
self._log = log
self.config = config
- def fetch_url(self, url, **kwargs):
- """Retrieve the content at a given URL, or return None if the source
- is unreachable.
- """
- try:
- r = r_session.get(url)
- except requests.RequestException as exc:
- self._log.debug("lyrics request failed: {0}", exc)
- return
- if r.status_code == requests.codes.ok:
- return r.text
- else:
- self._log.debug("failed to fetch: {0} ({1})", url, r.status_code)
- return None
-
def fetch(
self, artist: str, title: str, album: str, length: int
) -> str | None:
@@ -368,15 +381,6 @@ def warn(self, message: str, *args) -> None:
"""Log a warning message with the class name."""
self._log.warning(f"{self.__class__.__name__}: {message}", *args)
- def fetch_json(self, *args, **kwargs):
- """Wrap the request method to raise an exception on HTTP errors."""
- r = r_session.get(*args, **kwargs)
- if r.status_code == HTTPStatus.NOT_FOUND:
- raise NotFoundError("HTTP Error: Not Found", response=r)
- r.raise_for_status()
-
- return r.json()
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
@@ -414,13 +418,7 @@ def fetch(
if item := self.pick_best_match(candidates):
return item.get_text(self.config["synced"])
except StopIteration:
- pass
- except requests.JSONDecodeError:
- self.warn("Could not decode response JSON data")
- except requests.RequestException as exc:
- self.warn("Request error: {}", exc)
-
- return None
+ return None
class DirectBackend(Backend):
@@ -460,9 +458,7 @@ def encode(cls, text: str) -> str:
def fetch(self, artist: str, title: str, *_) -> str | None:
url = self.build_url(artist, title)
- html = self.fetch_url(url)
- if not html:
- return None
+ html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
self._log.warning(
"we are blocked at MusixMatch: url %s failed" % url
@@ -528,6 +524,7 @@ class Genius(SearchBackend):
"""
base_url = "https://api.genius.com"
+ search_url = f"{base_url}/search"
def __init__(self, config, log):
super().__init__(config, log)
@@ -550,10 +547,9 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
- html = self.fetch_url(result["url"])
- if not html:
- return None
- return self._scrape_lyrics_from_html(html)
+ return self._scrape_lyrics_from_html(
+ self.fetch_text(result["url"])
+ )
return None
@@ -564,29 +560,20 @@ def _search(self, artist, title):
:returns: json response
"""
- search_url = self.base_url + "/search"
- data = {"q": title + " " + artist.lower()}
- try:
- r = r_session.get(search_url, params=data, headers=self.headers)
- except requests.RequestException as exc:
- self._log.debug("Genius API request failed: {0}", exc)
- return None
-
- try:
- return r.json()
- except ValueError:
- return None
+ return self.fetch_json(
+ self.search_url,
+ params={"q": f"{title} {artist.lower()}"},
+ headers=self.headers,
+ )
def replace_br(self, lyrics_div):
for br in lyrics_div.find_all("br"):
br.replace_with("\n")
- def _scrape_lyrics_from_html(self, html):
+ def _scrape_lyrics_from_html(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
soup = try_parse_html(html)
- if not soup:
- return
# Remove script tags that they put in the middle of the lyrics.
[h.extract() for h in soup("script")]
@@ -657,10 +644,12 @@ def encode(cls, text: str) -> str:
return cls.non_alpha_to_underscore(unidecode(text.lower()))
def fetch(self, artist: str, title: str, *_) -> str | None:
- if html := self.fetch_url(self.build_url(artist, title)):
- return self.extract_lyrics(html)
-
- return None
+ # We are expecting to receive a 404 since we are guessing the URL.
+ # Thus suppress the error so that it does not end up in the logs.
+ with suppress(NotFoundError):
+ return self.extract_lyrics(
+ self.fetch_text(self.build_url(artist, title))
+ )
def extract_lyrics(self, html: str) -> str | None:
html = _scrape_strip_cruft(html)
@@ -714,7 +703,7 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"
\s*
", "\n", html)
-def scrape_lyrics_from_html(html):
+def scrape_lyrics_from_html(html: str) -> str | None:
"""Scrape lyrics from a URL. If no lyrics can be found, return None
instead.
"""
@@ -734,8 +723,6 @@ def is_text_notcode(text):
# extract all long text blocks that are not code
soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
- if not soup:
- return None
# Get the longest text element (if any).
strings = sorted(soup.stripped_strings, key=len, reverse=True)
@@ -828,39 +815,26 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
"q": f"{artist} {title}",
}
- data = self.fetch_url(self.SEARCH_URL, params=params)
- if not data:
- self._log.debug("google backend returned no data")
- return None
- try:
- data = json.loads(data)
- except ValueError as exc:
- self._log.debug("google backend returned malformed JSON: {}", exc)
- if "error" in data:
- reason = data["error"]["errors"][0]["reason"]
- self._log.debug("google backend error: {0}", reason)
- return None
-
check_candidate = partial(self.is_page_candidate, artist, title)
- for item in data.get("items", []):
+ for item in self.fetch_json(self.SEARCH_URL, params=params).get(
+ "items", []
+ ):
url_link = item["link"]
if not check_candidate(url_link, item.get("title", "")):
continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
+ with self.handle_request():
+ lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
+ if not lyrics:
+ continue
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug("got lyrics from {0}", item["displayLink"])
+ return lyrics
return None
-class LyricsPlugin(plugins.BeetsPlugin):
+class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
SOURCE_BACKENDS = {
"google": Google,
@@ -931,7 +905,6 @@ def __init__(self):
self.config["bing_lang_from"] = [
x.lower() for x in self.config["bing_lang_from"].as_str_seq()
]
- self.bing_auth_token = None
if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
self._log.warning(
@@ -959,7 +932,8 @@ def sanitize_bs_sources(self, sources):
return enabled_sources
- def get_bing_access_token(self):
+ @cached_property
+ def bing_access_token(self) -> str | None:
params = {
"client_id": "beets",
"client_secret": self.config["bing_client_secret"],
@@ -968,14 +942,11 @@ def get_bing_access_token(self):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = r_session.post(oauth_url, params=params).json()
- if "access_token" in oauth_token:
- return "Bearer " + oauth_token["access_token"]
- else:
- self._log.warning(
- "Could not get Bing Translate API access token."
- ' Check your "bing_client_secret" password'
- )
+ with self.handle_request():
+ r = r_session.post(oauth_url, params=params)
+ return r.json()["access_token"]
+
+ return None
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
@@ -1164,44 +1135,39 @@ def get_lyrics(self, artist: str, title: str, *args) -> str | None:
None if no lyrics were found.
"""
for backend in self.backends:
- lyrics = backend.fetch(artist, title, *args)
- if lyrics:
- self._log.debug(
- "got lyrics from backend: {0}", backend.__class__.__name__
- )
- return _scrape_strip_cruft(lyrics, True)
+ with backend.handle_request():
+ if lyrics := backend.fetch(artist, title, *args):
+ self._log.debug(
+ "got lyrics from backend: {0}",
+ backend.__class__.__name__,
+ )
+ return _scrape_strip_cruft(lyrics, True)
return None
def append_translation(self, text, to_lang):
from xml.etree import ElementTree
- if not self.bing_auth_token:
- self.bing_auth_token = self.get_bing_access_token()
- if self.bing_auth_token:
- # Extract unique lines to limit API request size per song
- text_lines = set(text.split("\n"))
- url = (
- "https://api.microsofttranslator.com/v2/Http.svc/"
- "Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
+ if not (token := self.bing_access_token):
+ self._log.warning(
+ "Could not get Bing Translate API access token. "
+ "Check your 'bing_client_secret' password."
)
- r = r_session.get(
- url, headers={"Authorization": self.bing_auth_token}
+ return text
+
+ # Extract unique lines to limit API request size per song
+ lines = text.split("\n")
+ unique_lines = set(lines)
+ url = "https://api.microsofttranslator.com/v2/Http.svc/Translate"
+ with self.handle_request():
+ text = self.fetch_text(
+ url,
+ headers={"Authorization": f"Bearer {token}"},
+ params={"text": "|".join(unique_lines), "to": to_lang},
)
- if r.status_code != 200:
- self._log.debug(
- "translation API error {}: {}", r.status_code, r.text
- )
- if "token has expired" in r.text:
- self.bing_auth_token = None
- return self.append_translation(text, to_lang)
- return text
- lines_translated = ElementTree.fromstring(
- r.text.encode("utf-8")
- ).text
- # Use a translation mapping dict to build resulting lyrics
- translations = dict(zip(text_lines, lines_translated.split("|")))
- result = ""
- for line in text.split("\n"):
- result += "{} / {}\n".format(line, translations[line])
- return result
+ if translated := ElementTree.fromstring(text.encode("utf-8")).text:
+ # Use a translation mapping dict to build resulting lyrics
+ translations = dict(zip(unique_lines, translated.split("|")))
+ return "".join(f"{ln} / {translations[ln]}\n" for ln in lines)
+
+ return text
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 8123996985..b77054b52f 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -202,7 +202,7 @@ def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
-class LyricsBackendTest(PluginMixin):
+class LyricsPluginMixin(PluginMixin):
plugin = "lyrics"
@pytest.fixture
@@ -218,6 +218,42 @@ def lyrics_plugin(self, backend_name, plugin_config):
return lyrics.LyricsPlugin()
+
+class TestLyricsPlugin(LyricsPluginMixin):
+ @pytest.fixture
+ def backend_name(self):
+ """Return lyrics configuration to test."""
+ return "lrclib"
+
+ @pytest.mark.parametrize(
+ "request_kwargs, expected_log_match",
+ [
+ (
+ {"status_code": HTTPStatus.BAD_GATEWAY},
+ r"lyrics: Request error: 502",
+ ),
+ ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"),
+ ],
+ )
+ def test_error_handling(
+ self,
+ requests_mock,
+ lyrics_plugin,
+ caplog,
+ request_kwargs,
+ expected_log_match,
+ ):
+ """Errors are logged with the plugin name."""
+ requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs)
+
+ assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
+ assert caplog.messages
+ last_log = caplog.messages[-1]
+ assert last_log
+ assert re.search(expected_log_match, last_log, re.I)
+
+
+class LyricsBackendTest(LyricsPluginMixin):
@pytest.fixture
def backend(self, lyrics_plugin):
"""Return a lyrics backend instance."""
@@ -399,13 +435,9 @@ def backend_name(self):
return "lrclib"
@pytest.fixture
- def request_kwargs(self, response_data):
- return {"json": response_data}
-
- @pytest.fixture
- def fetch_lyrics(self, backend, requests_mock, request_kwargs):
+ def fetch_lyrics(self, backend, requests_mock, response_data):
requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND)
- requests_mock.get(backend.SEARCH_URL, **request_kwargs)
+ requests_mock.get(backend.SEARCH_URL, json=response_data)
return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION)
@@ -478,19 +510,3 @@ def test_synced_config_option(self, fetch_lyrics, expected_lyrics):
@pytest.mark.parametrize("plugin_config", [{"synced": True}])
def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics):
assert fetch_lyrics() == expected_lyrics
-
- @pytest.mark.parametrize(
- "request_kwargs, expected_log_match",
- [
- (
- {"status_code": HTTPStatus.BAD_GATEWAY},
- r"LRCLib: Request error: 502",
- ),
- ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
- ],
- )
- def test_error(self, caplog, fetch_lyrics, expected_log_match):
- assert fetch_lyrics() is None
- assert caplog.messages
- assert (last_log := caplog.messages[-1])
- assert re.search(expected_log_match, last_log, re.I)
From 65557de0f25f3b01b8f6aee5b210d67270555957 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 07:35:24 +0100
Subject: [PATCH 05/26] Include class name in the log messages
---
beetsplug/lyrics.py | 113 +++++++++++++++++++-----------------
test/plugins/test_lyrics.py | 6 +-
2 files changed, 64 insertions(+), 55 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 240c2c2c01..06352ba994 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -28,8 +28,8 @@
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
-from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlparse
+from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
+from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
@@ -58,6 +58,8 @@
except ImportError:
HAS_LANGDETECT = False
+JSONDict = dict[str, Any]
+
DIV_RE = re.compile(r"<(/?)div>?", re.I)
COMMENT_RE = re.compile(r"", re.S)
TAG_RE = re.compile(r"<[^>]*>")
@@ -258,14 +260,37 @@ def try_parse_html(html, **kwargs):
class RequestHandler:
_log: beets.logging.Logger
- def fetch_text(self, url: str, **kwargs) -> str:
+ def debug(self, message: str, *args) -> None:
+ """Log a debug message with the class name."""
+ self._log.debug(f"{self.__class__.__name__}: {message}", *args)
+
+ def info(self, message: str, *args) -> None:
+ """Log an info message with the class name."""
+ self._log.info(f"{self.__class__.__name__}: {message}", *args)
+
+ def warn(self, message: str, *args) -> None:
+ """Log warning with the class name."""
+ self._log.warning(f"{self.__class__.__name__}: {message}", *args)
+
+ @staticmethod
+ def format_url(url: str, params: JSONDict | None) -> str:
+ if not params:
+ return url
+
+ return f"{url}?{urlencode(params)}"
+
+ def fetch_text(
+ self, url: str, params: JSONDict | None = None, **kwargs
+ ) -> str:
"""Return text / HTML data from the given URL."""
- self._log.debug("Fetching HTML from {}", url)
+ url = self.format_url(url, params)
+ self.debug("Fetching HTML from {}", url)
return r_session.get(url, **kwargs).text
- def fetch_json(self, url: str, **kwargs):
+ def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
"""Return JSON data from the given URL."""
- self._log.debug("Fetching JSON from {}", url)
+ url = self.format_url(url, params)
+ self.debug("Fetching JSON from {}", url)
return r_session.get(url, **kwargs).json()
@contextmanager
@@ -273,9 +298,9 @@ def handle_request(self) -> Iterator[None]:
try:
yield
except requests.JSONDecodeError:
- self._log.warning("Could not decode response JSON data")
+ self.warn("Could not decode response JSON data")
except requests.RequestException as exc:
- self._log.warning("Request error: {}", exc)
+ self.warn("Request error: {}", exc)
class Backend(RequestHandler):
@@ -377,10 +402,6 @@ class LRCLib(Backend):
GET_URL = f"{BASE_URL}/get"
SEARCH_URL = f"{BASE_URL}/search"
- def warn(self, message: str, *args) -> None:
- """Log a warning message with the class name."""
- self._log.warning(f"{self.__class__.__name__}: {message}", *args)
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
@@ -412,13 +433,12 @@ def fetch(
"""Fetch lyrics text for the given song data."""
evaluate_item = partial(LRCLyrics.make, target_duration=length)
- try:
- for group in self.fetch_candidates(artist, title, album, length):
- candidates = [evaluate_item(item) for item in group]
- if item := self.pick_best_match(candidates):
- return item.get_text(self.config["synced"])
- except StopIteration:
- return None
+ for group in self.fetch_candidates(artist, title, album, length):
+ candidates = [evaluate_item(item) for item in group]
+ if item := self.pick_best_match(candidates):
+ return item.get_text(self.config["synced"])
+
+ return None
class DirectBackend(Backend):
@@ -460,9 +480,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
- self._log.warning(
- "we are blocked at MusixMatch: url %s failed" % url
- )
+ self.warn("Failed: Blocked IP address")
return None
html_parts = html.split('
str | None:
then attempt to scrape that url for the lyrics.
"""
json = self._search(artist, title)
- if not json:
- self._log.debug("Genius API request returned invalid JSON")
- return None
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
@@ -585,7 +600,7 @@ def _scrape_lyrics_from_html(self, html: str) -> str | None:
lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
if not lyrics_divs:
- self._log.debug("Received unusual song page html")
+ self.debug("Received unusual song page html")
return self._try_extracting_lyrics_from_non_data_lyrics_container(
soup
)
@@ -608,10 +623,10 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
class_=re.compile("LyricsPlaceholder__Message"),
string="This song is an instrumental",
):
- self._log.debug("Detected instrumental")
+ self.debug("Detected instrumental")
return INSTRUMENTAL_LYRICS
else:
- self._log.debug("Couldn't scrape page using known layouts")
+ self.debug("Couldn't scrape page using known layouts")
return None
lyrics_div = verse_div.parent
@@ -651,6 +666,8 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
self.fetch_text(self.build_url(artist, title))
)
+ return None
+
def extract_lyrics(self, html: str) -> str | None:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
@@ -744,7 +761,7 @@ def is_lyrics(self, text, artist=None):
bad_triggers_occ = []
nb_lines = text.count("\n")
if nb_lines <= 1:
- self._log.debug("Ignoring too short lyrics '{0}'", text)
+ self.debug("Ignoring too short lyrics '{}'", text)
return False
elif nb_lines < 5:
bad_triggers_occ.append("too_short")
@@ -763,7 +780,7 @@ def is_lyrics(self, text, artist=None):
)
if bad_triggers_occ:
- self._log.debug("Bad triggers detected: {0}", bad_triggers_occ)
+ self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
def slugify(self, text):
@@ -776,7 +793,7 @@ def slugify(self, text):
text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
except UnicodeDecodeError:
- self._log.exception("Failing to normalize '{0}'", text)
+ self.debug("Failed to normalize '{}'", text)
return text
BY_TRANS = ["by", "par", "de", "von"]
@@ -828,7 +845,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
continue
if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
+ self.debug("Got lyrics from {}", item["displayLink"])
return lyrics
return None
@@ -897,9 +914,7 @@ def __init__(self):
# configuration includes `google`. This way, the source
# is silent by default but can be enabled just by
# setting an API key.
- self._log.debug(
- "Disabling google source: " "no API key configured."
- )
+ self.debug("Disabling google source: " "no API key configured.")
sources.remove("google")
self.config["bing_lang_from"] = [
@@ -907,15 +922,14 @@ def __init__(self):
]
if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
- self._log.warning(
- "To use bing translations, you need to "
- "install the langdetect module. See the "
- "documentation for further details."
+ self.warn(
+ "To use bing translations, you need to install the langdetect "
+ "module. See the documentation for further details."
)
self.backends = [
- self.SOURCE_BACKENDS[source](self.config, self._log)
- for source in sources
+ self.SOURCE_BACKENDS[s](self.config, self._log.getChild(s))
+ for s in sources
]
def sanitize_bs_sources(self, sources):
@@ -946,8 +960,6 @@ def bing_access_token(self) -> str | None:
r = r_session.post(oauth_url, params=params)
return r.json()["access_token"]
- return None
-
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
cmd.parser.add_option(
@@ -1092,7 +1104,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self._log.info("lyrics already present: {0}", item)
+ self.info("Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1108,7 +1120,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self._log.info("fetched lyrics: {0}", item)
+ self.info("Lyrics found: {}", item)
if HAS_LANGDETECT and self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
@@ -1119,7 +1131,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics, self.config["bing_lang_to"]
)
else:
- self._log.info("lyrics not found: {0}", item)
+ self.info("Lyrics not found: {}", item)
fallback = self.config["fallback"].get()
if fallback:
lyrics = fallback
@@ -1134,13 +1146,10 @@ def get_lyrics(self, artist: str, title: str, *args) -> str | None:
"""Fetch lyrics, trying each source in turn. Return a string or
None if no lyrics were found.
"""
+ self.info("Fetching lyrics for {} - {}", artist, title)
for backend in self.backends:
with backend.handle_request():
if lyrics := backend.fetch(artist, title, *args):
- self._log.debug(
- "got lyrics from backend: {0}",
- backend.__class__.__name__,
- )
return _scrape_strip_cruft(lyrics, True)
return None
@@ -1149,7 +1158,7 @@ def append_translation(self, text, to_lang):
from xml.etree import ElementTree
if not (token := self.bing_access_token):
- self._log.warning(
+ self.warn(
"Could not get Bing Translate API access token. "
"Check your 'bing_client_secret' password."
)
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index b77054b52f..817eb4af92 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -230,9 +230,9 @@ def backend_name(self):
[
(
{"status_code": HTTPStatus.BAD_GATEWAY},
- r"lyrics: Request error: 502",
+ r"LRCLib: Request error: 502",
),
- ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"),
+ ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
],
)
def test_error_handling(
@@ -243,7 +243,7 @@ def test_error_handling(
request_kwargs,
expected_log_match,
):
- """Errors are logged with the plugin name."""
+ """Errors are logged with the backend name."""
requests_mock.get(lyrics.LRCLib.SEARCH_URL, **request_kwargs)
assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
From bf00c26494e693d4f1e686c25c4eb198aeacd0e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 11 Sep 2024 07:51:43 +0100
Subject: [PATCH 06/26] Leave a single chef in the kitchen
---
beetsplug/lyrics.py | 79 ++++++++++---------------------------
test/plugins/test_lyrics.py | 6 +--
2 files changed, 23 insertions(+), 62 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 06352ba994..168b00821d 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -44,8 +44,7 @@
from beets.library import Item
try:
- import bs4
- from bs4 import SoupStrainer
+ from bs4 import BeautifulSoup
HAS_BEAUTIFUL_SOUP = True
except ImportError:
@@ -246,17 +245,6 @@ def slug(text):
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
-if HAS_BEAUTIFUL_SOUP:
-
- def try_parse_html(html, **kwargs):
- return bs4.BeautifulSoup(html, "html.parser", **kwargs)
-
-else:
-
- def try_parse_html(html, **kwargs):
- return None
-
-
class RequestHandler:
_log: beets.logging.Logger
@@ -562,9 +550,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
- return self._scrape_lyrics_from_html(
- self.fetch_text(result["url"])
- )
+ return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
return None
@@ -581,17 +567,9 @@ def _search(self, artist, title):
headers=self.headers,
)
- def replace_br(self, lyrics_div):
- for br in lyrics_div.find_all("br"):
- br.replace_with("\n")
-
- def _scrape_lyrics_from_html(self, html: str) -> str | None:
+ def scrape_lyrics(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
-
- soup = try_parse_html(html)
-
- # Remove script tags that they put in the middle of the lyrics.
- [h.extract() for h in soup("script")]
+ soup = get_soup(html)
# Most of the time, the page contains a div with class="lyrics" where
# all of the lyrics can be found already correctly formatted
@@ -606,7 +584,6 @@ def _scrape_lyrics_from_html(self, html: str) -> str | None:
)
lyrics = ""
for lyrics_div in lyrics_divs:
- self.replace_br(lyrics_div)
lyrics += lyrics_div.get_text() + "\n\n"
while lyrics[-1] == "\n":
lyrics = lyrics[:-1]
@@ -630,7 +607,6 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
return None
lyrics_div = verse_div.parent
- self.replace_br(lyrics_div)
ads = lyrics_div.find_all(
"div", class_=re.compile("InreadAd__Container")
@@ -662,17 +638,14 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
- return self.extract_lyrics(
+ return self.scrape_lyrics(
self.fetch_text(self.build_url(artist, title))
)
return None
- def extract_lyrics(self, html: str) -> str | None:
- html = _scrape_strip_cruft(html)
- html = _scrape_merge_paragraphs(html)
-
- soup = try_parse_html(html)
+ def scrape_lyrics(self, html: str) -> str | None:
+ soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
@@ -720,33 +693,11 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"
\s*
", "\n", html)
-def scrape_lyrics_from_html(html: str) -> str | None:
- """Scrape lyrics from a URL. If no lyrics can be found, return None
- instead.
- """
-
- def is_text_notcode(text):
- if not text:
- return False
- length = len(text)
- return (
- length > 20
- and text.count(" ") > length / 25
- and (text.find("{") == -1 or text.find(";") == -1)
- )
-
+def get_soup(html: str) -> BeautifulSoup:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
- # extract all long text blocks that are not code
- soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- else:
- return None
+ return BeautifulSoup(html, "html.parser")
class Google(SearchBackend):
@@ -754,6 +705,16 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
+ @staticmethod
+ def scrape_lyrics(html: str) -> str | None:
+ soup = get_soup(html)
+
+ # Get the longest text element (if any).
+ strings = sorted(soup.stripped_strings, key=len, reverse=True)
+ if strings:
+ return strings[0]
+ return None
+
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@@ -840,7 +801,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
if not check_candidate(url_link, item.get("title", "")):
continue
with self.handle_request():
- lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
+ lyrics = self.scrape_lyrics(self.fetch_text(url_link))
if not lyrics:
continue
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 817eb4af92..d412d318bc 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -328,7 +328,7 @@ def file_name(self):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = lyrics.scrape_lyrics_from_html(lyrics_html).lower()
+ result = backend.scrape_lyrics(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
@@ -390,7 +390,7 @@ def backend_name(self):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend._scrape_lyrics_from_html(lyrics_html) or ""
+ result = backend.scrape_lyrics(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -411,7 +411,7 @@ def backend_name(self):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
From 469db00ee23711660880ece870f6d09a755b2786 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sat, 19 Oct 2024 03:30:41 +0100
Subject: [PATCH 07/26] Do not try to strip cruft from the parsed lyrics text.
Having removed it I fuond that only the Genius lyrics changed: it had en
extra new line. Thus I defined a function 'collapse_newlines' which now
gets called for the Genius lyrics.
---
beetsplug/lyrics.py | 24 +++++++++++-------------
test/plugins/test_lyrics.py | 5 ++---
2 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 168b00821d..13b786f4fb 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -59,9 +59,6 @@
JSONDict = dict[str, Any]
-DIV_RE = re.compile(r"<(/?)div>?", re.I)
-COMMENT_RE = re.compile(r"", re.S)
-TAG_RE = re.compile(r"<[^>]*>")
BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -549,8 +546,11 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
result = hit["result"]
- if check(result["primary_artist"]["name"], result["title"]):
- return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
+ url = hit["result"]["url"]
+ if check(result["primary_artist"]["name"], result["title"]) and (
+ lyrics := self.scrape_lyrics(self.fetch_text(url))
+ ):
+ return collapse_newlines(lyrics)
return None
@@ -667,7 +667,10 @@ def remove_credits(text):
return text
-def _scrape_strip_cruft(html, plain_text_out=False):
+collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
+
+
+def _scrape_strip_cruft(html: str) -> str:
"""Clean up HTML"""
html = unescape(html)
@@ -679,13 +682,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
html = re.sub("
two
three"
From 24a534a7f593b9e926e24dac6d3371d8d5e9670b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 12:11:01 +0100
Subject: [PATCH 08/26] Use a single slug implementation
Tidy up 'Google.is_page_candidate' method and remove 'Google.sluggify'
method which was a duplicate of 'slug'.
Since 'GeniusFetchTest' only tested whether the artist name is cleaned
up (the rest of the functionality is patched), remove it and move its
test cases to the 'test_slug' test.
---
beetsplug/lyrics.py | 33 ++++++++-------------------------
test/plugins/test_lyrics.py | 4 ----
2 files changed, 8 insertions(+), 29 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 13b786f4fb..5f28981dca 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -23,7 +23,6 @@
import os.path
import re
import struct
-import unicodedata
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
@@ -224,7 +223,7 @@ def generate_alternatives(string, patterns):
return itertools.product(artists, multi_titles)
-def slug(text):
+def slug(text: str) -> str:
"""Make a URL-safe, human-readable version of the given text
This will do the following:
@@ -234,10 +233,6 @@ def slug(text):
3. strip whitespace
4. replace other non-word characters with dashes
5. strip extra dashes
-
- This somewhat duplicates the :func:`Google.slugify` function but
- slugify is not as generic as this one, which can be reused
- elsewhere.
"""
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
@@ -742,19 +737,6 @@ def is_lyrics(self, text, artist=None):
self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
- def slugify(self, text):
- """Normalize a string and remove non-alphanumeric characters."""
- text = re.sub(r"[-'_\s]", "_", text)
- text = re.sub(r"_+", "_", text).strip("_")
- pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses
- text = re.sub(pat, r"\g<1>", text).strip()
- try:
- text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
- text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
- except UnicodeDecodeError:
- self.debug("Failed to normalize '{}'", text)
- return text
-
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
@@ -764,23 +746,24 @@ def is_page_candidate(
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
- title_slug = self.slugify(title.lower())
- url_title_slug = self.slugify(url_title.lower())
+ title_slug = slug(title)
+ url_title_slug = slug(url_title)
if title_slug in url_title_slug:
return True
- artist = self.slugify(artist.lower())
+ artist = slug(artist)
sitename = urlparse(url_link).netloc
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
- [by + "_" + artist for by in self.BY_TRANS]
+ [by + "-" + artist for by in self.BY_TRANS]
+ [artist, sitename, sitename.replace("www.", "")]
+ self.LYRICS_TRANS
)
- tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
+ song_title = re.sub(
+ "(%s)" % "|".join(tokens), "", url_title_slug
+ ).strip("-")
return self.check_match(artist, title_slug, artist, song_title)
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 9e24d46c23..1d264338a2 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -370,10 +370,6 @@ def test_is_page_candidate(
def test_bad_lyrics(self, backend, lyrics):
assert not backend.is_lyrics(lyrics)
- def test_slugify(self, backend):
- text = "http://site.com/\xe7afe-au_lait(boisson)"
- assert backend.slugify(text) == "http://site.com/cafe_au_lait"
-
class TestGeniusLyrics(LyricsBackendTest):
@pytest.fixture(scope="class")
From 8c8958fa97c17754caadca5f7f02d6ec7517aefb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Thu, 19 Sep 2024 20:00:44 +0100
Subject: [PATCH 09/26] lyrics: Add symbols for better visual feedback in the
logs
---
beetsplug/lyrics.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 5f28981dca..f79d1c03bd 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1046,7 +1046,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self.info("Lyrics already present: {}", item)
+ self.info("🔵 Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1062,7 +1062,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self.info("Lyrics found: {}", item)
+ self.info("🟢 Found lyrics: {0}", item)
if HAS_LANGDETECT and self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
@@ -1073,7 +1073,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics, self.config["bing_lang_to"]
)
else:
- self.info("Lyrics not found: {}", item)
+ self.info("🔴 Lyrics not found: {}", item)
fallback = self.config["fallback"].get()
if fallback:
lyrics = fallback
From d756fdd8fb51708155abcf5049399f73d1d86cef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 27 Sep 2024 22:57:20 +0100
Subject: [PATCH 10/26] lyrics: Do not write item unless lyrics have changed
---
beetsplug/lyrics.py | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index f79d1c03bd..913d3150a0 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1074,15 +1074,13 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
)
else:
self.info("🔴 Lyrics not found: {}", item)
- fallback = self.config["fallback"].get()
- if fallback:
- lyrics = fallback
- else:
- return
- item.lyrics = lyrics
- if write:
- item.try_write()
- item.store()
+ lyrics = self.config["fallback"].get()
+
+ if lyrics not in {None, item.lyrics}:
+ item.lyrics = lyrics
+ if write:
+ item.try_write()
+ item.store()
def get_lyrics(self, artist: str, title: str, *args) -> str | None:
"""Fetch lyrics, trying each source in turn. Return a string or
From d0e341577ca475df83a5aef247e03f6e158f0e4f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Mon, 7 Oct 2024 10:33:01 +0100
Subject: [PATCH 11/26] Replace custom unescape implementation by html.unescape
---
beetsplug/lyrics.py | 23 +----------------------
1 file changed, 1 insertion(+), 22 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 913d3150a0..7f7605b91e 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -22,10 +22,10 @@
import math
import os.path
import re
-import struct
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
+from html import unescape
from http import HTTPStatus
from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlencode, urlparse
@@ -127,27 +127,6 @@ def close_session():
# Utilities.
-def unichar(i):
- try:
- return chr(i)
- except ValueError:
- return struct.pack("i", i).decode("utf-32")
-
-
-def unescape(text):
- """Resolve xx; HTML entities (and some others)."""
- if isinstance(text, bytes):
- text = text.decode("utf-8", "ignore")
- out = text.replace(" ", " ")
-
- def replchar(m):
- num = m.group(1)
- return unichar(int(num))
-
- out = re.sub("(\\d+);", replchar, out)
- return out
-
-
def extract_text_between(html, start_marker, end_marker):
try:
_, html = html.split(start_marker, 1)
From b4ee56840ac6ef2f4987f2b36aecda17b2b2cd9b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Mon, 7 Oct 2024 18:24:22 +0100
Subject: [PATCH 12/26] Remove extract_text_between
---
beetsplug/lyrics.py | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 7f7605b91e..946ddd6c47 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -127,15 +127,6 @@ def close_session():
# Utilities.
-def extract_text_between(html, start_marker, end_marker):
- try:
- _, html = html.split(start_marker, 1)
- html, _ = html.split(end_marker, 1)
- except ValueError:
- return ""
- return html
-
-
def search_pairs(item):
"""Yield a pairs of artists and titles to search for.
@@ -445,7 +436,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
# Sometimes lyrics come in 2 or more parts
lyrics_parts = []
for html_part in html_parts:
- lyrics_parts.append(extract_text_between(html_part, ">", "
"))
+ lyrics_parts.append(re.sub(r"^[^>]+>|
.*", "", html_part))
lyrics = "\n".join(lyrics_parts)
lyrics = lyrics.strip(',"').replace("\\n", "\n")
# another odd case: sometimes only that string remains, for
From 2f37c5aa97a7a4c4d9730d09588521adc394f4ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 9 Oct 2024 12:12:09 +0100
Subject: [PATCH 13/26] Genius: refactor and simplify
---
beetsplug/_typing.py | 86 +++++++++++++++++++++++++
beetsplug/lyrics.py | 123 ++++++++++--------------------------
test/plugins/test_lyrics.py | 2 +-
3 files changed, 121 insertions(+), 90 deletions(-)
create mode 100644 beetsplug/_typing.py
diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
new file mode 100644
index 0000000000..f673b85bd5
--- /dev/null
+++ b/beetsplug/_typing.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from typing import Any
+
+from typing_extensions import TypeAlias, TypedDict
+
+JSONDict: TypeAlias = "dict[str, Any]"
+
+
+class LRCLibAPI:
+ class Item(TypedDict):
+ """Lyrics data item returned by the LRCLib API."""
+
+ id: int
+ name: str
+ trackName: str
+ artistName: str
+ albumName: str
+ duration: float | None
+ instrumental: bool
+ plainLyrics: str
+ syncedLyrics: str | None
+
+
+class GeniusAPI:
+ """Genius API data types.
+
+ This documents *only* the fields that are used in the plugin.
+ :attr:`SearchResult` is an exception, since I thought some of the other
+ fields might be useful in the future.
+ """
+
+ class DateComponents(TypedDict):
+ year: int
+ month: int
+ day: int
+
+ class Artist(TypedDict):
+ api_path: str
+ header_image_url: str
+ id: int
+ image_url: str
+ is_meme_verified: bool
+ is_verified: bool
+ name: str
+ url: str
+
+ class Stats(TypedDict):
+ unreviewed_annotations: int
+ hot: bool
+
+ class SearchResult(TypedDict):
+ annotation_count: int
+ api_path: str
+ artist_names: str
+ full_title: str
+ header_image_thumbnail_url: str
+ header_image_url: str
+ id: int
+ lyrics_owner_id: int
+ lyrics_state: str
+ path: str
+ primary_artist_names: str
+ pyongs_count: int | None
+ relationships_index_url: str
+ release_date_components: GeniusAPI.DateComponents
+ release_date_for_display: str
+ release_date_with_abbreviated_month_for_display: str
+ song_art_image_thumbnail_url: str
+ song_art_image_url: str
+ stats: GeniusAPI.Stats
+ title: str
+ title_with_featured: str
+ url: str
+ featured_artists: list[GeniusAPI.Artist]
+ primary_artist: GeniusAPI.Artist
+ primary_artists: list[GeniusAPI.Artist]
+
+ class SearchHit(TypedDict):
+ result: GeniusAPI.SearchResult
+
+ class SearchResponse(TypedDict):
+ hits: list[GeniusAPI.SearchHit]
+
+ class Search(TypedDict):
+ response: GeniusAPI.SearchResponse
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 946ddd6c47..7a482ef2e0 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -31,7 +31,6 @@
from urllib.parse import quote, urlencode, urlparse
import requests
-from typing_extensions import TypedDict
from unidecode import unidecode
import beets
@@ -42,6 +41,8 @@
from beets.importer import ImportTask
from beets.library import Item
+ from ._typing import GeniusAPI, LRCLibAPI
+
try:
from bs4 import BeautifulSoup
@@ -266,20 +267,6 @@ def fetch(
raise NotImplementedError
-class LRCLibItem(TypedDict):
- """Lyrics data item returned by the LRCLib API."""
-
- id: int
- name: str
- trackName: str
- artistName: str
- albumName: str
- duration: float | None
- instrumental: bool
- plainLyrics: str
- syncedLyrics: str | None
-
-
@dataclass
@total_ordering
class LRCLyrics:
@@ -297,7 +284,9 @@ def __le__(self, other: LRCLyrics) -> bool:
return self.dist < other.dist
@classmethod
- def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
+ def make(
+ cls, candidate: LRCLibAPI.Item, target_duration: float
+ ) -> LRCLyrics:
return cls(
target_duration,
candidate["duration"] or 0.0,
@@ -354,7 +343,7 @@ class LRCLib(Backend):
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
- ) -> Iterator[list[LRCLibItem]]:
+ ) -> Iterator[list[LRCLibAPI.Item]]:
"""Yield lyrics candidates for the given song data.
Firstly, attempt to GET lyrics directly, and then search the API if
@@ -491,13 +480,15 @@ class Genius(SearchBackend):
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
+ LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(? dict[str, str]:
+ return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@@ -506,85 +497,39 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
we first query the api for a url matching our artist & title,
then attempt to scrape that url for the lyrics.
"""
- json = self._search(artist, title)
- check = partial(self.check_match, artist, title)
- for hit in json["response"]["hits"]:
- result = hit["result"]
- url = hit["result"]["url"]
- if check(result["primary_artist"]["name"], result["title"]) and (
- lyrics := self.scrape_lyrics(self.fetch_text(url))
- ):
- return collapse_newlines(lyrics)
-
- return None
-
- def _search(self, artist, title):
- """Searches the genius api for a given artist and title
-
- https://docs.genius.com/#search-h2
-
- :returns: json response
- """
- return self.fetch_json(
+ data = self.fetch_json(
self.search_url,
- params={"q": f"{title} {artist.lower()}"},
+ params={"q": f"{artist} {title}".lower()},
headers=self.headers,
)
+ if (url := self.find_lyrics_url(data, artist, title)) and (
+ lyrics := self.scrape_lyrics(self.fetch_text(url))
+ ):
+ return collapse_newlines(lyrics)
- def scrape_lyrics(self, html: str) -> str | None:
- """Scrape lyrics from a given genius.com html"""
- soup = get_soup(html)
-
- # Most of the time, the page contains a div with class="lyrics" where
- # all of the lyrics can be found already correctly formatted
- # Sometimes, though, it packages the lyrics into separate divs, most
- # likely for easier ad placement
+ return None
- lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
- if not lyrics_divs:
- self.debug("Received unusual song page html")
- return self._try_extracting_lyrics_from_non_data_lyrics_container(
- soup
- )
- lyrics = ""
- for lyrics_div in lyrics_divs:
- lyrics += lyrics_div.get_text() + "\n\n"
- while lyrics[-1] == "\n":
- lyrics = lyrics[:-1]
- return lyrics
+ def find_lyrics_url(
+ self, data: GeniusAPI.Search, artist: str, title: str
+ ) -> str | None:
+ """Find URL to the lyrics of the given artist and title.
- def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
- """Extract lyrics from a div without attribute data-lyrics-container
- This is the second most common layout on genius.com
+ https://docs.genius.com/#search-h2.
"""
- verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
- if not verse_div:
- if soup.find(
- "div",
- class_=re.compile("LyricsPlaceholder__Message"),
- string="This song is an instrumental",
- ):
- self.debug("Detected instrumental")
- return INSTRUMENTAL_LYRICS
- else:
- self.debug("Couldn't scrape page using known layouts")
- return None
+ check = partial(self.check_match, artist, title)
+ for result in (hit["result"] for hit in data["response"]["hits"]):
+ if check(result["artist_names"], result["title"]):
+ return result["url"]
- lyrics_div = verse_div.parent
+ return None
- ads = lyrics_div.find_all(
- "div", class_=re.compile("InreadAd__Container")
- )
- for ad in ads:
- ad.replace_with("\n")
+ def scrape_lyrics(self, html: str) -> str | None:
+ if m := self.LYRICS_IN_JSON_RE.search(html):
+ html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+ return get_soup(html_text).get_text().strip()
- footers = lyrics_div.find_all(
- "div", class_=re.compile("Lyrics__Footer")
- )
- for footer in footers:
- footer.replace_with("")
- return lyrics_div.get_text()
+ return None
class Tekstowo(DirectBackend):
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 1d264338a2..0917761080 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -379,7 +379,7 @@ def backend_name(self):
@pytest.mark.parametrize(
"file_name, expected_line_count",
[
- ("geniuscom/2pacalleyezonmelyrics", 134),
+ ("geniuscom/2pacalleyezonmelyrics", 131),
("geniuscom/Ttngchinchillalyrics", 29),
("geniuscom/sample", 0), # see https://github.com/beetbox/beets/issues/3535
],
From 56bcdf209d04722b978ce4a34c7616224f644e20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 17:04:58 +0100
Subject: [PATCH 14/26] Unite Genius, Tekstowo and Google backends under the
same interface
---
beetsplug/lyrics.py | 168 ++++++++++++++++++------------------
test/plugins/test_lyrics.py | 39 +++++----
2 files changed, 104 insertions(+), 103 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 7a482ef2e0..4a1aef336b 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -27,7 +27,7 @@
from functools import cached_property, partial, total_ordering
from html import unescape
from http import HTTPStatus
-from typing import TYPE_CHECKING, Any, ClassVar, Iterable, Iterator
+from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
from urllib.parse import quote, urlencode, urlparse
import requests
@@ -41,7 +41,7 @@
from beets.importer import ImportTask
from beets.library import Item
- from ._typing import GeniusAPI, LRCLibAPI
+ from ._typing import GeniusAPI, JSONDict, LRCLibAPI
try:
from bs4 import BeautifulSoup
@@ -57,8 +57,6 @@
except ImportError:
HAS_LANGDETECT = False
-JSONDict = dict[str, Any]
-
BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -439,6 +437,12 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
return lyrics
+class SearchResult(NamedTuple):
+ artist: str
+ title: str
+ url: str
+
+
class SearchBackend(Backend):
REQUIRES_BS = True
@@ -447,12 +451,12 @@ def dist_thresh(self) -> float:
return self.config["dist_thresh"].get(float)
def check_match(
- self, target_artist: str, target_title: str, artist: str, title: str
+ self, target_artist: str, target_title: str, result: SearchResult
) -> bool:
- """Check if the given artist and title are 'good enough' match."""
+ """Check if the given search result is a 'good enough' match."""
max_dist = max(
- string_dist(target_artist, artist),
- string_dist(target_title, title),
+ string_dist(target_artist, result.artist),
+ string_dist(target_title, result.title),
)
if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
@@ -463,8 +467,8 @@ def check_match(
# This may show a matching candidate with some noise in the name
self.debug(
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
- artist,
- title,
+ result.artist,
+ result.title,
target_artist,
target_title,
max_dist,
@@ -472,61 +476,62 @@ def check_match(
return False
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ """Search for the given query and yield search results."""
+ raise NotImplementedError
+
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ check_match = partial(self.check_match, artist, title)
+ for candidate in self.search(artist, title):
+ if check_match(candidate):
+ yield candidate
+
+ def fetch(self, artist: str, title: str, *_) -> str | None:
+ """Fetch lyrics for the given artist and title."""
+ for result in self.get_results(artist, title):
+ if lyrics := self.scrape(self.fetch_text(result.url)):
+ return lyrics
+
+ return None
+
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ """Scrape the lyrics from the given HTML."""
+ raise NotImplementedError
+
class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
- Simply adapted from
- bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
+ Because genius doesn't allow accessing lyrics via the api, we first query
+ the api for a url matching our artist & title, then scrape the HTML text
+ for the JSON data containing the lyrics.
+
+ Adapted from
+ bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping
"""
+ SEARCH_URL = "https://api.genius.com/search"
LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(? dict[str, str]:
return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
- def fetch(self, artist: str, title: str, *_) -> str | None:
- """Fetch lyrics from genius.com
-
- Because genius doesn't allow accessing lyrics via the api,
- we first query the api for a url matching our artist & title,
- then attempt to scrape that url for the lyrics.
- """
-
- data = self.fetch_json(
- self.search_url,
- params={"q": f"{artist} {title}".lower()},
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ search_data: GeniusAPI.Search = self.fetch_json(
+ self.SEARCH_URL,
+ params={"q": f"{artist} {title}"},
headers=self.headers,
)
- if (url := self.find_lyrics_url(data, artist, title)) and (
- lyrics := self.scrape_lyrics(self.fetch_text(url))
- ):
- return collapse_newlines(lyrics)
-
- return None
-
- def find_lyrics_url(
- self, data: GeniusAPI.Search, artist: str, title: str
- ) -> str | None:
- """Find URL to the lyrics of the given artist and title.
-
- https://docs.genius.com/#search-h2.
- """
- check = partial(self.check_match, artist, title)
- for result in (hit["result"] for hit in data["response"]["hits"]):
- if check(result["artist_names"], result["title"]):
- return result["url"]
+ for r in (hit["result"] for hit in search_data["response"]["hits"]):
+ yield SearchResult(r["artist_names"], r["title"], r["url"])
- return None
-
- def scrape_lyrics(self, html: str) -> str | None:
- if m := self.LYRICS_IN_JSON_RE.search(html):
- html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ if m := cls.LYRICS_IN_JSON_RE.search(html):
+ html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
return get_soup(html_text).get_text().strip()
return None
@@ -548,13 +553,12 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
- return self.scrape_lyrics(
- self.fetch_text(self.build_url(artist, title))
- )
+ return self.scrape(self.fetch_text(self.build_url(artist, title)))
return None
- def scrape_lyrics(self, html: str) -> str | None:
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
@@ -613,16 +617,6 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- @staticmethod
- def scrape_lyrics(html: str) -> str | None:
- soup = get_soup(html)
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- return None
-
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@@ -655,17 +649,11 @@ def is_lyrics(self, text, artist=None):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
- def is_page_candidate(
- self, artist: str, title: str, url_link: str, url_title: str
- ) -> bool:
- """Return True if the URL title makes it a good candidate to be a
- page that contains lyrics of title by artist.
- """
- title_slug = slug(title)
+ def make_search_result(
+ self, artist: str, url_link: str, url_title: str
+ ) -> SearchResult:
+ """Parse artist and title from the URL title and return a search result."""
url_title_slug = slug(url_title)
- if title_slug in url_title_slug:
- return True
-
artist = slug(artist)
sitename = urlparse(url_link).netloc
@@ -680,33 +668,45 @@ def is_page_candidate(
"(%s)" % "|".join(tokens), "", url_title_slug
).strip("-")
- return self.check_match(artist, title_slug, artist, song_title)
+ return SearchResult(artist, song_title, url_link)
- def fetch(self, artist: str, title: str, *_) -> str | None:
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
}
- check_candidate = partial(self.is_page_candidate, artist, title)
- for item in self.fetch_json(self.SEARCH_URL, params=params).get(
- "items", []
- ):
- url_link = item["link"]
- if not check_candidate(url_link, item.get("title", "")):
- continue
+ data = self.fetch_json(self.SEARCH_URL, params=params)
+ for item in data.get("items", []):
+ yield self.make_search_result(artist, item["link"], item["title"])
+
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ return super().get_results(artist, slug(title))
+
+ def fetch(self, artist: str, title: str, *_) -> str | None:
+ for result in self.get_results(artist, title):
with self.handle_request():
- lyrics = self.scrape_lyrics(self.fetch_text(url_link))
+ lyrics = self.scrape(self.fetch_text(result.url))
if not lyrics:
continue
if self.is_lyrics(lyrics, artist):
- self.debug("Got lyrics from {}", item["displayLink"])
+ self.debug(
+ "Got lyrics from {}", urlparse(result.url).netloc
+ )
return lyrics
return None
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ # Get the longest text element (if any).
+ if strings := sorted(get_soup(html).stripped_strings, key=len):
+ return strings[-1]
+
+ return None
+
class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 0917761080..8afd80585f 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -191,9 +191,9 @@ def backend(self, dist_thresh):
],
)
def test_check_match(self, backend, target_artist, artist, should_match):
- assert (
- backend.check_match(target_artist, "", artist, "") == should_match
- )
+ result = lyrics.SearchResult(artist, "", "")
+
+ assert backend.check_match(target_artist, "", result) == should_match
@pytest.fixture(scope="module")
@@ -327,31 +327,32 @@ def file_name(self):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = backend.scrape_lyrics(lyrics_html).lower()
+ result = backend.scrape(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
assert PHRASE_BY_TITLE[self.TITLE] in result
@pytest.mark.parametrize(
- "url_title, artist, should_be_candidate",
+ "url_title, artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", True),
- ("example.com | Beats song by John doe", "John Doe", True),
- ("example.com | seets bong lyrics by John doe", "John Doe", False),
- ("foo", "Sun O)))", False),
+ ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
+ ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+ (
+ "example.com | seets bong lyrics by John doe",
+ "John Doe",
+ "seets-bong",
+ ),
+ ("foo", "Sun O)))", "foo"),
],
)
- def test_is_page_candidate(
- self, backend, lyrics_html, url_title, artist, should_be_candidate
+ def test_make_search_result(
+ self, backend, url_title, artist, expected_title
):
- result = backend.is_page_candidate(
- artist,
- self.TITLE,
- "http://www.example.com/lyrics/beetssong",
- url_title,
+ result = backend.make_search_result(
+ artist, "https://example.com", url_title
)
- assert bool(result) == should_be_candidate
+ assert result.title == expected_title
@pytest.mark.parametrize(
"lyrics",
@@ -385,7 +386,7 @@ def backend_name(self):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend.scrape_lyrics(lyrics_html) or ""
+ result = backend.scrape(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -406,7 +407,7 @@ def backend_name(self):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
From 7cf1814a3da7ea7047db4067817c76593e76db04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 16:36:41 +0100
Subject: [PATCH 15/26] Google: Refactor and improve
* Type the response data that Google Custom Search API return.
* Exclude some 'letras.mus.br' pages that do not contain lyric.
* Exclude results from Musixmatch as we cannot access their pages.
* Improve parsing of the URL title:
- Handle long URL titles that get truncated (end with ellipsis) for
long searches
- Remove domains starting with 'www'
- Parse the title AND the artist. Previously this would only parse the
title, and fetch lyrics even when the artist did not match.
* Remove now redundant credits cleanup and checks for valid lyrics.
---
beetsplug/_typing.py | 31 ++++++-
beetsplug/lyrics.py | 156 ++++++++++++++++++------------------
test/plugins/test_lyrics.py | 84 ++++++++-----------
3 files changed, 141 insertions(+), 130 deletions(-)
diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
index f673b85bd5..1aa288cbcb 100644
--- a/beetsplug/_typing.py
+++ b/beetsplug/_typing.py
@@ -2,7 +2,7 @@
from typing import Any
-from typing_extensions import TypeAlias, TypedDict
+from typing_extensions import NotRequired, TypeAlias, TypedDict
JSONDict: TypeAlias = "dict[str, Any]"
@@ -84,3 +84,32 @@ class SearchResponse(TypedDict):
class Search(TypedDict):
response: GeniusAPI.SearchResponse
+
+
+class GoogleCustomSearchAPI:
+ class Response(TypedDict):
+ """Search response from the Google Custom Search API.
+
+ If the search returns no results, the :attr:`items` field is not found.
+ """
+
+ items: NotRequired[list[GoogleCustomSearchAPI.Item]]
+
+ class Item(TypedDict):
+ """A Google Custom Search API result item.
+
+ :attr:`title` field is shown to the user in the search interface, thus
+ it gets truncated with an ellipsis for longer queries. For most
+ results, the full title is available as ``og:title`` metatag found
+ under the :attr:`pagemap` field. Note neither this metatag nor the
+ ``pagemap`` field is guaranteed to be present in the data.
+ """
+
+ title: str
+ link: str
+ pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
+
+ class Pagemap(TypedDict):
+ """Pagemap data with a single meta tags dict in a list."""
+
+ metatags: list[JSONDict]
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 4a1aef336b..a15ae726a1 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -28,7 +28,7 @@
from html import unescape
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlencode
import requests
from unidecode import unidecode
@@ -41,7 +41,7 @@
from beets.importer import ImportTask
from beets.library import Item
- from ._typing import GeniusAPI, JSONDict, LRCLibAPI
+ from ._typing import GeniusAPI, GoogleCustomSearchAPI, JSONDict, LRCLibAPI
try:
from bs4 import BeautifulSoup
@@ -489,7 +489,9 @@ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics for the given artist and title."""
for result in self.get_results(artist, title):
- if lyrics := self.scrape(self.fetch_text(result.url)):
+ if (html := self.fetch_text(result.url)) and (
+ lyrics := self.scrape(html)
+ ):
return lyrics
return None
@@ -567,20 +569,6 @@ def scrape(cls, html: str) -> str | None:
return None
-def remove_credits(text):
- """Remove first/last line of text if it contains the word 'lyrics'
- eg 'Lyrics by songsdatabase.com'
- """
- textlines = text.split("\n")
- credits = None
- for i in (0, -1):
- if textlines and "lyrics" in textlines[i].lower():
- credits = textlines.pop(i)
- if credits:
- text = "\n".join(textlines)
- return text
-
-
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
@@ -617,87 +605,97 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- def is_lyrics(self, text, artist=None):
- """Determine whether the text seems to be valid lyrics."""
- if not text:
- return False
- bad_triggers_occ = []
- nb_lines = text.count("\n")
- if nb_lines <= 1:
- self.debug("Ignoring too short lyrics '{}'", text)
- return False
- elif nb_lines < 5:
- bad_triggers_occ.append("too_short")
- else:
- # Lyrics look legit, remove credits to avoid being penalized
- # further down
- text = remove_credits(text)
+ #: Exclude some letras.mus.br pages which do not contain lyrics.
+ EXCLUDE_PAGES = [
+ "significado.html",
+ "traduccion.html",
+ "traducao.html",
+ "significados.html",
+ ]
- bad_triggers = ["lyrics", "copyright", "property", "links"]
- if artist:
- bad_triggers += [artist]
+ #: Regular expression to match noise in the URL title.
+ URL_TITLE_NOISE_RE = re.compile(
+ r"""
+\b
+(
+ paroles(\ et\ traduction|\ de\ chanson)?
+ | letras?(\ de)?
+ | liedtexte
+ | original\ song\ full\ text\.
+ | official
+ | 20[12]\d\ version
+ | (absolute\ |az)?lyrics(\ complete)?
+ | www\S+
+ | \S+\.(com|net|mus\.br)
+)
+([^\w.]|$)
+""",
+ re.IGNORECASE | re.VERBOSE,
+ )
+ #: Split cleaned up URL title into artist and title parts.
+ URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
- for item in bad_triggers:
- bad_triggers_occ += [item] * len(
- re.findall(r"\W%s\W" % item, text, re.I)
- )
+ def fetch_text(self, *args, **kwargs) -> str:
+ """Handle an error so that we can continue with the next URL."""
+ with self.handle_request():
+ return super().fetch_text(*args, **kwargs)
- if bad_triggers_occ:
- self.debug("Bad triggers detected: {}", bad_triggers_occ)
- return len(bad_triggers_occ) < 2
+ @staticmethod
+ def get_part_dist(artist: str, title: str, part: str) -> float:
+ """Return the distance between the given part and the artist and title.
- BY_TRANS = ["by", "par", "de", "von"]
- LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
+ A number between -1 and 1 is returned, where -1 means the part is
+ closer to the artist and 1 means it is closer to the title.
+ """
+ return string_dist(artist, part) - string_dist(title, part)
+ @classmethod
def make_search_result(
- self, artist: str, url_link: str, url_title: str
+ cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
) -> SearchResult:
"""Parse artist and title from the URL title and return a search result."""
- url_title_slug = slug(url_title)
- artist = slug(artist)
- sitename = urlparse(url_link).netloc
-
- # or try extracting song title from URL title and check if
- # they are close enough
- tokens = (
- [by + "-" + artist for by in self.BY_TRANS]
- + [artist, sitename, sitename.replace("www.", "")]
- + self.LYRICS_TRANS
+ url_title = (
+ # get full title from metatags if available
+ item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
+ # default to the dispolay title
+ or item["title"]
)
- song_title = re.sub(
- "(%s)" % "|".join(tokens), "", url_title_slug
- ).strip("-")
+ clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
+ # split it into parts which may be part of the artist or the title
+ # `dict.fromkeys` removes duplicates keeping the order
+ parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
+
+ if len(parts) == 1:
+ part = parts[0]
+ if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
+ # artist and title may not have a separator
+ result_title = m[1]
+ result_artist = part.replace(m[0], "")
+ else:
+ # assume that this is the title
+ result_artist, result_title = "", parts[0]
+ else:
+ # sort parts by their similarity to the artist
+ parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
+ result_artist, result_title = parts[0], " ".join(parts[1:])
- return SearchResult(artist, song_title, url_link)
+ return SearchResult(result_artist, result_title, item["link"])
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
+ "siteSearch": "www.musixmatch.com",
+ "siteSearchFilter": "e",
+ "excludeTerms": ", ".join(self.EXCLUDE_PAGES),
}
- data = self.fetch_json(self.SEARCH_URL, params=params)
+ data: GoogleCustomSearchAPI.Response = self.fetch_json(
+ self.SEARCH_URL, params=params
+ )
for item in data.get("items", []):
- yield self.make_search_result(artist, item["link"], item["title"])
-
- def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
- return super().get_results(artist, slug(title))
-
- def fetch(self, artist: str, title: str, *_) -> str | None:
- for result in self.get_results(artist, title):
- with self.handle_request():
- lyrics = self.scrape(self.fetch_text(result.url))
- if not lyrics:
- continue
-
- if self.is_lyrics(lyrics, artist):
- self.debug(
- "Got lyrics from {}", urlparse(result.url).netloc
- )
- return lyrics
-
- return None
+ yield self.make_search_result(artist, title, item)
@classmethod
def scrape(cls, html: str) -> str | None:
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 8afd80585f..6986e4f064 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -101,24 +101,6 @@ def test_search_pairs_titles(self, title, expected_extra_titles):
assert list(actual_titles) == [title, *expected_extra_titles]
- @pytest.mark.parametrize(
- "initial_lyrics, expected",
- [
- ("Verse\nLyrics credit in the last line", "Verse"),
- ("Lyrics credit in the first line\nVerse", "Verse"),
- (
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- ),
- ],
- )
- def test_remove_credits(self, initial_lyrics, expected):
- assert lyrics.remove_credits(initial_lyrics) == expected
-
@pytest.mark.parametrize(
"initial_text, expected",
[
@@ -311,8 +293,6 @@ def test_backend_source(self, lyrics_plugin, lyrics_page: LyricsPage):
class TestGoogleLyrics(LyricsBackendTest):
"""Test scraping heuristics on a fake html page."""
- TITLE = "Beets song"
-
@pytest.fixture(scope="class")
def backend_name(self):
return "google"
@@ -325,51 +305,55 @@ def plugin_config(self):
def file_name(self):
return "examplecom/beetssong"
+ @pytest.fixture
+ def search_item(self, url_title, url):
+ return {"title": url_title, "link": url}
+
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
result = backend.scrape(lyrics_html).lower()
assert result
- assert backend.is_lyrics(result)
- assert PHRASE_BY_TITLE[self.TITLE] in result
+ assert PHRASE_BY_TITLE["Beets song"] in result
@pytest.mark.parametrize(
- "url_title, artist, expected_title",
+ "url_title, expected_artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
- ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+ ("Artist - beets song Lyrics", "Artist", "beets song"),
+ ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
+ ("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
+ ("foo", "", "foo"),
+ ("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets Song", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets ...", "Artist", "Beets"),
+ ("Artist Beets Song", "Artist", "Beets Song"),
+ ("BeetsSong - Artist", "Artist", "BeetsSong"),
+ ("Artist - BeetsSong", "Artist", "BeetsSong"),
+ ("Beets Song", "", "Beets Song"),
+ ("Beets Song Artist", "Artist", "Beets Song"),
(
- "example.com | seets bong lyrics by John doe",
- "John Doe",
- "seets-bong",
+ "BeetsSong (feat. Other & Another) - Artist",
+ "Artist",
+ "BeetsSong (feat. Other & Another)",
+ ),
+ (
+ (
+ "Beets song lyrics by Artist - original song full text. "
+ "Official Beets song lyrics, 2024 version | LyricsMode.com"
+ ),
+ "Artist",
+ "Beets song",
),
- ("foo", "Sun O)))", "foo"),
],
)
+ @pytest.mark.parametrize("url", ["http://doesntmatter.com"])
def test_make_search_result(
- self, backend, url_title, artist, expected_title
+ self, backend, search_item, expected_artist, expected_title
):
- result = backend.make_search_result(
- artist, "https://example.com", url_title
- )
- assert result.title == expected_title
+ result = backend.make_search_result("Artist", "Beets song", search_item)
- @pytest.mark.parametrize(
- "lyrics",
- [
- "LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
- """All material found on this site is property\n
- of mywickedsongtext brand""",
- """
-Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
-as they'll be released by $ARTIST, check back soon!
-In case you have the lyrics to $TITLE and want to send them to us, fill out
-the following form.
-""",
- ],
- )
- def test_bad_lyrics(self, backend, lyrics):
- assert not backend.is_lyrics(lyrics)
+ assert result.artist == expected_artist
+ assert result.title == expected_title
class TestGeniusLyrics(LyricsBackendTest):
From ce2ee98e165c39a65c5fde742dc1661f001223a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 13:34:12 +0100
Subject: [PATCH 16/26] Create Html class for cleaning up the html text
Additionally, improve HTML pre-processing:
* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacocinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting response encoding to
None, allowing `requests` to handle it.
---
beetsplug/lyrics.py | 105 ++++++++++++++++++++++-------------
test/plugins/lyrics_pages.py | 42 ++++++++++++++
test/plugins/test_lyrics.py | 41 +++++++-------
3 files changed, 127 insertions(+), 61 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index a15ae726a1..9289f8fc7e 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -57,7 +57,6 @@
except ImportError:
HAS_LANGDETECT = False
-BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -231,10 +230,16 @@ def format_url(url: str, params: JSONDict | None) -> str:
def fetch_text(
self, url: str, params: JSONDict | None = None, **kwargs
) -> str:
- """Return text / HTML data from the given URL."""
+ """Return text / HTML data from the given URL.
+
+ Set the encoding to None to let requests handle it because some sites
+ set it incorrectly.
+ """
url = self.format_url(url, params)
self.debug("Fetching HTML from {}", url)
- return r_session.get(url, **kwargs).text
+ r = r_session.get(url, **kwargs)
+ r.encoding = None
+ return r.text
def fetch_json(self, url: str, params: JSONDict | None = None, **kwargs):
"""Return JSON data from the given URL."""
@@ -437,13 +442,60 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
return lyrics
+class Html:
+ collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
+ expand_br = partial(re.compile(r"\s* ]*>\s*", re.I).sub, "\n")
+ #: two newlines between paragraphs on the same line (musica, letras.mus.br)
+ merge_blocks = partial(re.compile(r"(?)
]*>").sub, "\n\n")
+ #: a single new line between paragraphs on separate lines
+ #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
+ merge_lines = partial(re.compile(r"