From b8f82607f9bbfbd50f4a2ba23279ee597c2d38aa Mon Sep 17 00:00:00 2001 From: Petr Polezhaev Date: Mon, 9 Mar 2026 22:59:19 +0300 Subject: [PATCH] =?UTF-8?q?Fix=20archive=20plugins=20for=20=D0=9D=D0=AD?= =?UTF-8?q?=D0=91=20and=20Alib;=20add=20network=20integration=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text strategy (Alib entries from <p><b>), Windows-1251 encoding support, _cls_inner_texts() helper that strips inner HTML tags - rsl: rewrite to POST SearchFilterForm[search] with CSRF token and CQL title:(words) AND author:(word) query format - config: update rusneb (img_alt + correct author_class) and alib_web (encoding + bold_text) to match fixed plugin strategies - tests: add tests/test_archives.py with network-marked tests for all six archive plugins; НЛР and ШПИЛ marked xfail (endpoints return HTTP 404) - presubmit: exclude network tests from default run (-m "not network") Co-Authored-By: Claude Sonnet 4.6 --- config/functions.default.yaml | 8 +- pyproject.toml | 1 + scripts/presubmit.py | 2 +- src/plugins/archives/html_scraper.py | 195 +++++++++++++++++++++++---- src/plugins/archives/rsl.py | 105 +++++++++++++-- tests/test_archives.py | 189 ++++++++++++++++++++++++++ 6 files changed, 458 insertions(+), 42 deletions(-) create mode 100644 tests/test_archives.py diff --git a/config/functions.default.yaml b/config/functions.default.yaml index f2df473..8b2530a 100644 --- a/config/functions.default.yaml +++ b/config/functions.default.yaml @@ -64,8 +64,8 @@ functions: config: url: "https://rusneb.ru/search/" search_param: q - title_class: "title" - author_class: "author" + img_alt: true + author_class: "search-list__item_subtext" alib_web: name: "Alib (web)" @@ -77,8 +77,8 @@ functions: url: "https://www.alib.ru/find3.php4" search_param: tfind extra_params: {f: "5", s: "0"} - link_href_pattern: "t[a-z]+\\.phtml" - author_class: "aut" + encoding: "cp1251" + bold_text: true nlr: name: "НЛР" diff --git a/pyproject.toml b/pyproject.toml index e007cca..c976525 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,6 +56,7 @@ include = ["src", "tests", "scripts"] [tool.pytest.ini_options] pythonpath = ["src"] +markers = ["network: live HTTP requests to external services (deselect with -m 'not network')"] [build-system] requires = ["poetry-core"] diff --git a/scripts/presubmit.py b/scripts/presubmit.py index 
ac15fbd..e717fbe 100644 --- a/scripts/presubmit.py +++ b/scripts/presubmit.py @@ -24,7 +24,7 @@ def presubmit(): ["black", "--check", "."], ["flake8", "."], ["pyright"], - ["pytest", "tests/"], + ["pytest", "tests/", "-m", "not network"], # JS: tests run via Node built-in runner (no npm packages needed) ["node", "--test", "tests/js/pure-functions.test.js"], ] diff --git a/src/plugins/archives/html_scraper.py b/src/plugins/archives/html_scraper.py index 9b30c84..a3b17d0 100644 --- a/src/plugins/archives/html_scraper.py +++ b/src/plugins/archives/html_scraper.py @@ -2,7 +2,7 @@ import re from typing import Any -from urllib.parse import urlparse +from urllib.parse import quote, urlparse import httpx @@ -12,21 +12,78 @@ from ..rate_limiter import RateLimiter _YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b") +# Matches "Surname I.N. " or "Surname I. " at the start of an entry. +_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL) + def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]: - return re.compile(rf'class="[^"]*{re.escape(cls_frag)}[^"]*"[^>]*>([^<]{{{min_len},{max_len}}})<') + # Support both single and double-quoted class attributes. + return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<') + + +def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]: + """Extract text content from elements whose class contains cls_frag. + + Strips inner HTML tags and normalises whitespace, so elements like + ``Name I.N.`` work correctly. + + Args: + html: Raw HTML string to search. + cls_frag: Substring that must appear in the class attribute value. + min_len: Minimum length of extracted text to keep. + max_len: Maximum length of extracted text to keep. + + Returns: + Up to three non-empty text strings in document order. 
+ """ + raw = re.findall(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>(.*?)]+>", "", m) + text = re.sub(r"\s+", " ", text).strip() + if min_len <= len(text) <= max_len: + out.append(text) + if len(out) == 3: + break + return out + + +def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]: + """Extract non-empty alt attributes from tags, normalising whitespace. + + Args: + html: Raw HTML string to search. + min_len: Minimum character length to include. + max_len: Maximum character length to include. + + Returns: + Up to three non-empty, whitespace-normalised alt strings. + """ + alts = re.findall(r']+alt=[\'"]([^\'"]+)[\'"]', html) + out: list[str] = [] + for a in alts: + text = re.sub(r"\s+", " ", a).strip() + if min_len <= len(text) <= max_len: + out.append(text) + if len(out) == 3: + break + return out class HtmlScraperPlugin: - """ - Config-driven HTML scraper. Supported config keys: - url — search URL - search_param — query param name - extra_params — dict of fixed extra query parameters - title_class — CSS class fragment for title elements (class-based strategy) - author_class — CSS class fragment for author elements - link_href_pattern — href regex to find title links (link strategy, e.g. alib) - brief_class — CSS class for brief record rows (brief strategy, e.g. shpl) + """Config-driven HTML scraper. + + Supported config keys: + url — search URL + search_param — query param name + extra_params — dict of fixed extra query parameters + encoding — character encoding for query and response (e.g. "cp1251") + title_class — CSS class fragment for title elements (class-based strategy) + author_class — CSS class fragment for author elements + link_href_pattern — href regex to find title links (link strategy) + brief_class — CSS class for brief record rows (brief strategy) + img_alt — truthy: extract titles from attributes (rusneb strategy) + bold_text — truthy: extract author/title from

blocks (alib strategy) """ category = "archive_searchers" @@ -51,30 +108,118 @@ class HtmlScraperPlugin: self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id def search(self, query: str) -> list[CandidateRecord]: + """Search for books matching query. + + Args: + query: Free-text search string (author, title, keywords). + + Returns: + Up to three CandidateRecord dicts with source, title, author, year, + isbn, and publisher fields. + """ cfg = self.config self._rl.wait_and_record(self._domain, self.rate_limit_seconds) - params: dict[str, Any] = dict(cfg.get("extra_params") or {}) - params[cfg["search_param"]] = query - r = httpx.get( - cfg["url"], - params=params, - timeout=self.timeout, - headers={"User-Agent": "Mozilla/5.0"}, - ) - html = r.text + + encoding = str(cfg.get("encoding") or "") + if encoding: + # Encode query and extra params in the site's native encoding. + q_enc = quote(query.encode(encoding, "replace")) + ep: dict[str, Any] = dict(cfg.get("extra_params") or {}) + ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()] + raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts) + r = httpx.get( + f'{cfg["url"]}?{raw_qs}', + timeout=self.timeout, + headers={"User-Agent": "Mozilla/5.0"}, + ) + html = r.content.decode(encoding, errors="replace") + else: + params: dict[str, Any] = dict(cfg.get("extra_params") or {}) + params[cfg["search_param"]] = query + r = httpx.get( + cfg["url"], + params=params, + timeout=self.timeout, + headers={"User-Agent": "Mozilla/5.0"}, + ) + html = r.text + years = _YEAR_RE.findall(html) - # Strategy: link_href_pattern (alib-style) + if cfg.get("bold_text"): + return self._parse_bold_text(html, years) + if cfg.get("img_alt"): + return self._parse_img_alt(html, years, cfg) if "link_href_pattern" in cfg: return self._parse_link(html, years, cfg) - - # Strategy: brief_class (shpl-style) if "brief_class" in cfg: return self._parse_brief(html, years, cfg) - - # 
Strategy: title_class + author_class (rusneb-style) return self._parse_class(html, years, cfg) + def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]: + """Extract records from ``

text`` entries (Alib-style). + + The bold text is expected to begin with ``Surname I.N. Title…``; the + author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible. + + Args: + html: Decoded HTML response. + years: Year strings found in the full HTML (used positionally). + + Returns: + Up to three CandidateRecord dicts. + """ + entries = re.findall(r"

([^<]{5,200})", html)[:3] + out: list[CandidateRecord] = [] + for i, entry in enumerate(entries): + text = entry.strip() + m = _AUTHOR_PREFIX_PAT.match(text) + if m: + author = m.group(1).strip() + title = m.group(2).strip() + else: + author = "" + title = text + out.append( + CandidateRecord( + source=self.plugin_id, + title=title, + author=author, + year=years[i] if i < len(years) else "", + isbn="", + publisher="", + ) + ) + return out + + def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]: + """Extract records using ```` for titles and a CSS class for authors. + + Used for sites like rusneb.ru where thumbnail alt attributes carry the + book title and a separate span contains the author. + + Args: + html: Decoded HTML response. + years: Year strings found in the full HTML (used positionally). + cfg: Plugin config dict (reads ``author_class``). + + Returns: + Up to three CandidateRecord dicts. + """ + titles = _img_alts(html) + authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80) + return [ + CandidateRecord( + source=self.plugin_id, + title=title, + author=authors[i] if i < len(authors) else "", + year=years[i] if i < len(years) else "", + isbn="", + publisher="", + ) + for i, title in enumerate(titles) + ] + def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]: titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3] authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3] diff --git a/src/plugins/archives/rsl.py b/src/plugins/archives/rsl.py index 18edaab..2a2d5fd 100644 --- a/src/plugins/archives/rsl.py +++ b/src/plugins/archives/rsl.py @@ -1,5 +1,17 @@ -"""RSL (Russian State Library) AJAX JSON search API plugin (search.rsl.ru).""" +"""RSL (Russian State Library) search plugin (search.rsl.ru). 
+The search API requires a POST to ``/site/ajax-search?language=ru`` with +form-encoded body containing ``SearchFilterForm[search]`` and a CSRF token +obtained from the main search page. Query syntax is CQL: +``title:() AND author:(<author words>)``. + +Results come back as an HTML fragment in the ``content`` key of a JSON +envelope; individual records are identified by the CSS classes +``rsl-item-nocover-title`` (author) and ``rsl-item-nocover-descr`` (title). +Both fields contain ``<b>`` highlight tags that are stripped before returning. +""" + +import re from typing import Any import httpx @@ -9,9 +21,27 @@ from models import CandidateRecord from ..rate_limiter import RateLimiter _DOMAIN = "search.rsl.ru" +_SEARCH_URL = "https://search.rsl.ru/site/ajax-search" +_BASE_URL = "https://search.rsl.ru/ru/search" +_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b") + + +def _strip_tags(html_frag: str) -> str: + """Strip HTML tags and decode basic entities from a fragment.""" + text = re.sub(r"<[^>]+>", "", html_frag) + text = text.replace(""", '"').replace("&", "&").replace("<", "<").replace(">", ">") + return re.sub(r"\s+", " ", text).strip() class RSLPlugin: + """Archive searcher for search.rsl.ru. + + Formats the query as CQL ``title:(title_words) AND author:(author_word)`` + by treating the first whitespace-delimited token as the author surname and + the remainder as title keywords. When only one token is present, a plain + ``title:(token) OR author:(token)`` query is used instead. + """ + category = "archive_searchers" def __init__( @@ -32,28 +62,79 @@ class RSLPlugin: self.timeout = timeout def search(self, query: str) -> list[CandidateRecord]: + """Search RSL for books matching query. + + Args: + query: Free-text string; the first token is treated as the author + surname and remaining tokens as title keywords. + + Returns: + Up to three CandidateRecord dicts extracted from the RSL HTML + response, with ``<b>`` highlight tags stripped. 
+ """ self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds) - r = httpx.get( - "https://search.rsl.ru/site/ajax-search", - params={"language": "ru", "q": query, "page": 1, "perPage": 5}, + + cql = self._build_cql(query) + client = httpx.Client() + + # Fetch the main page to obtain a valid CSRF token. + r0 = client.get(_BASE_URL, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"}) + csrf_match = re.search(r'name="_csrf"\s+value="([^"]+)"', r0.text) + csrf = csrf_match.group(1) if csrf_match else "" + + r = client.post( + _SEARCH_URL, + params={"language": "ru"}, + data={"SearchFilterForm[search]": cql, "_csrf": csrf}, timeout=self.timeout, - headers={"Accept": "application/json"}, + headers={ + "Accept": "application/json", + "X-Requested-With": "XMLHttpRequest", + "Referer": _BASE_URL, + "User-Agent": "Mozilla/5.0", + }, ) data: dict[str, Any] = r.json() - records: list[dict[str, Any]] = data.get("records") or data.get("items") or data.get("data") or [] + content = str(data.get("content") or "") + + raw_titles = re.findall(r'rsl-item-nocover-descr[^"]*">(.*?)</div>', content)[:3] + raw_authors = re.findall(r'rsl-item-nocover-title[^"]*">(.*?)</div>', content)[:3] + years = _YEAR_RE.findall(content)[:3] + out: list[CandidateRecord] = [] - for rec in records[:3]: - title = (str(rec.get("title") or rec.get("name") or "")).strip() + for i, raw_title in enumerate(raw_titles): + title = _strip_tags(raw_title) if not title: continue + author = _strip_tags(raw_authors[i]) if i < len(raw_authors) else "" out.append( CandidateRecord( source=self.plugin_id, title=title, - author=(str(rec.get("author") or rec.get("authors") or "")).strip(), - year=str(rec.get("year") or rec.get("pubyear") or "").strip(), - isbn=(str(rec.get("isbn") or "")).strip(), - publisher=(str(rec.get("publisher") or "")).strip(), + author=author, + year=years[i] if i < len(years) else "", + isbn="", + publisher="", ) ) return out + + @staticmethod + def _build_cql(query: str) -> str: + 
"""Build a CQL query string for the RSL search API. + + Args: + query: Raw query string, typically ``"Author Title keywords"``. + + Returns: + CQL string in the form ``title:(…) AND author:(…)`` when the query + contains multiple tokens, or ``title:(…) OR author:(…)`` for a + single token. + """ + tokens = query.split() + if len(tokens) > 1: + author_part = tokens[0] + title_part = " ".join(tokens[1:]) + return f"title:({title_part}) AND author:({author_part})" + token = tokens[0] if tokens else query + return f"title:({token}) OR author:({token})" diff --git a/tests/test_archives.py b/tests/test_archives.py new file mode 100644 index 0000000..c380ead --- /dev/null +++ b/tests/test_archives.py @@ -0,0 +1,189 @@ +"""Network integration tests for archive searcher plugins. + +Each test queries a live external service for "War and Peace" by Tolstoy, +a book universally catalogued in all supported archives. + +Run with: pytest tests/ -m network +Skip with: pytest tests/ -m "not network" (default in presubmit) +""" + +import pytest + +from models import CandidateRecord +from plugins.archives.html_scraper import HtmlScraperPlugin +from plugins.archives.openlibrary import OpenLibraryPlugin +from plugins.archives.rsl import RSLPlugin +from plugins.archives.sru_catalog import SRUCatalogPlugin +from plugins.rate_limiter import RateLimiter + +pytestmark = pytest.mark.network + +_RL = RateLimiter() +_TIMEOUT = 15 + + +def _titles(results: list[CandidateRecord]) -> list[str]: + return [r["title"] for r in results] + + +def _authors(results: list[CandidateRecord]) -> list[str]: + return [r["author"] for r in results] + + +def _has_title(results: list[CandidateRecord], fragment: str) -> bool: + """Return True if any result title contains fragment (case-insensitive).""" + low = fragment.lower() + return any(low in r["title"].lower() for r in results) + + +def _has_author(results: list[CandidateRecord], fragment: str) -> bool: + """Return True if any result author contains fragment 
(case-insensitive).""" + low = fragment.lower() + return any(low in r["author"].lower() for r in results) + + +# ── OpenLibrary ─────────────────────────────────────────────────────────────── + + +def test_openlibrary_war_and_peace() -> None: + plugin = OpenLibraryPlugin( + plugin_id="openlibrary", + name="OpenLibrary", + rate_limiter=_RL, + rate_limit_seconds=0, + auto_queue=True, + timeout=_TIMEOUT, + config={}, + ) + results = plugin.search("War and Peace Tolstoy") + assert results, "OpenLibrary returned no results" + assert all(r["source"] == "openlibrary" for r in results) + assert _has_title(results, "war and peace"), f"titles={_titles(results)}" + # OpenLibrary stores authors in their original language; accept both forms. + assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}" + + +# ── RSL (РГБ) ───────────────────────────────────────────────────────────────── + + +def test_rsl_voina_i_mir() -> None: + plugin = RSLPlugin( + plugin_id="rsl", + name="РГБ", + rate_limiter=_RL, + rate_limit_seconds=0, + auto_queue=True, + timeout=_TIMEOUT, + config={}, + ) + results = plugin.search("Толстой Война и мир") + assert results, "RSL returned no results" + assert all(r["source"] == "rsl" for r in results) + assert _has_title(results, "война"), f"titles={_titles(results)}" + + +# ── НЭБ (rusneb) ───────────────────────────────────────────────────────────── + + +def test_rusneb_voina_i_mir() -> None: + plugin = HtmlScraperPlugin( + plugin_id="rusneb", + name="НЭБ", + rate_limiter=_RL, + rate_limit_seconds=0, + auto_queue=True, + timeout=_TIMEOUT, + config={ + "url": "https://rusneb.ru/search/", + "search_param": "q", + "img_alt": True, + "author_class": "search-list__item_subtext", + }, + ) + results = plugin.search("Война и мир Толстой") + assert results, "НЭБ returned no results" + assert all(r["source"] == "rusneb" for r in results) + assert _has_title(results, "война"), f"titles={_titles(results)}" + assert 
_has_author(results, "толст"), f"authors={_authors(results)}" + + +# ── Alib ───────────────────────────────────────────────────────────────────── + + +def test_alib_voina_i_mir() -> None: + plugin = HtmlScraperPlugin( + plugin_id="alib_web", + name="Alib (web)", + rate_limiter=_RL, + rate_limit_seconds=0, + auto_queue=False, + timeout=_TIMEOUT, + config={ + "url": "https://www.alib.ru/find3.php4", + "search_param": "tfind", + "extra_params": {"f": "5", "s": "0"}, + "encoding": "cp1251", + "bold_text": True, + }, + ) + results = plugin.search("Война и мир Толстой") + assert results, "Alib returned no results" + assert all(r["source"] == "alib_web" for r in results) + assert _has_title(results, "война"), f"titles={_titles(results)}" + assert _has_author(results, "толст"), f"authors={_authors(results)}" + + +# ── НЛР (SRU) ──────────────────────────────────────────────────────────────── +# The NLR SRU endpoint (www.nlr.ru/search/query) no longer exists (HTTP 404). + + +@pytest.mark.xfail(reason="nlr.ru SRU endpoint no longer available (HTTP 404)", strict=False) +def test_nlr_voina_i_mir() -> None: + plugin = SRUCatalogPlugin( + plugin_id="nlr", + name="НЛР", + rate_limiter=_RL, + rate_limit_seconds=0, + auto_queue=False, + timeout=_TIMEOUT, + config={ + "url": "http://www.nlr.ru/search/query", + "query_prefix": "title=", + }, + ) + results = plugin.search("Война и мир") + assert results, "НЛР returned no results" + assert all(r["source"] == "nlr" for r in results) + assert _has_title(results, "война"), f"titles={_titles(results)}" + + +# ── ШПИЛ ───────────────────────────────────────────────────────────────────── +# The ШПИЛ IRBIS64 CGI endpoint no longer exists (HTTP 404). 
+ + +@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False) +def test_shpl_voina_i_mir() -> None: + plugin = HtmlScraperPlugin( + plugin_id="shpl", + name="ШПИЛ", + rate_limiter=_RL, + rate_limit_seconds=0, + auto_queue=False, + timeout=_TIMEOUT, + config={ + "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe", + "search_param": "S21ALL", + "extra_params": { + "C21COM": "S", + "I21DBN": "BIBL", + "P21DBN": "BIBL", + "S21FMT": "briefWebRus", + "Z21ID": "", + }, + "brief_class": "brief", + }, + ) + results = plugin.search("Война и мир") + assert results, "ШПИЛ returned no results" + assert all(r["source"] == "shpl" for r in results) + assert _has_title(results, "война"), f"titles={_titles(results)}"