Fix archive plugins for НЭБ and Alib; add network integration tests

- html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text strategy (Alib entries from <p><b>), Windows-1251 encoding support, and a _cls_inner_texts() helper that strips inner HTML tags
- rsl: rewrite to POST SearchFilterForm[search] with a CSRF token and the CQL query format title:(words) AND author:(word)
- config: update rusneb (img_alt + correct author_class) and alib_web (encoding + bold_text) to match the fixed plugin strategies
- tests: add tests/test_archives.py with network-marked tests for all six archive plugins; НЛР and ШПИЛ are marked xfail (endpoints return HTTP 404)
- presubmit: exclude network tests from the default run (-m "not network")

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 22:59:19 +03:00
parent ce03046e51
commit b8f82607f9
6 changed files with 458 additions and 42 deletions
--- a/config/functions.default.yaml
+++ b/config/functions.default.yaml
@@ -64,8 +64,8 @@ functions:
      config:
        url: "https://rusneb.ru/search/"
        search_param: q
-        title_class: "title"
+        img_alt: true
-        author_class: "author"
+        author_class: "search-list__item_subtext"
    alib_web:
      name: "Alib (web)"
@@ -77,8 +77,8 @@ functions:
        url: "https://www.alib.ru/find3.php4"
        search_param: tfind
        extra_params: {f: "5", s: "0"}
-        link_href_pattern: "t[a-z]+\\.phtml"
+        encoding: "cp1251"
-        author_class: "aut"
+        bold_text: true
    nlr:
      name: "НЛР"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,6 +56,7 @@ include = ["src", "tests", "scripts"]
 [tool.pytest.ini_options]
 pythonpath = ["src"]
 markers = ["network: live HTTP requests to external services (deselect with -m 'not network')"]
 [build-system]
 requires = ["poetry-core"]
--- a/scripts/presubmit.py
+++ b/scripts/presubmit.py
@@ -24,7 +24,7 @@ def presubmit():
        ["black", "--check", "."],
        ["flake8", "."],
        ["pyright"],
-        ["pytest", "tests/"],
+        ["pytest", "tests/", "-m", "not network"],
        # JS: tests run via Node built-in runner (no npm packages needed)
        ["node", "--test", "tests/js/pure-functions.test.js"],
    ]
--- a/src/plugins/archives/html_scraper.py
+++ b/src/plugins/archives/html_scraper.py
@@ -2,7 +2,7 @@
 import re
 from typing import Any
-from urllib.parse import urlparse
+from urllib.parse import quote, urlparse
 import httpx
@@ -12,21 +12,78 @@ from ..rate_limiter import RateLimiter
 _YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
 # Matches "Surname I.N. " or "Surname I. " at the start of an entry.
 _AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
 def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
-    return re.compile(rf'class="[^"]*{re.escape(cls_frag)}[^"]*"[^>]*>([^<]{{{min_len},{max_len}}})<')
+    # Support both single and double-quoted class attributes.
    return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
 def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
    """Extract text content from elements whose class contains cls_frag.
    Strips inner HTML tags and normalises whitespace, so elements like
    ``<span class='…'><b>Name</b> I.N.</span>`` work correctly.
    Args:
        html: Raw HTML string to search.
        cls_frag: Substring that must appear in the class attribute value.
        min_len: Minimum length of extracted text to keep.
        max_len: Maximum length of extracted text to keep.
    Returns:
        Up to three non-empty text strings in document order.
    """
    raw = re.findall(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>(.*?)</', html, re.DOTALL)
    out: list[str] = []
    for m in raw:
        text = re.sub(r"<[^>]+>", "", m)
        text = re.sub(r"\s+", " ", text).strip()
        if min_len <= len(text) <= max_len:
            out.append(text)
        if len(out) == 3:
            break
    return out
 def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
    """Extract non-empty alt attributes from <img> tags, normalising whitespace.
    Args:
        html: Raw HTML string to search.
        min_len: Minimum character length to include.
        max_len: Maximum character length to include.
    Returns:
        Up to three non-empty, whitespace-normalised alt strings.
    """
    alts = re.findall(r'<img[^>]+alt=[\'"]([^\'"]+)[\'"]', html)
    out: list[str] = []
    for a in alts:
        text = re.sub(r"\s+", " ", a).strip()
        if min_len <= len(text) <= max_len:
            out.append(text)
        if len(out) == 3:
            break
    return out
 class HtmlScraperPlugin:
-    """
+    """Config-driven HTML scraper.
-    Config-driven HTML scraper. Supported config keys:
+
-      url           — search URL
+    Supported config keys:
-      search_param  — query param name
+      url               — search URL
-      extra_params  — dict of fixed extra query parameters
+      search_param      — query param name
-      title_class   — CSS class fragment for title elements (class-based strategy)
+      extra_params      — dict of fixed extra query parameters
-      author_class  — CSS class fragment for author elements
+      encoding          — character encoding for query and response (e.g. "cp1251")
-      link_href_pattern — href regex to find title <a> links (link strategy, e.g. alib)
+      title_class       — CSS class fragment for title elements (class-based strategy)
-      brief_class   — CSS class for brief record rows (brief strategy, e.g. shpl)
+      author_class      — CSS class fragment for author elements
      link_href_pattern — href regex to find title <a> links (link strategy)
      brief_class       — CSS class for brief record rows (brief strategy)
      img_alt           — truthy: extract titles from <img alt> attributes (rusneb strategy)
      bold_text         — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
    """
    category = "archive_searchers"
@@ -51,30 +108,118 @@ class HtmlScraperPlugin:
        self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id
    def search(self, query: str) -> list[CandidateRecord]:
        """Search for books matching query.
        Args:
            query: Free-text search string (author, title, keywords).
        Returns:
            Up to three CandidateRecord dicts with source, title, author, year,
            isbn, and publisher fields.
        """
        cfg = self.config
        self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
-        params: dict[str, Any] = dict(cfg.get("extra_params") or {})
+
-        params[cfg["search_param"]] = query
+        encoding = str(cfg.get("encoding") or "")
-        r = httpx.get(
+        if encoding:
-            cfg["url"],
+            # Encode query and extra params in the site's native encoding.
-            params=params,
+            q_enc = quote(query.encode(encoding, "replace"))
-            timeout=self.timeout,
+            ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
-            headers={"User-Agent": "Mozilla/5.0"},
+            ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
-        )
+            raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
-        html = r.text
+            r = httpx.get(
                f'{cfg["url"]}?{raw_qs}',
                timeout=self.timeout,
                headers={"User-Agent": "Mozilla/5.0"},
            )
            html = r.content.decode(encoding, errors="replace")
        else:
            params: dict[str, Any] = dict(cfg.get("extra_params") or {})
            params[cfg["search_param"]] = query
            r = httpx.get(
                cfg["url"],
                params=params,
                timeout=self.timeout,
                headers={"User-Agent": "Mozilla/5.0"},
            )
            html = r.text
        years = _YEAR_RE.findall(html)
-        # Strategy: link_href_pattern (alib-style)
+        if cfg.get("bold_text"):
            return self._parse_bold_text(html, years)
        if cfg.get("img_alt"):
            return self._parse_img_alt(html, years, cfg)
        if "link_href_pattern" in cfg:
            return self._parse_link(html, years, cfg)
        # Strategy: brief_class (shpl-style)
        if "brief_class" in cfg:
            return self._parse_brief(html, years, cfg)
        # Strategy: title_class + author_class (rusneb-style)
        return self._parse_class(html, years, cfg)
    def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
        """Build candidate records from ``<p><b>…</b>`` blocks (Alib-style listings).

        Each bold run is expected to read ``Surname I.N. Title…``.  When
        ``_AUTHOR_PREFIX_PAT`` matches, its first group becomes the author and
        its second group the title; otherwise the whole text is used as the
        title and the author is left empty.

        Args:
            html: Decoded HTML response.
            years: Year strings found anywhere in the page; the i-th year is
                paired with the i-th entry purely by position.

        Returns:
            At most three CandidateRecord dicts, in document order.
        """
        records: list[CandidateRecord] = []
        bold_runs = re.findall(r"<p><b>([^<]{5,200})</b>", html)
        for idx, raw in enumerate(bold_runs[:3]):
            cleaned = raw.strip()
            prefix = _AUTHOR_PREFIX_PAT.match(cleaned)
            if prefix is None:
                # No recognisable "Surname I.N." prefix: keep everything as the title.
                who, what = "", cleaned
            else:
                who, what = prefix.group(1).strip(), prefix.group(2).strip()
            records.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=what,
                    author=who,
                    year=years[idx] if idx < len(years) else "",
                    isbn="",
                    publisher="",
                )
            )
        return records
    def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        """Build candidate records from ``<img alt>`` titles and class-selected authors.

        Intended for result pages (e.g. rusneb.ru) where the thumbnail's alt
        text carries the book title and the author sits in a separate element
        selected by CSS class.  Titles, authors and years are paired purely by
        their position in the page.

        Args:
            html: Decoded HTML response.
            years: Year strings found in the full HTML (used positionally).
            cfg: Plugin config dict; ``author_class`` is read, defaulting to
                ``"author"``.

        Returns:
            One CandidateRecord per extracted title (at most three).
        """
        found_titles = _img_alts(html)
        found_authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
        records: list[CandidateRecord] = []
        for idx, book_title in enumerate(found_titles):
            # Missing authors/years fall back to an empty string.
            who = found_authors[idx] if idx < len(found_authors) else ""
            when = years[idx] if idx < len(years) else ""
            records.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=book_title,
                    author=who,
                    year=when,
                    isbn="",
                    publisher="",
                )
            )
        return records
    def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]
--- a/src/plugins/archives/rsl.py
+++ b/src/plugins/archives/rsl.py
@@ -1,5 +1,17 @@
-"""RSL (Russian State Library) AJAX JSON search API plugin (search.rsl.ru)."""
+"""RSL (Russian State Library) search plugin (search.rsl.ru).
 The search API requires a POST to ``/site/ajax-search?language=ru`` with
 form-encoded body containing ``SearchFilterForm[search]`` and a CSRF token
 obtained from the main search page.  Query syntax is CQL:
 ``title:(<title words>) AND author:(<author words>)``.
 Results come back as an HTML fragment in the ``content`` key of a JSON
 envelope; individual records are identified by the CSS classes
 ``rsl-item-nocover-title`` (author) and ``rsl-item-nocover-descr`` (title).
 Both fields contain ``<b>`` highlight tags that are stripped before returning.
 """
 import re
 from typing import Any
 import httpx
@@ -9,9 +21,27 @@ from models import CandidateRecord
 from ..rate_limiter import RateLimiter
 _DOMAIN = "search.rsl.ru"
 _SEARCH_URL = "https://search.rsl.ru/site/ajax-search"
 _BASE_URL = "https://search.rsl.ru/ru/search"
 _YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
 def _strip_tags(html_frag: str) -> str:
    """Strip HTML tags and decode basic entities from a fragment."""
    text = re.sub(r"<[^>]+>", "", html_frag)
    text = text.replace("&quot;", '"').replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
    return re.sub(r"\s+", " ", text).strip()
 class RSLPlugin:
    """Archive searcher for search.rsl.ru.
    Formats the query as CQL ``title:(title_words) AND author:(author_word)``
    by treating the first whitespace-delimited token as the author surname and
    the remainder as title keywords.  When only one token is present, a plain
    ``title:(token) OR author:(token)`` query is used instead.
    """
    category = "archive_searchers"
    def __init__(
@@ -32,28 +62,79 @@ class RSLPlugin:
        self.timeout = timeout
    def search(self, query: str) -> list[CandidateRecord]:
        """Search RSL for books matching query.
        Args:
            query: Free-text string; the first token is treated as the author
                surname and remaining tokens as title keywords.
        Returns:
            Up to three CandidateRecord dicts extracted from the RSL HTML
            response, with ``<b>`` highlight tags stripped.
        """
        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
-        r = httpx.get(
+
-            "https://search.rsl.ru/site/ajax-search",
+        cql = self._build_cql(query)
-            params={"language": "ru", "q": query, "page": 1, "perPage": 5},
+        client = httpx.Client()
        # Fetch the main page to obtain a valid CSRF token.
        r0 = client.get(_BASE_URL, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"})
        csrf_match = re.search(r'name="_csrf"\s+value="([^"]+)"', r0.text)
        csrf = csrf_match.group(1) if csrf_match else ""
        r = client.post(
            _SEARCH_URL,
            params={"language": "ru"},
            data={"SearchFilterForm[search]": cql, "_csrf": csrf},
            timeout=self.timeout,
-            headers={"Accept": "application/json"},
+            headers={
                "Accept": "application/json",
                "X-Requested-With": "XMLHttpRequest",
                "Referer": _BASE_URL,
                "User-Agent": "Mozilla/5.0",
            },
        )
        data: dict[str, Any] = r.json()
-        records: list[dict[str, Any]] = data.get("records") or data.get("items") or data.get("data") or []
+        content = str(data.get("content") or "")
        raw_titles = re.findall(r'rsl-item-nocover-descr[^"]*">(.*?)</div>', content)[:3]
        raw_authors = re.findall(r'rsl-item-nocover-title[^"]*">(.*?)</div>', content)[:3]
        years = _YEAR_RE.findall(content)[:3]
        out: list[CandidateRecord] = []
-        for rec in records[:3]:
+        for i, raw_title in enumerate(raw_titles):
-            title = (str(rec.get("title") or rec.get("name") or "")).strip()
+            title = _strip_tags(raw_title)
            if not title:
                continue
            author = _strip_tags(raw_authors[i]) if i < len(raw_authors) else ""
            out.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=title,
-                    author=(str(rec.get("author") or rec.get("authors") or "")).strip(),
+                    author=author,
-                    year=str(rec.get("year") or rec.get("pubyear") or "").strip(),
+                    year=years[i] if i < len(years) else "",
-                    isbn=(str(rec.get("isbn") or "")).strip(),
+                    isbn="",
-                    publisher=(str(rec.get("publisher") or "")).strip(),
+                    publisher="",
                )
            )
        return out
    @staticmethod
    def _build_cql(query: str) -> str:
        """Build a CQL query string for the RSL search API.
        Args:
            query: Raw query string, typically ``"Author Title keywords"``.
        Returns:
            CQL string in the form ``title:(…) AND author:(…)`` when the query
            contains multiple tokens, or ``title:(…) OR author:(…)`` for a
            single token.
        """
        tokens = query.split()
        if len(tokens) > 1:
            author_part = tokens[0]
            title_part = " ".join(tokens[1:])
            return f"title:({title_part}) AND author:({author_part})"
        token = tokens[0] if tokens else query
        return f"title:({token}) OR author:({token})"
--- a/tests/test_archives.py
+++ b/tests/test_archives.py
@@ -0,0 +1,189 @@
 """Network integration tests for archive searcher plugins.
 Each test queries a live external service for "War and Peace" by Tolstoy,
 a book universally catalogued in all supported archives.
 Run with:  pytest tests/ -m network
 Skip with: pytest tests/ -m "not network"  (default in presubmit)
 """
 import pytest
 from models import CandidateRecord
 from plugins.archives.html_scraper import HtmlScraperPlugin
 from plugins.archives.openlibrary import OpenLibraryPlugin
 from plugins.archives.rsl import RSLPlugin
 from plugins.archives.sru_catalog import SRUCatalogPlugin
 from plugins.rate_limiter import RateLimiter
 pytestmark = pytest.mark.network
 _RL = RateLimiter()
 _TIMEOUT = 15
 def _titles(results: list[CandidateRecord]) -> list[str]:
    return [r["title"] for r in results]
 def _authors(results: list[CandidateRecord]) -> list[str]:
    return [r["author"] for r in results]
 def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
    """Return True if any result title contains fragment (case-insensitive)."""
    low = fragment.lower()
    return any(low in r["title"].lower() for r in results)
 def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
    """Return True if any result author contains fragment (case-insensitive)."""
    low = fragment.lower()
    return any(low in r["author"].lower() for r in results)
 # ── OpenLibrary ───────────────────────────────────────────────────────────────
 def test_openlibrary_war_and_peace() -> None:
    """Live OpenLibrary query: expects "War and Peace" with Tolstoy as an author."""
    searcher = OpenLibraryPlugin(
        plugin_id="openlibrary",
        name="OpenLibrary",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    found = searcher.search("War and Peace Tolstoy")
    assert found, "OpenLibrary returned no results"
    assert all(rec["source"] == "openlibrary" for rec in found)
    assert _has_title(found, "war and peace"), f"titles={_titles(found)}"
    # OpenLibrary stores authors in their original language, so either the
    # Latin or the Cyrillic spelling of the surname is accepted.
    author_found = _has_author(found, "tolstoy") or _has_author(found, "толст")
    assert author_found, f"authors={_authors(found)}"
 # ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
 def test_rsl_voina_i_mir() -> None:
    """Live RSL query (author surname first): expects a title containing "война"."""
    searcher = RSLPlugin(
        plugin_id="rsl",
        name="РГБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    found = searcher.search("Толстой Война и мир")
    assert found, "RSL returned no results"
    assert all(rec["source"] == "rsl" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
 # ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
 def test_rusneb_voina_i_mir() -> None:
    """Live НЭБ query via the img_alt strategy: expects the title and Tolstoy as author."""
    scraper_config = {
        "url": "https://rusneb.ru/search/",
        "search_param": "q",
        "img_alt": True,
        "author_class": "search-list__item_subtext",
    }
    searcher = HtmlScraperPlugin(
        plugin_id="rusneb",
        name="НЭБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config=scraper_config,
    )
    found = searcher.search("Война и мир Толстой")
    assert found, "НЭБ returned no results"
    assert all(rec["source"] == "rusneb" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
    assert _has_author(found, "толст"), f"authors={_authors(found)}"
 # ── Alib ─────────────────────────────────────────────────────────────────────
 def test_alib_voina_i_mir() -> None:
    """Live Alib query (cp1251 encoding, bold_text strategy): expects the title and Tolstoy as author."""
    scraper_config = {
        "url": "https://www.alib.ru/find3.php4",
        "search_param": "tfind",
        "extra_params": {"f": "5", "s": "0"},
        "encoding": "cp1251",
        "bold_text": True,
    }
    searcher = HtmlScraperPlugin(
        plugin_id="alib_web",
        name="Alib (web)",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=scraper_config,
    )
    found = searcher.search("Война и мир Толстой")
    assert found, "Alib returned no results"
    assert all(rec["source"] == "alib_web" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
    assert _has_author(found, "толст"), f"authors={_authors(found)}"
 # ── НЛР (SRU) ────────────────────────────────────────────────────────────────
 # The NLR SRU endpoint (www.nlr.ru/search/query) no longer exists (HTTP 404).
@pytest.mark.xfail(reason="nlr.ru SRU endpoint no longer available (HTTP 404)", strict=False)
def test_nlr_voina_i_mir() -> None:
    """Live НЛР SRU query; non-strict xfail, so an unexpected pass is not reported as a failure."""
    sru_config = {
        "url": "http://www.nlr.ru/search/query",
        "query_prefix": "title=",
    }
    searcher = SRUCatalogPlugin(
        plugin_id="nlr",
        name="НЛР",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=sru_config,
    )
    found = searcher.search("Война и мир")
    assert found, "НЛР returned no results"
    assert all(rec["source"] == "nlr" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"
 # ── ШПИЛ ─────────────────────────────────────────────────────────────────────
 # The ШПИЛ IRBIS64 CGI endpoint no longer exists (HTTP 404).
@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
def test_shpl_voina_i_mir() -> None:
    """Live ШПИЛ query via the brief_class strategy; non-strict xfail, so an unexpected pass is not a failure."""
    irbis_params = {
        "C21COM": "S",
        "I21DBN": "BIBL",
        "P21DBN": "BIBL",
        "S21FMT": "briefWebRus",
        "Z21ID": "",
    }
    scraper_config = {
        "url": "https://www.shpl.ru/cgi-bin/irbis64/cgiirbis_64.exe",
        "search_param": "S21ALL",
        "extra_params": irbis_params,
        "brief_class": "brief",
    }
    searcher = HtmlScraperPlugin(
        plugin_id="shpl",
        name="ШПИЛ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config=scraper_config,
    )
    found = searcher.search("Война и мир")
    assert found, "ШПИЛ returned no results"
    assert all(rec["source"] == "shpl" for rec in found)
    assert _has_title(found, "война"), f"titles={_titles(found)}"