Fix archive plugins for НЭБ and Alib; add network integration tests

- html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text strategy (Alib entries from <p><b>), Windows-1251 encoding support, _cls_inner_texts() helper that strips inner HTML tags
- rsl: rewrite to POST SearchFilterForm[search] with CSRF token and CQL title:(words) AND author:(word) query format
- config: update rusneb (img_alt + correct author_class) and alib_web (encoding + bold_text) to match fixed plugin strategies
- tests: add tests/test_archives.py with network-marked tests for all six archive plugins; НЛР and ШПИЛ marked xfail (endpoints return HTTP 404)
- presubmit: exclude network tests from default run (-m "not network")

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-09 22:59:19 +03:00
parent ce03046e51
commit b8f82607f9
6 changed files with 458 additions and 42 deletions
--- a/src/plugins/archives/html_scraper.py
+++ b/src/plugins/archives/html_scraper.py
@@ -2,7 +2,7 @@

 import re
 from typing import Any
-from urllib.parse import urlparse
+from urllib.parse import quote, urlparse

 import httpx

@@ -12,21 +12,78 @@ from ..rate_limiter import RateLimiter

 _YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")

+# Matches "Surname I.N. " or "Surname I. " at the start of an entry.
+_AUTHOR_PREFIX_PAT = re.compile(r"^(\S+\s+(?:[А-ЯЁA-Z]\.){1,3}\s*)(.+)", re.DOTALL)
+

 def _cls_re(cls_frag: str, min_len: int = 3, max_len: int = 120) -> re.Pattern[str]:
-    return re.compile(rf'class="[^"]*{re.escape(cls_frag)}[^"]*"[^>]*>([^<]{{{min_len},{max_len}}})<')
+    # Support both single and double-quoted class attributes.
+    return re.compile(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>([^<]{{{min_len},{max_len}}})<')
+
+
+def _cls_inner_texts(html: str, cls_frag: str, min_len: int = 3, max_len: int = 80) -> list[str]:
+    """Extract text content from elements whose class contains cls_frag.
+
+    Strips inner HTML tags and normalises whitespace, so elements like
+    ``<span class='…'><b>Name</b> I.N.</span>`` work correctly.
+
+    Args:
+        html: Raw HTML string to search.
+        cls_frag: Substring that must appear in the class attribute value.
+        min_len: Minimum length of extracted text to keep.
+        max_len: Maximum length of extracted text to keep.
+
+    Returns:
+        Up to three non-empty text strings in document order.
+    """
+    raw = re.findall(rf'class=["\'][^"\']*{re.escape(cls_frag)}[^"\']*["\'][^>]*>(.*?)</', html, re.DOTALL)
+    out: list[str] = []
+    for m in raw:
+        text = re.sub(r"<[^>]+>", "", m)
+        text = re.sub(r"\s+", " ", text).strip()
+        if min_len <= len(text) <= max_len:
+            out.append(text)
+        if len(out) == 3:
+            break
+    return out
+
+
+def _img_alts(html: str, min_len: int = 5, max_len: int = 120) -> list[str]:
+    """Extract non-empty alt attributes from <img> tags, normalising whitespace.
+
+    Args:
+        html: Raw HTML string to search.
+        min_len: Minimum character length to include.
+        max_len: Maximum character length to include.
+
+    Returns:
+        Up to three non-empty, whitespace-normalised alt strings.
+    """
+    alts = re.findall(r'<img[^>]+alt=[\'"]([^\'"]+)[\'"]', html)
+    out: list[str] = []
+    for a in alts:
+        text = re.sub(r"\s+", " ", a).strip()
+        if min_len <= len(text) <= max_len:
+            out.append(text)
+        if len(out) == 3:
+            break
+    return out


 class HtmlScraperPlugin:
-    """
-    Config-driven HTML scraper. Supported config keys:
-      url           — search URL
-      search_param  — query param name
-      extra_params  — dict of fixed extra query parameters
-      title_class   — CSS class fragment for title elements (class-based strategy)
-      author_class  — CSS class fragment for author elements
-      link_href_pattern — href regex to find title <a> links (link strategy, e.g. alib)
-      brief_class   — CSS class for brief record rows (brief strategy, e.g. shpl)
+    """Config-driven HTML scraper.
+
+    Supported config keys:
+      url               — search URL
+      search_param      — query param name
+      extra_params      — dict of fixed extra query parameters
+      encoding          — character encoding for query and response (e.g. "cp1251")
+      title_class       — CSS class fragment for title elements (class-based strategy)
+      author_class      — CSS class fragment for author elements
+      link_href_pattern — href regex to find title <a> links (link strategy)
+      brief_class       — CSS class for brief record rows (brief strategy)
+      img_alt           — truthy: extract titles from <img alt> attributes (rusneb strategy)
+      bold_text         — truthy: extract author/title from <p><b>…</b> blocks (alib strategy)
    """

    category = "archive_searchers"
@@ -51,30 +108,118 @@ class HtmlScraperPlugin:
        self._domain: str = urlparse(str(config.get("url") or "")).netloc or plugin_id

    def search(self, query: str) -> list[CandidateRecord]:
+        """Search for books matching query.
+
+        Args:
+            query: Free-text search string (author, title, keywords).
+
+        Returns:
+            Up to three CandidateRecord dicts with source, title, author, year,
+            isbn, and publisher fields.
+        """
        cfg = self.config
        self._rl.wait_and_record(self._domain, self.rate_limit_seconds)
-        params: dict[str, Any] = dict(cfg.get("extra_params") or {})
-        params[cfg["search_param"]] = query
-        r = httpx.get(
-            cfg["url"],
-            params=params,
-            timeout=self.timeout,
-            headers={"User-Agent": "Mozilla/5.0"},
-        )
-        html = r.text
+
+        encoding = str(cfg.get("encoding") or "")
+        if encoding:
+            # Encode query and extra params in the site's native encoding.
+            q_enc = quote(query.encode(encoding, "replace"))
+            ep: dict[str, Any] = dict(cfg.get("extra_params") or {})
+            ep_parts = [f"{k}={quote(str(v).encode(encoding, 'replace'))}" for k, v in ep.items()]
+            raw_qs = "&".join([f'{cfg["search_param"]}={q_enc}'] + ep_parts)
+            r = httpx.get(
+                f'{cfg["url"]}?{raw_qs}',
+                timeout=self.timeout,
+                headers={"User-Agent": "Mozilla/5.0"},
+            )
+            html = r.content.decode(encoding, errors="replace")
+        else:
+            params: dict[str, Any] = dict(cfg.get("extra_params") or {})
+            params[cfg["search_param"]] = query
+            r = httpx.get(
+                cfg["url"],
+                params=params,
+                timeout=self.timeout,
+                headers={"User-Agent": "Mozilla/5.0"},
+            )
+            html = r.text
+
        years = _YEAR_RE.findall(html)

-        # Strategy: link_href_pattern (alib-style)
+        if cfg.get("bold_text"):
+            return self._parse_bold_text(html, years)
+        if cfg.get("img_alt"):
+            return self._parse_img_alt(html, years, cfg)
        if "link_href_pattern" in cfg:
            return self._parse_link(html, years, cfg)
-
-        # Strategy: brief_class (shpl-style)
        if "brief_class" in cfg:
            return self._parse_brief(html, years, cfg)
-
-        # Strategy: title_class + author_class (rusneb-style)
        return self._parse_class(html, years, cfg)

+    def _parse_bold_text(self, html: str, years: list[str]) -> list[CandidateRecord]:
+        """Extract records from ``<p><b>text</b>`` entries (Alib-style).
+
+        The bold text is expected to begin with ``Surname I.N. Title…``; the
+        author prefix is split off with ``_AUTHOR_PREFIX_PAT`` if possible.
+
+        Args:
+            html: Decoded HTML response.
+            years: Year strings found in the full HTML (used positionally).
+
+        Returns:
+            Up to three CandidateRecord dicts.
+        """
+        entries = re.findall(r"<p><b>([^<]{5,200})</b>", html)[:3]
+        out: list[CandidateRecord] = []
+        for i, entry in enumerate(entries):
+            text = entry.strip()
+            m = _AUTHOR_PREFIX_PAT.match(text)
+            if m:
+                author = m.group(1).strip()
+                title = m.group(2).strip()
+            else:
+                author = ""
+                title = text
+            out.append(
+                CandidateRecord(
+                    source=self.plugin_id,
+                    title=title,
+                    author=author,
+                    year=years[i] if i < len(years) else "",
+                    isbn="",
+                    publisher="",
+                )
+            )
+        return out
+
+    def _parse_img_alt(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
+        """Extract records using ``<img alt>`` for titles and a CSS class for authors.
+
+        Used for sites like rusneb.ru where thumbnail alt attributes carry the
+        book title and a separate span contains the author.
+
+        Args:
+            html: Decoded HTML response.
+            years: Year strings found in the full HTML (used positionally).
+            cfg: Plugin config dict (reads ``author_class``).
+
+        Returns:
+            Up to three CandidateRecord dicts.
+        """
+        titles = _img_alts(html)
+        authors = _cls_inner_texts(html, cfg.get("author_class", "author"), 3, 80)
+        return [
+            CandidateRecord(
+                source=self.plugin_id,
+                title=title,
+                author=authors[i] if i < len(authors) else "",
+                year=years[i] if i < len(years) else "",
+                isbn="",
+                publisher="",
+            )
+            for i, title in enumerate(titles)
+        ]
+
    def _parse_class(self, html: str, years: list[str], cfg: dict[str, Any]) -> list[CandidateRecord]:
        titles = _cls_re(cfg.get("title_class", "title")).findall(html)[:3]
        authors = _cls_re(cfg.get("author_class", "author"), 3, 80).findall(html)[:3]