bookshelf/src/plugins/archives/rsl.py

"""RSL (Russian State Library) search plugin (search.rsl.ru).

The search API requires a POST to ``/site/ajax-search?language=ru`` with
form-encoded body containing ``SearchFilterForm[search]`` and a CSRF token
obtained from the main search page.  Query syntax is CQL:
``title:(<title words>) AND author:(<author words>)``.

Results come back as an HTML fragment in the ``content`` key of a JSON
envelope; individual records are identified by the CSS classes
``rsl-item-nocover-title`` (author) and ``rsl-item-nocover-descr`` (title).
Both fields contain ``<b>`` highlight tags that are stripped before returning.
"""

import html
import re
from typing import Any

import httpx

from models import CandidateRecord

from ..rate_limiter import RateLimiter

# Domain string passed to the rate limiter's ``wait_and_record`` call.
_DOMAIN = "search.rsl.ru"
# Endpoint that receives the form-encoded search POST.
_SEARCH_URL = "https://search.rsl.ru/site/ajax-search"
# Search page fetched first to scrape the CSRF token; also sent as the Referer.
_BASE_URL = "https://search.rsl.ru/ru/search"
# Whole-word four-digit numbers in the ranges 1000-1999 and 2000-2029.
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")


def _strip_tags(html_frag: str) -> str:
    """Strip HTML tags and decode basic entities from a fragment."""
    text = re.sub(r"<[^>]+>", "", html_frag)
    text = text.replace("&quot;", '"').replace("&amp;", "&").replace("&lt;", "<").replace("&gt;", ">")
    return re.sub(r"\s+", " ", text).strip()


class RSLPlugin:
    """Archive searcher for search.rsl.ru.

    Formats the query as CQL ``title:(title_words) AND author:(author_word)``
    by treating the first whitespace-delimited token as the author surname and
    the remainder as title keywords.  When only one token is present, a plain
    ``title:(token) OR author:(token)`` query is used instead.
    """

    category = "archive_searchers"

    # User-Agent header value sent with both the CSRF fetch and the search POST.
    _USER_AGENT = "Mozilla/5.0"
    # Maximum number of records taken from one response.
    _MAX_RESULTS = 3
    # Hidden ``_csrf`` form field on the search page.
    _CSRF_RE = re.compile(r'name="_csrf"\s+value="([^"]+)"')
    # DOTALL lets a record whose markup spans several lines still match;
    # without it such a record is skipped and the title/author pairing shifts.
    _TITLE_RE = re.compile(r'rsl-item-nocover-descr[^"]*">(.*?)</div>', re.DOTALL)
    _AUTHOR_RE = re.compile(r'rsl-item-nocover-title[^"]*">(.*?)</div>', re.DOTALL)

    def __init__(
        self,
        plugin_id: str,
        name: str,
        rate_limiter: RateLimiter,
        rate_limit_seconds: float,
        auto_queue: bool,
        timeout: int,
        config: dict[str, Any],
    ):
        """Store the plugin settings.

        Args:
            plugin_id: Identifier written into the ``source`` field of every
                record returned by :meth:`search`.
            name: Plugin name; stored as ``self.name``.
            rate_limiter: Object whose ``wait_and_record`` method is called
                before each search.
            rate_limit_seconds: Value passed to ``wait_and_record`` together
                with the RSL domain.
            auto_queue: Stored as ``self.auto_queue``; not read by this class.
            timeout: Timeout passed to each HTTP request.
            config: Accepted but not used or stored by this class.
        """
        self.plugin_id = plugin_id
        self.name = name
        self._rl = rate_limiter
        self.rate_limit_seconds = rate_limit_seconds
        self.auto_queue = auto_queue
        self.timeout = timeout

    def search(self, query: str) -> list[CandidateRecord]:
        """Search RSL for books matching query.

        Errors raised by httpx or by decoding the JSON response are not caught
        here and propagate to the caller.

        Args:
            query: Free-text string; the first token is treated as the author
                surname and remaining tokens as title keywords.

        Returns:
            Up to three CandidateRecord dicts extracted from the RSL HTML
            response, with ``<b>`` highlight tags stripped.  A blank query
            returns an empty list without contacting the server.
        """
        # Nothing to search for: skip the rate-limit slot and both requests.
        if not query.strip():
            return []

        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        cql = self._build_cql(query)

        # One client for both requests so cookies set by the first are sent
        # with the second; the context manager closes it even if a request
        # raises, so no connection pool is leaked.
        with httpx.Client() as client:
            # Fetch the main page to obtain a valid CSRF token.
            page = client.get(
                _BASE_URL,
                timeout=self.timeout,
                headers={"User-Agent": self._USER_AGENT},
            )
            csrf_match = self._CSRF_RE.search(page.text)
            # Best effort: send an empty token if none was found on the page.
            csrf = csrf_match.group(1) if csrf_match else ""

            response = client.post(
                _SEARCH_URL,
                params={"language": "ru"},
                data={"SearchFilterForm[search]": cql, "_csrf": csrf},
                timeout=self.timeout,
                headers={
                    "Accept": "application/json",
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": _BASE_URL,
                    "User-Agent": self._USER_AGENT,
                },
            )
            data: dict[str, Any] = response.json()

        # The HTML fragment with the result list lives under "content".
        content = str(data.get("content") or "")

        limit = self._MAX_RESULTS
        raw_titles = self._TITLE_RE.findall(content)[:limit]
        raw_authors = self._AUTHOR_RE.findall(content)[:limit]
        # NOTE(review): years are the first year-like numbers anywhere in the
        # fragment, paired with records by position only; they are not tied to
        # the record they appear in.  Confirm against real responses.
        years = _YEAR_RE.findall(content)[:limit]

        out: list[CandidateRecord] = []
        for i, raw_title in enumerate(raw_titles):
            title = _strip_tags(raw_title)
            if not title:
                continue
            author = _strip_tags(raw_authors[i]) if i < len(raw_authors) else ""
            out.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=title,
                    author=author,
                    year=years[i] if i < len(years) else "",
                    isbn="",
                    publisher="",
                )
            )
        return out

    @staticmethod
    def _build_cql(query: str) -> str:
        """Build a CQL query string for the RSL search API.

        Tokens are inserted verbatim; no escaping of CQL syntax characters is
        performed.

        Args:
            query: Raw query string, typically ``"Author Title keywords"``.

        Returns:
            CQL string in the form ``title:(…) AND author:(…)`` when the query
            contains multiple tokens, or ``title:(…) OR author:(…)`` for a
            single token.
        """
        tokens = query.split()
        if len(tokens) > 1:
            author_part, *title_tokens = tokens
            title_part = " ".join(title_tokens)
            return f"title:({title_part}) AND author:({author_part})"
        # Zero or one token: search the same text in both fields.
        token = tokens[0] if tokens else query
        return f"title:({token}) OR author:({token})"