"""RSL (Russian State Library) search plugin (search.rsl.ru). The search API requires a POST to ``/site/ajax-search?language=ru`` with form-encoded body containing ``SearchFilterForm[search]`` and a CSRF token obtained from the main search page. Query syntax is CQL: ``title:() AND author:(<author words>)``. Results come back as an HTML fragment in the ``content`` key of a JSON envelope; individual records are identified by the CSS classes ``rsl-item-nocover-title`` (author) and ``rsl-item-nocover-descr`` (title). Both fields contain ``<b>`` highlight tags that are stripped before returning. """ import re from typing import Any import httpx from models import CandidateRecord from ..rate_limiter import RateLimiter _DOMAIN = "search.rsl.ru" _SEARCH_URL = "https://search.rsl.ru/site/ajax-search" _BASE_URL = "https://search.rsl.ru/ru/search" _YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b") def _strip_tags(html_frag: str) -> str: """Strip HTML tags and decode basic entities from a fragment.""" text = re.sub(r"<[^>]+>", "", html_frag) text = text.replace(""", '"').replace("&", "&").replace("<", "<").replace(">", ">") return re.sub(r"\s+", " ", text).strip() class RSLPlugin: """Archive searcher for search.rsl.ru. Formats the query as CQL ``title:(title_words) AND author:(author_word)`` by treating the first whitespace-delimited token as the author surname and the remainder as title keywords. When only one token is present, a plain ``title:(token) OR author:(token)`` query is used instead. """ category = "archive_searchers" def __init__( self, plugin_id: str, name: str, rate_limiter: RateLimiter, rate_limit_seconds: float, auto_queue: bool, timeout: int, config: dict[str, Any], ): self.plugin_id = plugin_id self.name = name self._rl = rate_limiter self.rate_limit_seconds = rate_limit_seconds self.auto_queue = auto_queue self.timeout = timeout def search(self, query: str) -> list[CandidateRecord]: """Search RSL for books matching query. Args: query: Free-text string; the first token is treated as the author surname and remaining tokens as title keywords. Returns: Up to three CandidateRecord dicts extracted from the RSL HTML response, with ``<b>`` highlight tags stripped. """ self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds) cql = self._build_cql(query) client = httpx.Client() # Fetch the main page to obtain a valid CSRF token. r0 = client.get(_BASE_URL, timeout=self.timeout, headers={"User-Agent": "Mozilla/5.0"}) csrf_match = re.search(r'name="_csrf"\s+value="([^"]+)"', r0.text) csrf = csrf_match.group(1) if csrf_match else "" r = client.post( _SEARCH_URL, params={"language": "ru"}, data={"SearchFilterForm[search]": cql, "_csrf": csrf}, timeout=self.timeout, headers={ "Accept": "application/json", "X-Requested-With": "XMLHttpRequest", "Referer": _BASE_URL, "User-Agent": "Mozilla/5.0", }, ) data: dict[str, Any] = r.json() content = str(data.get("content") or "") raw_titles = re.findall(r'rsl-item-nocover-descr[^"]*">(.*?)</div>', content)[:3] raw_authors = re.findall(r'rsl-item-nocover-title[^"]*">(.*?)</div>', content)[:3] years = _YEAR_RE.findall(content)[:3] out: list[CandidateRecord] = [] for i, raw_title in enumerate(raw_titles): title = _strip_tags(raw_title) if not title: continue author = _strip_tags(raw_authors[i]) if i < len(raw_authors) else "" out.append( CandidateRecord( source=self.plugin_id, title=title, author=author, year=years[i] if i < len(years) else "", isbn="", publisher="", ) ) return out @staticmethod def _build_cql(query: str) -> str: """Build a CQL query string for the RSL search API. Args: query: Raw query string, typically ``"Author Title keywords"``. Returns: CQL string in the form ``title:(…) AND author:(…)`` when the query contains multiple tokens, or ``title:(…) OR author:(…)`` for a single token. """ tokens = query.split() if len(tokens) > 1: author_part = tokens[0] title_part = " ".join(tokens[1:]) return f"title:({title_part}) AND author:({author_part})" token = tokens[0] if tokens else query return f"title:({token}) OR author:({token})"