- html_scraper: add img_alt strategy (НЭБ titles from <img alt>), bold_text strategy (Alib entries from <p><b>), Windows-1251 encoding support, _cls_inner_texts() helper that strips inner HTML tags - rsl: rewrite to POST SearchFilterForm[search] with CSRF token and CQL title:(words) AND author:(word) query format - config: update rusneb (img_alt + correct author_class) and alib_web (encoding + bold_text) to match fixed plugin strategies - tests: add tests/test_archives.py with network-marked tests for all six archive plugins; НЛР and ШПИЛ marked xfail (endpoints return HTTP 404) - presubmit: exclude network tests from default run (-m "not network") Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
141 lines
4.9 KiB
Python
141 lines
4.9 KiB
Python
"""RSL (Russian State Library) search plugin (search.rsl.ru).
|
|
|
|
The search API requires a POST to ``/site/ajax-search?language=ru`` with
|
|
form-encoded body containing ``SearchFilterForm[search]`` and a CSRF token
|
|
obtained from the main search page. Query syntax is CQL:
|
|
``title:(<title words>) AND author:(<author words>)``.
|
|
|
|
Results come back as an HTML fragment in the ``content`` key of a JSON
|
|
envelope; individual records are identified by the CSS classes
|
|
``rsl-item-nocover-title`` (author) and ``rsl-item-nocover-descr`` (title).
|
|
Both fields contain ``<b>`` highlight tags that are stripped before returning.
|
|
"""
|
|
|
|
import re
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
from models import CandidateRecord
|
|
|
|
from ..rate_limiter import RateLimiter
|
|
|
|
# Host name handed to the rate limiter's ``wait_and_record`` so that requests
# to RSL are throttled under a single key.
_DOMAIN = "search.rsl.ru"
|
|
# Endpoint that receives the POSTed search form (called with ``language=ru``)
# and whose JSON response is parsed for results.
_SEARCH_URL = "https://search.rsl.ru/site/ajax-search"
|
|
# Search page fetched first to scrape the ``_csrf`` token; also sent as the
# ``Referer`` header of the POST.
_BASE_URL = "https://search.rsl.ru/ru/search"
|
|
# Stand-alone four-digit numbers in the ranges 1000-1999 and 2000-2029,
# used to pick candidate publication years out of the result HTML.
_YEAR_RE = re.compile(r"\b(1[0-9]{3}|20[012][0-9])\b")
|
|
|
|
|
|
def _strip_tags(html_frag: str) -> str:
|
|
"""Strip HTML tags and decode basic entities from a fragment."""
|
|
text = re.sub(r"<[^>]+>", "", html_frag)
|
|
text = text.replace(""", '"').replace("&", "&").replace("<", "<").replace(">", ">")
|
|
return re.sub(r"\s+", " ", text).strip()
|
|
|
|
|
|
class RSLPlugin:
    """Archive searcher for search.rsl.ru.

    Formats the query as CQL ``title:(title_words) AND author:(author_word)``
    by treating the first whitespace-delimited token as the author surname and
    the remainder as title keywords.  When only one token is present, a plain
    ``title:(token) OR author:(token)`` query is used instead.
    """

    category = "archive_searchers"

    # Upper bound on the number of records extracted from one response.
    _MAX_RESULTS = 3
    _USER_AGENT = "Mozilla/5.0"

    # Hidden form field carrying the CSRF token on the search page.
    _CSRF_RE = re.compile(r'name="_csrf"\s+value="([^"]+)"')
    # Inner HTML of the <div>s holding the title and author of a record.
    _TITLE_RE = re.compile(r'rsl-item-nocover-descr[^"]*">(.*?)</div>')
    _AUTHOR_RE = re.compile(r'rsl-item-nocover-title[^"]*">(.*?)</div>')

    def __init__(
        self,
        plugin_id: str,
        name: str,
        rate_limiter: RateLimiter,
        rate_limit_seconds: float,
        auto_queue: bool,
        timeout: int,
        config: dict[str, Any],
    ):
        """Store the plugin settings.

        Args:
            plugin_id: Identifier written to the ``source`` field of every
                returned record.
            name: Plugin name.
            rate_limiter: Limiter consulted before each search.
            rate_limit_seconds: Interval passed to the limiter for this host.
            auto_queue: Stored as-is; not read by this class.
            timeout: Timeout passed to every HTTP request.
            config: Accepted as part of the constructor signature; this
                class does not read it.
        """
        self.plugin_id = plugin_id
        self.name = name
        self._rl = rate_limiter
        self.rate_limit_seconds = rate_limit_seconds
        self.auto_queue = auto_queue
        self.timeout = timeout

    def search(self, query: str) -> list[CandidateRecord]:
        """Search RSL for books matching query.

        Args:
            query: Free-text string; the first token is treated as the author
                surname and remaining tokens as title keywords.

        Returns:
            Up to three CandidateRecord dicts extracted from the RSL HTML
            response, with ``<b>`` highlight tags stripped.  A blank query
            returns an empty list without contacting the server.

        Raises:
            Errors raised by httpx while sending the requests or decoding the
            JSON response are not caught here and propagate to the caller.
        """
        if not query.strip():
            # Nothing to search for: skip the rate-limit slot and the two
            # HTTP round-trips an empty CQL query would otherwise cost.
            return []

        self._rl.wait_and_record(_DOMAIN, self.rate_limit_seconds)
        cql = self._build_cql(query)

        # One client for both requests so cookies set by the search page are
        # sent back with the POST; the context manager closes the connection
        # pool even when a request raises.
        with httpx.Client() as client:
            # Fetch the main page to obtain a valid CSRF token.
            landing = client.get(
                _BASE_URL,
                timeout=self.timeout,
                headers={"User-Agent": self._USER_AGENT},
            )
            csrf_match = self._CSRF_RE.search(landing.text)
            csrf = csrf_match.group(1) if csrf_match else ""

            response = client.post(
                _SEARCH_URL,
                params={"language": "ru"},
                data={"SearchFilterForm[search]": cql, "_csrf": csrf},
                timeout=self.timeout,
                headers={
                    "Accept": "application/json",
                    "X-Requested-With": "XMLHttpRequest",
                    "Referer": _BASE_URL,
                    "User-Agent": self._USER_AGENT,
                },
            )
            payload = response.json()

        # The result HTML lives under the "content" key of the envelope; any
        # other JSON shape is treated as "no results".
        content = str(payload.get("content") or "") if isinstance(payload, dict) else ""

        limit = self._MAX_RESULTS
        raw_titles = self._TITLE_RE.findall(content)[:limit]
        raw_authors = self._AUTHOR_RE.findall(content)[:limit]
        # NOTE(review): years are taken positionally from the whole fragment,
        # not per record, so years[i] is only a best-effort match for
        # record i (e.g. a year inside an earlier title shifts the rest).
        years = _YEAR_RE.findall(content)[:limit]

        out: list[CandidateRecord] = []
        for i, raw_title in enumerate(raw_titles):
            title = _strip_tags(raw_title)
            if not title:
                continue
            author = _strip_tags(raw_authors[i]) if i < len(raw_authors) else ""
            out.append(
                CandidateRecord(
                    source=self.plugin_id,
                    title=title,
                    author=author,
                    year=years[i] if i < len(years) else "",
                    isbn="",
                    publisher="",
                )
            )
        return out

    @staticmethod
    def _build_cql(query: str) -> str:
        """Build a CQL query string for the RSL search API.

        Args:
            query: Raw query string, typically ``"Author Title keywords"``.

        Returns:
            CQL string in the form ``title:(…) AND author:(…)`` when the query
            contains multiple tokens, or ``title:(…) OR author:(…)`` for a
            single token.  A query with no tokens is inserted verbatim into
            the single-token form.
        """
        tokens = query.split()
        if len(tokens) > 1:
            author_part, *title_words = tokens
            title_part = " ".join(title_words)
            return f"title:({title_part}) AND author:({author_part})"
        token = tokens[0] if tokens else query
        return f"title:({token}) OR author:({token})"
|