Replace config-driven HtmlScraperPlugin with specific archive classes
Each archive scraper now has its own class with a hardcoded URL and parsing logic; config only carries auto_queue, timeout, and rate_limit_seconds.

- html_scraper: refactor to a base class with public shared utilities (YEAR_RE, AUTHOR_PREFIX_PAT, cls_inner_texts, img_alts)
- rusneb.py (new): RusnebPlugin extracts the year per list item rather than globally, eliminating wrong page-level dates
- alib.py (new): AlibPlugin extracts the year from within each <p><b> entry rather than globally, fixing nonsensical year values
- shpl.py (new): ShplPlugin retains the dead ШПИЛ endpoint with hardcoded params; config type updated from html_scraper to shpl
- config: remove the config: subsections from the rusneb, alib_web, and shpl entries; update their type fields to rusneb, alib_web, and shpl respectively
- plugins/__init__.py: register the new specific types, remove html_scraper
- tests: use the specific plugin classes; assert all CandidateRecord fields (source, title, author, year, isbn, publisher) with appropriate constraints

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,12 +7,16 @@ Run with: pytest tests/ -m network
|
||||
Skip with: pytest tests/ -m "not network" (default in presubmit)
|
||||
"""
|
||||
|
||||
import re
|
||||
|
||||
import pytest
|
||||
|
||||
from models import CandidateRecord
|
||||
from plugins.archives.html_scraper import HtmlScraperPlugin
|
||||
from plugins.archives.alib import AlibPlugin
|
||||
from plugins.archives.openlibrary import OpenLibraryPlugin
|
||||
from plugins.archives.rsl import RSLPlugin
|
||||
from plugins.archives.rusneb import RusnebPlugin
|
||||
from plugins.archives.shpl import ShplPlugin
|
||||
from plugins.archives.sru_catalog import SRUCatalogPlugin
|
||||
from plugins.rate_limiter import RateLimiter
|
||||
|
||||
@@ -21,6 +25,8 @@ pytestmark = pytest.mark.network
|
||||
_RL = RateLimiter()
|
||||
_TIMEOUT = 15
|
||||
|
||||
_YEAR_PAT = re.compile(r"^\d{4}$")
|
||||
|
||||
|
||||
def _titles(results: list[CandidateRecord]) -> list[str]:
|
||||
return [r["title"] for r in results]
|
||||
@@ -30,6 +36,10 @@ def _authors(results: list[CandidateRecord]) -> list[str]:
|
||||
return [r["author"] for r in results]
|
||||
|
||||
|
||||
def _years(results: list[CandidateRecord]) -> list[str]:
|
||||
return [r["year"] for r in results]
|
||||
|
||||
|
||||
def _has_title(results: list[CandidateRecord], fragment: str) -> bool:
|
||||
"""Return True if any result title contains fragment (case-insensitive)."""
|
||||
low = fragment.lower()
|
||||
@@ -42,6 +52,11 @@ def _has_author(results: list[CandidateRecord], fragment: str) -> bool:
|
||||
return any(low in r["author"].lower() for r in results)
|
||||
|
||||
|
||||
def _valid_year(year: str) -> bool:
|
||||
"""Return True if year is a 4-digit string or empty."""
|
||||
return year == "" or bool(_YEAR_PAT.match(year))
|
||||
|
||||
|
||||
# ── OpenLibrary ───────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -61,6 +76,10 @@ def test_openlibrary_war_and_peace() -> None:
|
||||
assert _has_title(results, "war and peace"), f"titles={_titles(results)}"
|
||||
# OpenLibrary stores authors in their original language; accept both forms.
|
||||
assert _has_author(results, "tolstoy") or _has_author(results, "толст"), f"authors={_authors(results)}"
|
||||
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
|
||||
# OpenLibrary returns isbn and publisher from its JSON API.
|
||||
assert all(isinstance(r["isbn"], str) for r in results)
|
||||
assert all(isinstance(r["publisher"], str) for r in results)
|
||||
|
||||
|
||||
# ── RSL (РГБ) ─────────────────────────────────────────────────────────────────
|
||||
@@ -80,57 +99,56 @@ def test_rsl_voina_i_mir() -> None:
|
||||
assert results, "RSL returned no results"
|
||||
assert all(r["source"] == "rsl" for r in results)
|
||||
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
||||
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
|
||||
assert all(r["isbn"] == "" for r in results)
|
||||
assert all(r["publisher"] == "" for r in results)
|
||||
|
||||
|
||||
# ── НЭБ (rusneb) ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_rusneb_voina_i_mir() -> None:
    """Search НЭБ for "Война и мир Толстой" and check every returned record.

    RusnebPlugin is constructed with an empty ``config``; per the commit
    description the URL and parsing logic are hardcoded in the plugin class.
    """
    plugin = RusnebPlugin(
        plugin_id="rusneb",
        name="НЭБ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=True,
        timeout=_TIMEOUT,
        config={},
    )
    results = plugin.search("Война и мир Толстой")
    assert results, "НЭБ returned no results"
    # Every record must be attributed to this plugin's id.
    assert all(r["source"] == "rusneb" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
    assert _has_author(results, "толст"), f"authors={_authors(results)}"
    # Year may be missing (empty) but must otherwise be four digits.
    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
    # isbn and publisher are expected to be empty strings for this source.
    assert all(r["isbn"] == "" for r in results)
    assert all(r["publisher"] == "" for r in results)
|
||||
|
||||
|
||||
# ── Alib ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def test_alib_voina_i_mir() -> None:
    """Search Alib for "Война и мир Толстой" and check every returned record.

    AlibPlugin is constructed with an empty ``config``; per the commit
    description the URL and parsing logic are hardcoded in the plugin class.
    """
    plugin = AlibPlugin(
        plugin_id="alib_web",
        name="Alib (web)",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config={},
    )
    results = plugin.search("Война и мир Толстой")
    assert results, "Alib returned no results"
    # Every record must be attributed to this plugin's id.
    assert all(r["source"] == "alib_web" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
    assert _has_author(results, "толст"), f"authors={_authors(results)}"
    # Alib entries always include a publication year in the bibliographic text.
    # Hence the stricter check: an empty year is NOT accepted here.
    assert all(_YEAR_PAT.match(r["year"]) for r in results), f"years={_years(results)}"
    # isbn and publisher are expected to be empty strings for this source.
    assert all(r["isbn"] == "" for r in results)
    assert all(r["publisher"] == "" for r in results)
|
||||
|
||||
|
||||
# ── НЛР (SRU) ────────────────────────────────────────────────────────────────
|
||||
@@ -155,6 +173,9 @@ def test_nlr_voina_i_mir() -> None:
|
||||
assert results, "НЛР returned no results"
|
||||
assert all(r["source"] == "nlr" for r in results)
|
||||
assert _has_title(results, "война"), f"titles={_titles(results)}"
|
||||
assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
|
||||
assert all(r["isbn"] == "" for r in results)
|
||||
assert all(r["publisher"] == "" for r in results)
|
||||
|
||||
|
||||
# ── ШПИЛ ─────────────────────────────────────────────────────────────────────
|
||||
@@ -163,27 +184,19 @@ def test_nlr_voina_i_mir() -> None:
|
||||
|
||||
@pytest.mark.xfail(reason="shpl.ru IRBIS64 CGI endpoint no longer available (HTTP 404)", strict=False)
def test_shpl_voina_i_mir() -> None:
    """Search ШПИЛ for "Война и мир" and check every returned record.

    Marked xfail (non-strict) because the upstream endpoint currently
    returns HTTP 404; the test still passes if the endpoint comes back.
    ShplPlugin is constructed with an empty ``config``; per the commit
    description the endpoint parameters are hardcoded in the plugin class.
    """
    plugin = ShplPlugin(
        plugin_id="shpl",
        name="ШПИЛ",
        rate_limiter=_RL,
        rate_limit_seconds=0,
        auto_queue=False,
        timeout=_TIMEOUT,
        config={},
    )
    results = plugin.search("Война и мир")
    assert results, "ШПИЛ returned no results"
    # Every record must be attributed to this plugin's id.
    assert all(r["source"] == "shpl" for r in results)
    assert _has_title(results, "война"), f"titles={_titles(results)}"
    # Year may be missing (empty) but must otherwise be four digits.
    assert all(_valid_year(r["year"]) for r in results), f"years={_years(results)}"
    # isbn and publisher are expected to be empty strings for this source.
    assert all(r["isbn"] == "" for r in results)
    assert all(r["publisher"] == "" for r in results)
|
||||
|
||||
Reference in New Issue
Block a user