add APIs to parse data from SWB and Lehmanns
src/logic/lehmannsapi.py (new file, 280 lines)
@@ -0,0 +1,280 @@
from __future__ import annotations

import re
from dataclasses import dataclass, asdict, field
from typing import Optional, List, Iterable
from urllib.parse import urljoin, quote_plus

import httpx
from bs4 import BeautifulSoup

BASE = "https://www.lehmanns.de"
SEARCH_URL = "https://www.lehmanns.de/search/quick?mediatype_id=&q="


@dataclass
class LehmannsSearchResult:
    title: str
    url: str

    # Core fields from the listing card
    year: Optional[int] = None
    edition: Optional[int] = None
    publisher: Optional[str] = None
    isbn13: Optional[str] = None

    # Extras from the listing card
    description: Optional[str] = None
    authors: list[str] = field(default_factory=list)
    media_type: Optional[str] = None
    book_format: Optional[str] = None
    price_eur: Optional[float] = None
    currency: str = "EUR"
    image: Optional[str] = None

    # From detail page:
    pages: Optional[str] = None  # "<N> Seiten"
    buyable: bool = True  # set in enrich_pages (detail page)
    unavailable_hint: Optional[str] = None  # e.g. "Titel ist leider vergriffen; keine Neuauflage"

    def to_dict(self) -> dict:
        return asdict(self)


class LehmannsClient:
    """Scrapes quick-search results, then enriches (and filters) via product pages."""

    def __init__(self, timeout: float = 20.0):
        self.client = httpx.Client(
            headers={
                "User-Agent": (
                    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                    "(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
                ),
                "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            },
            timeout=timeout,
            follow_redirects=True,
        )

    def close(self):
        self.client.close()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        self.close()

    # ------------------- Search (listing) -------------------

    def build_search_url(self, title: str) -> str:
        # spaces -> '+'
        return SEARCH_URL + quote_plus(title)

    def search_by_title(self, title: str, limit: Optional[int] = None, strict: bool = False) -> List[LehmannsSearchResult]:
        """
        Parse the listing page, then enrich every hit via its detail page
        (enrich_pages adds 'pages' and the availability flags in place).
        With strict=True, keep only exact, case-insensitive title matches
        that are still buyable; otherwise apply 'limit' to the full list.
        """
        url = self.build_search_url(title)
        html = self._get(url)
        if not html:
            return []
        results = self._parse_results(html)
        self.enrich_pages(results)
        if strict:
            # filter results to only those with exact title match (case-insensitive)
            title_lower = title.lower()
            results = [r for r in results if r.title and r.title.lower() == title_lower]
            results = [r for r in results if r.buyable]
            return results
        if limit is not None:
            results = results[:max(0, limit)]
        return results

    # ------------------- Detail enrichment & filtering -------------------

    def enrich_pages(self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True) -> List[LehmannsSearchResult]:
        """
        Fetch each result.url, extract:
        - pages: from <span class="book-meta meta-seiten" itemprop="numberOfPages">...</span>
        - availability: from <li class="availability-3">...</li>
          * if it contains "Titel ist leider vergriffen", mark buyable=False
          * if it also contains "keine Neuauflage", set unavailable_hint accordingly
        If drop_unbuyable=True, exclude non-buyable results from the returned list.
        """
        enriched: List[LehmannsSearchResult] = []
        for r in results:
            try:
                html = self._get(r.url)
                if not html:
                    # Can't verify; keep as-is when not dropping, else skip
                    if not drop_unbuyable:
                        enriched.append(r)
                    continue

                soup = BeautifulSoup(html, "html.parser")

                # Pages
                pages_node = soup.select_one(
                    "span.book-meta.meta-seiten[itemprop='numberOfPages'], "
                    "span.book-meta.meta-seiten[itemprop='numberofpages'], "
                    ".meta-seiten [itemprop='numberOfPages'], "
                    ".meta-seiten[itemprop='numberOfPages'], "
                    ".book-meta.meta-seiten"
                )
                if pages_node:
                    text = pages_node.get_text(" ", strip=True)
                    m = re.search(r"\d+", text)
                    if m:
                        r.pages = f"{m.group(0)} Seiten"

                # Availability via li.availability-3
                avail_li = soup.select_one("li.availability-3")
                if avail_li:
                    avail_text = " ".join(avail_li.get_text(" ", strip=True).split()).lower()
                    if "titel ist leider vergriffen" in avail_text:
                        r.buyable = False
                        if "keine neuauflage" in avail_text:
                            r.unavailable_hint = "Titel ist leider vergriffen; keine Neuauflage"
                        else:
                            r.unavailable_hint = "Titel ist leider vergriffen"

                # Append or drop
                if (not drop_unbuyable) or r.buyable:
                    enriched.append(r)

            except Exception:
                # On any per-item error, keep the record if not dropping; else skip
                if not drop_unbuyable:
                    enriched.append(r)
                continue

        return enriched

    # ------------------- Internals -------------------

    def _get(self, url: str) -> Optional[str]:
        try:
            r = self.client.get(url)
            r.encoding = "utf-8"
            if r.status_code == 200 and "text/html" in (r.headers.get("content-type") or ""):
                return r.text
        except httpx.HTTPError:
            pass
        return None

    def _parse_results(self, html: str) -> List[LehmannsSearchResult]:
        soup = BeautifulSoup(html, "html.parser")
        results: list[LehmannsSearchResult] = []

        for block in soup.select("div.info-block"):
            a = block.select_one(".title a[href]")
            if not a:
                continue
            url = urljoin(BASE, a["href"].strip())
            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(strip=True)

            # Alternative headline => extend title
            alt_tag = block.select_one(".description[itemprop='alternativeHeadline']")
            alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
            title = f"{base_title} : {alternative_headline}" if alternative_headline else base_title
            description = alternative_headline

            # Authors from .author
            authors: list[str] = []
            author_div = block.select_one("div.author")
            if author_div:
                t = author_div.get_text(" ", strip=True)
                t = re.sub(r"^\s*von\s+", "", t, flags=re.I)
                for part in re.split(r"\s*;\s*|\s*&\s*|\s+und\s+", t):
                    name = " ".join(part.split())
                    if name:
                        authors.append(name)

            # Media + format
            media_type = None
            book_format = None
            type_text = block.select_one(".type")
            if type_text:
                t = type_text.get_text(" ", strip=True)
                m = re.search(r"\b(Buch|eBook|Hörbuch)\b", t)
                if m:
                    media_type = m.group(1)
                fm = re.search(r"\(([^)]+)\)", t)
                if fm:
                    book_format = fm.group(1).strip().upper()

            # Year
            year = None
            y = block.select_one("[itemprop='copyrightYear']")
            if y:
                try:
                    year = int(y.get_text(strip=True))
                except ValueError:
                    pass

            # Edition
            edition = None
            ed = block.select_one("[itemprop='bookEdition']")
            if ed:
                m = re.search(r"\d+", ed.get_text(strip=True))
                if m:
                    edition = int(m.group())

            # Publisher
            publisher = None
            pub = block.select_one(".publisherprop [itemprop='name']") or block.select_one(".publisher [itemprop='name']")
            if pub:
                publisher = pub.get_text(strip=True)

            # ISBN-13
            isbn13 = None
            isbn_tag = block.select_one(".isbn [itemprop='isbn'], [itemprop='isbn']")
            if isbn_tag:
                digits = re.sub(r"[^0-9Xx]", "", isbn_tag.get_text(strip=True))
                m = re.search(r"(97[89]\d{10})", digits)
                if m:
                    isbn13 = m.group(1)

            # Price (best effort)
            price_eur = None
            txt = block.get_text(" ", strip=True)
            mprice = re.search(r"(\d{1,3}(?:\.\d{3})*,\d{2})\s*€", txt)
            if not mprice and block.parent:
                sib = block.parent.get_text(" ", strip=True)
                mprice = re.search(r"(\d{1,3}(?:\.\d{3})*,\d{2})\s*€", sib)
            if mprice:
                num = mprice.group(1).replace(".", "").replace(",", ".")
                try:
                    price_eur = float(num)
                except ValueError:
                    pass

            # Image (best-effort)
            image = None
            left_img = block.find_previous("img")
            if left_img and left_img.get("src"):
                image = urljoin(BASE, left_img["src"])

            results.append(
                LehmannsSearchResult(
                    title=title,
                    url=url,
                    description=description,
                    authors=authors,
                    media_type=media_type,
                    book_format=book_format,
                    year=year,
                    edition=edition,
                    publisher=publisher,
                    isbn13=isbn13,
                    price_eur=price_eur,
                    image=image,
                )
            )

        return results
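A minimal usage sketch for the new Lehmanns client; the search title and the printed fields are illustrative only, not part of this commit:

from src.logic.lehmannsapi import LehmannsClient

# Illustrative lookup: search Lehmanns for a title and print the enriched hits.
with LehmannsClient() as client:
    hits = client.search_by_title("Clean Code", limit=5)
    for hit in hits:
        print(hit.title, hit.isbn13, hit.pages, hit.price_eur)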
src/logic/swb.py (new file, 448 lines)
@@ -0,0 +1,448 @@
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from typing import Dict, Iterable, List, Optional, Tuple

import requests

from src.logic.dataclass import BookData

# -----------------------
# Dataclasses
# -----------------------


# --- MARC XML structures ---
@dataclass
class ControlField:
    tag: str
    value: str


@dataclass
class SubField:
    code: str
    value: str


@dataclass
class DataField:
    tag: str
    ind1: str = " "
    ind2: str = " "
    subfields: List[SubField] = field(default_factory=list)


@dataclass
class MarcRecord:
    leader: str
    controlfields: List[ControlField] = field(default_factory=list)
    datafields: List[DataField] = field(default_factory=list)


# --- SRU record wrapper ---
@dataclass
class Record:
    recordSchema: str
    recordPacking: str
    recordData: MarcRecord
    recordPosition: int


@dataclass
class EchoedSearchRequest:
    version: str
    query: str
    maximumRecords: int
    recordPacking: str
    recordSchema: str


@dataclass
class SearchRetrieveResponse:
    version: str
    numberOfRecords: int
    records: List[Record] = field(default_factory=list)
    echoedSearchRetrieveRequest: Optional[EchoedSearchRequest] = None


# -----------------------
# Parser
# -----------------------

ZS = "http://www.loc.gov/zing/srw/"
MARC = "http://www.loc.gov/MARC21/slim"
NS = {"zs": ZS, "marc": MARC}

def _text(elem: Optional[ET.Element]) -> str:
    return (elem.text or "") if elem is not None else ""


def _req_text(parent: ET.Element, path: str) -> str:
    el = parent.find(path, NS)
    if el is None or el.text is None:
        raise ValueError(f"Required element not found or empty: {path}")
    return el.text


def parse_marc_record(record_el: ET.Element) -> MarcRecord:
    """
    record_el is the <marc:record> element (default ns MARC in your sample)
    """
    # leader
    leader_text = _req_text(record_el, "marc:leader")

    # controlfields
    controlfields: List[ControlField] = []
    for cf in record_el.findall("marc:controlfield", NS):
        tag = cf.get("tag", "").strip()
        controlfields.append(ControlField(tag=tag, value=_text(cf)))

    # datafields
    datafields: List[DataField] = []
    for df in record_el.findall("marc:datafield", NS):
        tag = df.get("tag", "").strip()
        ind1 = df.get("ind1") or " "
        ind2 = df.get("ind2") or " "
        subfields: List[SubField] = []
        for sf in df.findall("marc:subfield", NS):
            code = sf.get("code", "")
            subfields.append(SubField(code=code, value=_text(sf)))
        datafields.append(DataField(tag=tag, ind1=ind1, ind2=ind2, subfields=subfields))

    return MarcRecord(
        leader=leader_text, controlfields=controlfields, datafields=datafields
    )


def parse_record(zs_record_el: ET.Element) -> Record:
    recordSchema = _req_text(zs_record_el, "zs:recordSchema")
    recordPacking = _req_text(zs_record_el, "zs:recordPacking")

    # recordData contains a MARC <record> with default MARC namespace in your sample
    recordData_el = zs_record_el.find("zs:recordData", NS)
    if recordData_el is None:
        raise ValueError("Missing zs:recordData")

    marc_record_el = recordData_el.find("marc:record", NS)
    if marc_record_el is None:
        # If the MARC record uses default ns (xmlns="...") ElementTree still needs the ns-qualified name
        # We already searched with prefix; this covers both default and prefixed cases.
        raise ValueError("Missing MARC21 record inside zs:recordData")

    marc_record = parse_marc_record(marc_record_el)

    recordPosition = int(_req_text(zs_record_el, "zs:recordPosition"))
    return Record(
        recordSchema=recordSchema,
        recordPacking=recordPacking,
        recordData=marc_record,
        recordPosition=recordPosition,
    )

def parse_echoed_request(root: ET.Element) -> Optional[EchoedSearchRequest]:
    el = root.find("zs:echoedSearchRetrieveRequest", NS)
    if el is None:
        return None

    # Be permissive with missing fields
    version = _text(el.find("zs:version", NS))
    query = _text(el.find("zs:query", NS))
    maximumRecords_text = _text(el.find("zs:maximumRecords", NS)) or "0"
    recordPacking = _text(el.find("zs:recordPacking", NS))
    recordSchema = _text(el.find("zs:recordSchema", NS))

    try:
        maximumRecords = int(maximumRecords_text)
    except ValueError:
        maximumRecords = 0

    return EchoedSearchRequest(
        version=version,
        query=query,
        maximumRecords=maximumRecords,
        recordPacking=recordPacking,
        recordSchema=recordSchema,
    )


def parse_search_retrieve_response(xml_str: str) -> SearchRetrieveResponse:
    root = ET.fromstring(xml_str)

    # Root is zs:searchRetrieveResponse
    version = _req_text(root, "zs:version")
    numberOfRecords = int(_req_text(root, "zs:numberOfRecords"))

    records_parent = root.find("zs:records", NS)
    records: List[Record] = []
    if records_parent is not None:
        for r in records_parent.findall("zs:record", NS):
            records.append(parse_record(r))

    echoed = parse_echoed_request(root)

    return SearchRetrieveResponse(
        version=version,
        numberOfRecords=numberOfRecords,
        records=records,
        echoedSearchRetrieveRequest=echoed,
    )

# --- Query helpers over MarcRecord ---


def iter_datafields(
    rec: MarcRecord,
    tag: Optional[str] = None,
    ind1: Optional[str] = None,
    ind2: Optional[str] = None,
) -> Iterable[DataField]:
    """Yield datafields, optionally filtered by tag/indicators."""
    for df in rec.datafields:
        if tag is not None and df.tag != tag:
            continue
        if ind1 is not None and df.ind1 != ind1:
            continue
        if ind2 is not None and df.ind2 != ind2:
            continue
        yield df


def subfield_values(
    rec: MarcRecord,
    tag: str,
    code: str,
    *,
    ind1: Optional[str] = None,
    ind2: Optional[str] = None,
) -> List[str]:
    """All values for subfield `code` in every `tag` field (respecting indicators)."""
    out: List[str] = []
    for df in iter_datafields(rec, tag, ind1, ind2):
        out.extend(sf.value for sf in df.subfields if sf.code == code)
    return out


def first_subfield_value(
    rec: MarcRecord,
    tag: str,
    code: str,
    *,
    ind1: Optional[str] = None,
    ind2: Optional[str] = None,
    default: Optional[str] = None,
) -> Optional[str]:
    """First value for subfield `code` in `tag` (respecting indicators)."""
    for df in iter_datafields(rec, tag, ind1, ind2):
        for sf in df.subfields:
            if sf.code == code:
                return sf.value
    return default


def find_datafields_with_subfields(
    rec: MarcRecord,
    tag: str,
    *,
    where_all: Optional[Dict[str, str]] = None,
    where_any: Optional[Dict[str, str]] = None,
    casefold: bool = False,
    ind1: Optional[str] = None,
    ind2: Optional[str] = None,
) -> List[DataField]:
    """
    Return datafields of `tag` whose subfields match constraints:
    - where_all: every (code -> exact value) must be present
    - where_any: at least one (code -> exact value) present
    Set `casefold=True` for case-insensitive comparison.
    """
    where_all = where_all or {}
    where_any = where_any or {}
    matched: List[DataField] = []

    for df in iter_datafields(rec, tag, ind1, ind2):
        # Map code -> list of values (with optional casefold applied)
        vals: Dict[str, List[str]] = {}
        for sf in df.subfields:
            v = sf.value.casefold() if casefold else sf.value
            vals.setdefault(sf.code, []).append(v)

        ok = True
        for c, v in where_all.items():
            vv = v.casefold() if casefold else v
            if c not in vals or vv not in vals[c]:
                ok = False
                break

        if ok and where_any:
            any_ok = any(
                (c in vals) and ((v.casefold() if casefold else v) in vals[c])
                for c, v in where_any.items()
            )
            if not any_ok:
                ok = False

        if ok:
            matched.append(df)

    return matched


def controlfield_value(
    rec: MarcRecord, tag: str, default: Optional[str] = None
) -> Optional[str]:
    """Get the first controlfield value by tag (e.g., '001', '005')."""
    for cf in rec.controlfields:
        if cf.tag == tag:
            return cf.value
    return default


def datafields_value(
    data: List[DataField], code: str, default: Optional[str] = None
) -> Optional[str]:
    """Get the first value for a specific subfield code in a list of datafields."""
    for df in data:
        for sf in df.subfields:
            if sf.code == code:
                return sf.value
    return default


def datafield_value(
    df: DataField, code: str, default: Optional[str] = None
) -> Optional[str]:
    """Get the first value for a specific subfield code in a datafield."""
    for sf in df.subfields:
        if sf.code == code:
            return sf.value
    return default

def _smart_join_title(a: str, b: Optional[str]) -> str:
    """
    Join 245 $a and $b with MARC-style punctuation.
    If $b is present, join with ' : ' unless either side already supplies punctuation.
    """
    a = a.strip()
    if not b:
        return a
    b = b.strip()
    if a.endswith((":", ";", "/")) or b.startswith((":", ";", "/")):
        return f"{a} {b}"
    return f"{a} : {b}"


def subfield_values_from_fields(
    fields: Iterable[DataField],
    code: str,
) -> List[str]:
    """All subfield values with given `code` across a list of DataField."""
    return [sf.value for df in fields for sf in df.subfields if sf.code == code]


def first_subfield_value_from_fields(
    fields: Iterable[DataField],
    code: str,
    default: Optional[str] = None,
) -> Optional[str]:
    """First subfield value with given `code` across a list of DataField."""
    for df in fields:
        for sf in df.subfields:
            if sf.code == code:
                return sf.value
    return default


def subfield_value_pairs_from_fields(
    fields: Iterable[DataField],
    code: str,
) -> List[Tuple[DataField, str]]:
    """
    Return (DataField, value) pairs for all subfields with `code`.
    Useful if you need to know which field a value came from.
    """
    out: List[Tuple[DataField, str]] = []
    for df in fields:
        for sf in df.subfields:
            if sf.code == code:
                out.append((df, sf.value))
    return out

def book_from_marc(rec: MarcRecord) -> BookData:
    # PPN from controlfield 001
    ppn = controlfield_value(rec, "001")

    # Title = 245 $a + 245 $b (if present)
    t_a = first_subfield_value(rec, "245", "a")
    t_b = first_subfield_value(rec, "245", "b")
    title = _smart_join_title(t_a, t_b) if t_a else None

    # Signature = 924 where $9 == "Frei 129" → take that field's $g
    frei_fields = find_datafields_with_subfields(
        rec, "924", where_all={"9": "Frei 129"}
    )
    signature = first_subfield_value_from_fields(frei_fields, "g")

    # Year = 264 $c (prefer ind2="1" publication; fallback to any 264)
    year = first_subfield_value(rec, "264", "c", ind2="1") or first_subfield_value(
        rec, "264", "c"
    )
    isbn = subfield_values(rec, "020", "a")

    return BookData(
        ppn=ppn,
        title=title,
        signature=signature,
        edition=first_subfield_value(rec, "250", "a"),
        year=year,
        pages=first_subfield_value(rec, "300", "a"),
        publisher=first_subfield_value(rec, "264", "b"),
        isbn=isbn,
    )

class SWB:
    def __init__(self):
        self.url = "https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query={}&maximumRecords=10&recordSchema=marcxml"
        self.bib_id = 20735

    def get(self, query_args: Iterable[str]) -> List[Record]:
        # drop query clauses that have no value (i.e. args ending in "=")
        query_args = [arg for arg in query_args if not arg.endswith("=")]
        query = "+and+".join(query_args)
        query = query.replace(" ", "%20").replace("&", "%26")

        url = self.url.format(query)

        print("Fetching from SWB:", url)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "Accept": "application/xml",
            "Accept-Charset": "latin1,utf-8;q=0.7,*;q=0.3",
        }
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            raise Exception(f"Error fetching data from SWB: {response.status_code}")
        # print(response.text)
        data = response.content

        # parse the top-level SRU response and return its records
        parsed = parse_search_retrieve_response(data)
        return parsed.records

    def getBooks(self, query_args: List[str]) -> List[BookData]:
        records: List[Record] = self.get(query_args)
        books: List[BookData] = []
        # the second clause is expected to be the title term ("<index>=<title>")
        title = query_args[1].split("=")[1]
        # print(len(records), "records found")
        for rec in records:
            book = book_from_marc(rec.recordData)
            books.append(book)
        books = [
            b for b in books if b.title and b.title.lower().startswith(title.lower())
        ]
        return books
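A minimal usage sketch for the SWB client; the SRU index names (pica.per, pica.tit) are assumptions for illustration, and getBooks expects the title clause as the second argument:

from src.logic.swb import SWB

# Illustrative query: author clause first, title clause second, because
# getBooks reads the filter title from query_args[1].
swb = SWB()
books = swb.getBooks(["pica.per=Martin", "pica.tit=Clean Code"])
for book in books:
    print(book.ppn, book.title, book.year, book.isbn)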