minor and major reworks: rename swb to SRU, add a test for PDF parsing (see the test sketch below)

major: rework mail to send messages as plaintext instead of HTML, preventing HTML markup from bleeding into the message body (see the mail sketch below)
2025-10-07 14:15:10 +02:00
parent 0df7fd9fe6
commit 06965db26a
25 changed files with 1174 additions and 303 deletions
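
The diff for the new PDF-parsing test is not part of the excerpt below. A minimal pytest sketch of what such a test could look like; the module path src.logic.pdf, the function parse_pdf, and the fixture sample.pdf are assumptions, not names taken from this commit:

import pytest
from pathlib import Path

# hypothetical import path; the real parser module is not shown in this commit
from src.logic.pdf import parse_pdf

# assumed fixture location next to the test file
SAMPLE = Path(__file__).parent / "fixtures" / "sample.pdf"

@pytest.mark.skipif(not SAMPLE.exists(), reason="sample PDF fixture missing")
def test_parse_pdf_extracts_text():
    # a known-good sample should yield non-empty text
    text = parse_pdf(SAMPLE)
    assert isinstance(text, str)
    assert text.strip()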
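
The mail-sending diff is likewise not shown here. With Python's standard library, the key point of the rework is that EmailMessage.set_content defaults to text/plain, so HTML in the body is transmitted literally rather than rendered. A minimal sketch, with addresses and the SMTP host as placeholder assumptions:

import smtplib
from email.message import EmailMessage

msg = EmailMessage()
msg["Subject"] = "Suchergebnis"            # placeholder subject
msg["From"] = "noreply@example.org"        # placeholder sender
msg["To"] = "empfaenger@example.org"       # placeholder recipient
# set_content without a subtype produces text/plain, so any HTML in the
# body stays literal text instead of bleeding into the rendered message
msg.set_content("Titel: Beispielbuch\nSeiten: 320\n")

with smtplib.SMTP("localhost") as smtp:    # assumed local relay
    smtp.send_message(msg)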


@@ -1,13 +1,15 @@
 from __future__ import annotations
 
 import re
-from dataclasses import dataclass, asdict, field
-from typing import Optional, List, Iterable
-from urllib.parse import urljoin, quote_plus
+from dataclasses import asdict, dataclass, field
+from typing import Iterable, List, Optional
+from urllib.parse import quote_plus, urljoin
 
 import httpx
 from bs4 import BeautifulSoup
+
+from src.logic.dataclass import BookData
 
 BASE = "https://www.lehmanns.de"
 SEARCH_URL = "https://www.lehmanns.de/search/quick?mediatype_id=&q="
@@ -33,9 +35,11 @@ class LehmannsSearchResult:
     image: Optional[str] = None
 
     # From detail page:
-    pages: Optional[str] = None # "<N> Seiten"
-    buyable: bool = True # set in enrich_pages (detail page)
-    unavailable_hint: Optional[str] = None # e.g. "Titel ist leider vergriffen; keine Neuauflage"
+    pages: Optional[str] = None  # "<N> Seiten"
+    buyable: bool = True  # set in enrich_pages (detail page)
+    unavailable_hint: Optional[str] = (
+        None  # e.g. "Titel ist leider vergriffen; keine Neuauflage"
+    )
 
     def to_dict(self) -> dict:
         return asdict(self)
@@ -73,31 +77,45 @@ class LehmannsClient:
         # spaces -> '+'
         return SEARCH_URL + quote_plus(title)
 
-    def search_by_title(self, title: str, limit: Optional[int] = None, strict: bool = False) -> List[LehmannsSearchResult]:
+    def search_by_title(
+        self,
+        title: str,
+        limit: Optional[int] = None,
+        strict: bool = False,
+        only_latest: bool = True,
+    ) -> List[BookData]:
         """
         Parse the listing page only (no availability check here).
         Use enrich_pages(...) afterwards to fetch detail pages, add 'pages',
         and drop unbuyable items.
         """
-        url = self.build_search_url(title)
+        url = self.build_search_url(title=title)
         html = self._get(url)
         if not html:
             return []
 
         results = self._parse_results(html)
+        self.enrich_pages(results)
+        results = [BookData().from_LehmannsSearchResult(r) for r in results]
         if strict:
             # filter results to only those with exact title match (case-insensitive)
             title_lower = title.lower()
             results = [r for r in results if r.title and r.title.lower() == title_lower]
-            results = [r for r in results if r.buyable]
+            # results = [r for r in results if r.buyable]
             return results
 
         if limit is not None:
-            results = results[:max(0, limit)]
+            results = results[: max(0, limit)]
+        if only_latest and len(results) > 1:
+            # keep only the latest edition (highest edition number)
+            results.sort(key=lambda r: (r.edition_number or 0), reverse=True)
+            results = [results[0]]
         return results
 
     # ------------------- Detail enrichment & filtering -------------------
-    def enrich_pages(self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True) -> List[LehmannsSearchResult]:
+    def enrich_pages(
+        self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True
+    ) -> List[LehmannsSearchResult]:
         """
         Fetch each result.url, extract:
           - pages: from <span class="book-meta meta-seiten" itemprop="numberOfPages">...</span>
@@ -135,11 +153,15 @@ class LehmannsClient:
                 # Availability via li.availability-3
                 avail_li = soup.select_one("li.availability-3")
                 if avail_li:
-                    avail_text = " ".join(avail_li.get_text(" ", strip=True).split()).lower()
+                    avail_text = " ".join(
+                        avail_li.get_text(" ", strip=True).split()
+                    ).lower()
                     if "titel ist leider vergriffen" in avail_text:
                         r.buyable = False
                         if "keine neuauflage" in avail_text:
-                            r.unavailable_hint = "Titel ist leider vergriffen; keine Neuauflage"
+                            r.unavailable_hint = (
+                                "Titel ist leider vergriffen; keine Neuauflage"
+                            )
                         else:
                             r.unavailable_hint = "Titel ist leider vergriffen"
@@ -161,7 +183,9 @@ class LehmannsClient:
         try:
             r = self.client.get(url)
             r.encoding = "utf-8"
-            if r.status_code == 200 and "text/html" in (r.headers.get("content-type") or ""):
+            if r.status_code == 200 and "text/html" in (
+                r.headers.get("content-type") or ""
+            ):
                 return r.text
         except httpx.HTTPError:
             pass
@@ -176,12 +200,18 @@ class LehmannsClient:
             if not a:
                 continue
             url = urljoin(BASE, a["href"].strip())
-            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(strip=True)
+            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(
+                strip=True
+            )
 
             # Alternative headline => extend title
             alt_tag = block.select_one(".description[itemprop='alternativeHeadline']")
             alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
-            title = f"{base_title} : {alternative_headline}" if alternative_headline else base_title
+            title = (
+                f"{base_title} : {alternative_headline}"
+                if alternative_headline
+                else base_title
+            )
             description = alternative_headline
 
             # Authors from .author
@@ -227,7 +257,9 @@ class LehmannsClient:
             # Publisher
             publisher = None
-            pub = block.select_one(".publisherprop [itemprop='name']") or block.select_one(".publisher [itemprop='name']")
+            pub = block.select_one(
+                ".publisherprop [itemprop='name']"
+            ) or block.select_one(".publisher [itemprop='name']")
             if pub:
                 publisher = pub.get_text(strip=True)
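
For reference, a usage sketch of the reworked search_by_title, which now enriches results itself and returns BookData objects; the no-argument LehmannsClient() construction and the sample title are assumptions, since the constructor is not part of this diff:

client = LehmannsClient()
# strict=True keeps only exact (case-insensitive) title matches;
# only_latest=True (the default) keeps only the highest edition number
books = client.search_by_title("Innere Medizin", strict=True)
for book in books:
    print(book)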