chore: restructured project, updated readme

2025-10-29 09:31:40 +01:00
parent a4460ec17b
commit ee62c65ae7
70 changed files with 8518 additions and 100 deletions

src/services/lehmanns.py Normal file

@@ -0,0 +1,312 @@
from __future__ import annotations
import re
from dataclasses import asdict, dataclass, field
from typing import Iterable, List, Optional
from urllib.parse import quote_plus, urljoin
import httpx
from bs4 import BeautifulSoup
from src.core.models import BookData

BASE = "https://www.lehmanns.de"
SEARCH_URL = "https://www.lehmanns.de/search/quick?mediatype_id=&q="


@dataclass
class LehmannsSearchResult:
title: str
url: str
# Core fields from the listing card
year: Optional[int] = None
edition: Optional[int] = None
publisher: Optional[str] = None
isbn13: Optional[str] = None
# Extras from the listing card
description: Optional[str] = None
authors: list[str] = field(default_factory=list)
media_type: Optional[str] = None
book_format: Optional[str] = None
price_eur: Optional[float] = None
currency: str = "EUR"
image: Optional[str] = None
# From detail page:
pages: Optional[str] = None # "<N> Seiten"
buyable: bool = True # set in enrich_pages (detail page)
unavailable_hint: Optional[str] = (
None # e.g. "Titel ist leider vergriffen; keine Neuauflage"
)
def to_dict(self) -> dict:
return asdict(self)
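

# Illustrative shape of a parsed result (field values are made up, not taken
# from a live Lehmanns page):
#   LehmannsSearchResult(
#       title="Clean Code : A Handbook of Agile Software Craftsmanship",
#       url="https://www.lehmanns.de/shop/...",
#       year=2008, edition=1, publisher="Pearson", isbn13="9780132350884",
#   ).to_dict()  # -> plain dict (via asdict), e.g. for JSON serialisation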
class LehmannsClient:
"""Scrapes quick-search results, then enriches (and filters) via product pages."""
def __init__(self, timeout: float = 20.0):
self.client = httpx.Client(
headers={
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
),
"Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
},
timeout=timeout,
follow_redirects=True,
)
def close(self):
self.client.close()
def __enter__(self):
return self
def __exit__(self, *exc):
self.close()

    # ------------------- Search (listing) -------------------
def build_search_url(self, title: str) -> str:
# spaces -> '+'
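        # e.g. "Python Kochbuch" ->
        #   "https://www.lehmanns.de/search/quick?mediatype_id=&q=Python+Kochbuch"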
return SEARCH_URL + quote_plus(title)
def search_by_title(
self,
title: str,
limit: Optional[int] = None,
strict: bool = False,
only_latest: bool = True,
) -> List[BookData]:
"""
Parse the listing page only (no availability check here).
Use enrich_pages(...) afterwards to fetch detail pages, add 'pages',
and drop unbuyable items.
"""
url = self.build_search_url(title=title)
html = self._get(url)
if not html:
return []
results = self._parse_results(html)
        # enrich_pages returns the filtered list; assign it so drop_unbuyable
        # takes effect (the in-place 'pages' edits alone would not filter)
        results = self.enrich_pages(results)
results = [BookData().from_LehmannsSearchResult(r) for r in results]
        if strict:
            # keep only exact title matches (case-insensitive); limit and
            # only_latest below still apply to the filtered list
            title_lower = title.lower()
            results = [
                r for r in results if r.title and r.title.lower() == title_lower
            ]
if limit is not None:
results = results[: max(0, limit)]
if only_latest and len(results) > 1:
# keep only the latest edition (highest edition number)
results.sort(key=lambda r: (r.edition_number or 0), reverse=True)
results = [results[0]]
return results

    # ------------------- Detail enrichment & filtering -------------------
def enrich_pages(
self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True
) -> List[LehmannsSearchResult]:
"""
Fetch each result.url, extract:
- pages: from <span class="book-meta meta-seiten" itemprop="numberOfPages">...</span>
- availability: from <li class="availability-3">...</li>
* if it contains "Titel ist leider vergriffen", mark buyable=False
* if it also contains "keine Neuauflage", set unavailable_hint accordingly
If drop_unbuyable=True, exclude non-buyable results from the returned list.
"""
enriched: List[LehmannsSearchResult] = []
for r in results:
try:
html = self._get(r.url)
if not html:
# Can't verify; keep as-is when not dropping, else skip
if not drop_unbuyable:
enriched.append(r)
continue
soup = BeautifulSoup(html, "html.parser") # type: ignore
# Pages
pages_node = soup.select_one( # type: ignore
"span.book-meta.meta-seiten[itemprop='numberOfPages'], "
"span.book-meta.meta-seiten[itemprop='numberofpages'], "
".meta-seiten [itemprop='numberOfPages'], "
".meta-seiten[itemprop='numberOfPages'], "
".book-meta.meta-seiten"
)
if pages_node:
text = pages_node.get_text(" ", strip=True)
m = re.search(r"\d+", text)
if m:
r.pages = f"{m.group(0)} Seiten"
# Availability via li.availability-3
avail_li = soup.select_one("li.availability-3") # type: ignore
if avail_li:
avail_text = " ".join(
avail_li.get_text(" ", strip=True).split()
).lower()
if "titel ist leider vergriffen" in avail_text:
r.buyable = False
if "keine neuauflage" in avail_text:
r.unavailable_hint = (
"Titel ist leider vergriffen; keine Neuauflage"
)
else:
r.unavailable_hint = "Titel ist leider vergriffen"
# Append or drop
if (not drop_unbuyable) or r.buyable:
enriched.append(r)
except Exception:
# On any per-item error, keep the record if not dropping; else skip
if not drop_unbuyable:
enriched.append(r)
continue
return enriched
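
    # A sketch of calling enrich_pages directly, e.g. to keep unbuyable hits
    # around for diagnostics ('raw_results' is a hypothetical name):
    #   kept = client.enrich_pages(raw_results, drop_unbuyable=False)
    #   unavailable = [r for r in kept if not r.buyable]
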
# ------------------- Internals -------------------
def _get(self, url: str) -> Optional[str]:
try:
r = self.client.get(url)
r.encoding = "utf-8"
if r.status_code == 200 and "text/html" in (
r.headers.get("content-type") or ""
):
return r.text
except httpx.HTTPError:
pass
return None
def _parse_results(self, html: str) -> List[LehmannsSearchResult]:
soup = BeautifulSoup(html, "html.parser")
results: list[LehmannsSearchResult] = []
for block in soup.select("div.info-block"):
a = block.select_one(".title a[href]")
if not a:
continue
url = urljoin(BASE, a["href"].strip())
base_title = (block.select_one(".title [itemprop='name']") or a).get_text( # type: ignore
strip=True
)
# Alternative headline => extend title
alt_tag = block.select_one(".description[itemprop='alternativeHeadline']") # type: ignore
alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
title = (
f"{base_title} : {alternative_headline}"
if alternative_headline
else base_title
)
description = alternative_headline
# Authors from .author
authors: list[str] = []
author_div = block.select_one("div.author") # type: ignore
if author_div:
t = author_div.get_text(" ", strip=True)
t = re.sub(r"^\s*von\s+", "", t, flags=re.I)
for part in re.split(r"\s*;\s*|\s*&\s*|\s+und\s+", t):
name = " ".join(part.split())
if name:
authors.append(name)
# Media + format
media_type = None
book_format = None
type_text = block.select_one(".type") # type: ignore
if type_text:
t = type_text.get_text(" ", strip=True)
m = re.search(r"\b(Buch|eBook|Hörbuch)\b", t)
if m:
media_type = m.group(1)
fm = re.search(r"\(([^)]+)\)", t)
if fm:
book_format = fm.group(1).strip().upper()
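                # e.g. "Buch (Hardcover)" would yield media_type="Buch" and
                # book_format="HARDCOVER" (illustrative; labels vary by listing)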
# Year
year = None
y = block.select_one("[itemprop='copyrightYear']") # type: ignore
if y:
try:
year = int(y.get_text(strip=True))
except ValueError:
pass
# Edition
edition = None
ed = block.select_one("[itemprop='bookEdition']") # type: ignore
if ed:
m = re.search(r"\d+", ed.get_text(strip=True))
if m:
edition = int(m.group())
# Publisher
publisher = None
pub = block.select_one( # type: ignore
".publisherprop [itemprop='name']"
) or block.select_one(".publisher [itemprop='name']") # type: ignore
if pub:
publisher = pub.get_text(strip=True)
# ISBN-13
isbn13 = None
isbn_tag = block.select_one(".isbn [itemprop='isbn'], [itemprop='isbn']") # type: ignore
if isbn_tag:
digits = re.sub(r"[^0-9Xx]", "", isbn_tag.get_text(strip=True))
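                # an ISBN-13 always starts with EAN prefix 978 or 979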
m = re.search(r"(97[89]\d{10})", digits)
if m:
isbn13 = m.group(1)
# Price (best effort)
price_eur = None
txt = block.get_text(" ", strip=True)
mprice = re.search(r"(\d{1,3}(?:\.\d{3})*,\d{2})\s*€", txt)
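            # German price formatting: "1.234,56 €" -> group(1) "1.234,56";
            # the separators are normalised below before float()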
if not mprice and block.parent:
sib = block.parent.get_text(" ", strip=True)
mprice = re.search(r"(\d{1,3}(?:\.\d{3})*,\d{2})\s*€", sib)
if mprice:
num = mprice.group(1).replace(".", "").replace(",", ".")
try:
price_eur = float(num)
except ValueError:
pass
# Image (best-effort)
image = None
left_img = block.find_previous("img") # type: ignore
if left_img and left_img.get("src"):
image = urljoin(BASE, left_img["src"])
results.append(
LehmannsSearchResult(
title=title,
url=url,
description=description,
authors=authors,
media_type=media_type,
book_format=book_format,
year=year,
edition=edition,
publisher=publisher,
isbn13=isbn13,
price_eur=price_eur,
image=image,
)
)
return results
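

# Minimal usage sketch (talks to the live site; results depend on current
# listings, and BookData's printable form is defined in src.core.models):
if __name__ == "__main__":
    with LehmannsClient() as client:
        for book in client.search_by_title("Clean Code", limit=3, only_latest=False):
            print(book)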