minor and major reworks: rename swb to SRU, add a test for PDF parsing (see the test sketch below)

major: rework mail to send messages as plaintext instead of HTML, preventing HTML markup from bleeding into the message body (see the mail sketch below)
2025-10-07 14:15:10 +02:00
parent 0df7fd9fe6
commit 06965db26a
25 changed files with 1174 additions and 303 deletions
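
The diff for the new PDF-parsing test is not part of the excerpt below. A minimal pytest sketch of what such a test could look like; the module path src.logic.pdf, the function parse_pdf, and the fixture sample.pdf are assumptions, not names taken from this commit:

import pytest
from pathlib import Path

# hypothetical import path; the real parser module is not shown in this commit
from src.logic.pdf import parse_pdf

# assumed fixture location next to the test file
SAMPLE = Path(__file__).parent / "fixtures" / "sample.pdf"

@pytest.mark.skipif(not SAMPLE.exists(), reason="sample PDF fixture missing")
def test_parse_pdf_extracts_text():
    # a known-good sample should yield non-empty text
    text = parse_pdf(SAMPLE)
    assert isinstance(text, str)
    assert text.strip()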
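
The mail-sending diff is likewise not shown here. With Python's standard library, the key point of the rework is that EmailMessage.set_content defaults to text/plain, so HTML in the body is transmitted literally rather than rendered. A minimal sketch, with addresses and the SMTP host as placeholder assumptions:

import smtplib
from email.message import EmailMessage

msg = EmailMessage()
msg["Subject"] = "Suchergebnis"            # placeholder subject
msg["From"] = "noreply@example.org"        # placeholder sender
msg["To"] = "empfaenger@example.org"       # placeholder recipient
# set_content without a subtype produces text/plain, so any HTML in the
# body stays literal text instead of bleeding into the rendered message
msg.set_content("Titel: Beispielbuch\nSeiten: 320\n")

with smtplib.SMTP("localhost") as smtp:    # assumed local relay
    smtp.send_message(msg)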


@@ -1,13 +1,15 @@
 from __future__ import annotations
 
 import re
-from dataclasses import dataclass, asdict, field
-from typing import Optional, List, Iterable
-from urllib.parse import urljoin, quote_plus
+from dataclasses import asdict, dataclass, field
+from typing import Iterable, List, Optional
+from urllib.parse import quote_plus, urljoin
 
 import httpx
 from bs4 import BeautifulSoup
+
+from src.logic.dataclass import BookData
 
 BASE = "https://www.lehmanns.de"
 SEARCH_URL = "https://www.lehmanns.de/search/quick?mediatype_id=&q="
@@ -33,9 +35,11 @@ class LehmannsSearchResult:
     image: Optional[str] = None
 
     # From detail page:
-    pages: Optional[str] = None # "<N> Seiten"
-    buyable: bool = True # set in enrich_pages (detail page)
-    unavailable_hint: Optional[str] = None # e.g. "Titel ist leider vergriffen; keine Neuauflage"
+    pages: Optional[str] = None  # "<N> Seiten"
+    buyable: bool = True  # set in enrich_pages (detail page)
+    unavailable_hint: Optional[str] = (
+        None  # e.g. "Titel ist leider vergriffen; keine Neuauflage"
+    )
 
     def to_dict(self) -> dict:
         return asdict(self)
@@ -73,31 +77,45 @@ class LehmannsClient:
         # spaces -> '+'
         return SEARCH_URL + quote_plus(title)
 
-    def search_by_title(self, title: str, limit: Optional[int] = None, strict: bool = False) -> List[LehmannsSearchResult]:
+    def search_by_title(
+        self,
+        title: str,
+        limit: Optional[int] = None,
+        strict: bool = False,
+        only_latest: bool = True,
+    ) -> List[BookData]:
         """
         Parse the listing page only (no availability check here).
         Use enrich_pages(...) afterwards to fetch detail pages, add 'pages',
         and drop unbuyable items.
         """
-        url = self.build_search_url(title)
+        url = self.build_search_url(title=title)
         html = self._get(url)
         if not html:
             return []
 
         results = self._parse_results(html)
+        self.enrich_pages(results)
+        results = [BookData().from_LehmannsSearchResult(r) for r in results]
         if strict:
             # filter results to only those with exact title match (case-insensitive)
             title_lower = title.lower()
             results = [r for r in results if r.title and r.title.lower() == title_lower]
-            results = [r for r in results if r.buyable]
+            # results = [r for r in results if r.buyable]
             return results
 
         if limit is not None:
-            results = results[:max(0, limit)]
+            results = results[: max(0, limit)]
+        if only_latest and len(results) > 1:
+            # keep only the latest edition (highest edition number)
+            results.sort(key=lambda r: (r.edition_number or 0), reverse=True)
+            results = [results[0]]
         return results
 
     # ------------------- Detail enrichment & filtering -------------------
-    def enrich_pages(self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True) -> List[LehmannsSearchResult]:
+    def enrich_pages(
+        self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True
+    ) -> List[LehmannsSearchResult]:
         """
         Fetch each result.url, extract:
           - pages: from <span class="book-meta meta-seiten" itemprop="numberOfPages">...</span>
@@ -135,11 +153,15 @@ class LehmannsClient:
                 # Availability via li.availability-3
                 avail_li = soup.select_one("li.availability-3")
                 if avail_li:
-                    avail_text = " ".join(avail_li.get_text(" ", strip=True).split()).lower()
+                    avail_text = " ".join(
+                        avail_li.get_text(" ", strip=True).split()
+                    ).lower()
                     if "titel ist leider vergriffen" in avail_text:
                         r.buyable = False
                         if "keine neuauflage" in avail_text:
-                            r.unavailable_hint = "Titel ist leider vergriffen; keine Neuauflage"
+                            r.unavailable_hint = (
+                                "Titel ist leider vergriffen; keine Neuauflage"
+                            )
                         else:
                             r.unavailable_hint = "Titel ist leider vergriffen"
@@ -161,7 +183,9 @@ class LehmannsClient:
         try:
             r = self.client.get(url)
             r.encoding = "utf-8"
-            if r.status_code == 200 and "text/html" in (r.headers.get("content-type") or ""):
+            if r.status_code == 200 and "text/html" in (
+                r.headers.get("content-type") or ""
+            ):
                 return r.text
         except httpx.HTTPError:
             pass
@@ -176,12 +200,18 @@ class LehmannsClient:
             if not a:
                 continue
             url = urljoin(BASE, a["href"].strip())
-            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(strip=True)
+            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(
+                strip=True
+            )
 
             # Alternative headline => extend title
             alt_tag = block.select_one(".description[itemprop='alternativeHeadline']")
             alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
-            title = f"{base_title} : {alternative_headline}" if alternative_headline else base_title
+            title = (
+                f"{base_title} : {alternative_headline}"
+                if alternative_headline
+                else base_title
+            )
             description = alternative_headline
 
             # Authors from .author
@@ -227,7 +257,9 @@ class LehmannsClient:
             # Publisher
             publisher = None
-            pub = block.select_one(".publisherprop [itemprop='name']") or block.select_one(".publisher [itemprop='name']")
+            pub = block.select_one(
+                ".publisherprop [itemprop='name']"
+            ) or block.select_one(".publisher [itemprop='name']")
             if pub:
                 publisher = pub.get_text(strip=True)
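
For reference, a usage sketch of the reworked search_by_title, which now enriches results itself and returns BookData objects; the no-argument LehmannsClient() construction and the sample title are assumptions, since the constructor is not part of this diff:

client = LehmannsClient()
# strict=True keeps only exact (case-insensitive) title matches;
# only_latest=True (the default) keeps only the highest edition number
books = client.search_by_title("Innere Medizin", strict=True)
for book in books:
    print(book)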