Files
SemesterapparatsManager/src/logic/lehmannsapi.py

281 lines
10 KiB
Python

from __future__ import annotations
import re
from dataclasses import dataclass, asdict, field
from typing import Optional, List, Iterable
from urllib.parse import urljoin, quote_plus
import httpx
from bs4 import BeautifulSoup
BASE = "https://www.lehmanns.de"
SEARCH_URL = "https://www.lehmanns.de/search/quick?mediatype_id=&q="
@dataclass
class LehmannsSearchResult:
title: str
url: str
# Core fields from the listing card
year: Optional[int] = None
edition: Optional[int] = None
publisher: Optional[str] = None
isbn13: Optional[str] = None
# Extras from the listing card
description: Optional[str] = None
authors: list[str] = field(default_factory=list)
media_type: Optional[str] = None
book_format: Optional[str] = None
price_eur: Optional[float] = None
currency: str = "EUR"
image: Optional[str] = None
# From detail page:
pages: Optional[str] = None # "<N> Seiten"
buyable: bool = True # set in enrich_pages (detail page)
unavailable_hint: Optional[str] = None # e.g. "Titel ist leider vergriffen; keine Neuauflage"
def to_dict(self) -> dict:
return asdict(self)
class LehmannsClient:
"""Scrapes quick-search results, then enriches (and filters) via product pages."""
def __init__(self, timeout: float = 20.0):
self.client = httpx.Client(
headers={
"User-Agent": (
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/124.0 Safari/537.36"
),
"Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
},
timeout=timeout,
follow_redirects=True,
)
def close(self):
self.client.close()
def __enter__(self):
return self
def __exit__(self, *exc):
self.close()
# ------------------- Search (listing) -------------------
def build_search_url(self, title: str) -> str:
# spaces -> '+'
return SEARCH_URL + quote_plus(title)
def search_by_title(self, title: str, limit: Optional[int] = None, strict: bool = False) -> List[LehmannsSearchResult]:
"""
Parse the listing page only (no availability check here).
Use enrich_pages(...) afterwards to fetch detail pages, add 'pages',
and drop unbuyable items.
"""
url = self.build_search_url(title)
html = self._get(url)
if not html:
return []
results = self._parse_results(html)
self.enrich_pages(results)
if strict:
# filter results to only those with exact title match (case-insensitive)
title_lower = title.lower()
results = [r for r in results if r.title and r.title.lower() == title_lower]
results = [r for r in results if r.buyable]
return results
if limit is not None:
results = results[:max(0, limit)]
return results
# ------------------- Detail enrichment & filtering -------------------
def enrich_pages(self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True) -> List[LehmannsSearchResult]:
"""
Fetch each result.url, extract:
- pages: from <span class="book-meta meta-seiten" itemprop="numberOfPages">...</span>
- availability: from <li class="availability-3">...</li>
* if it contains "Titel ist leider vergriffen", mark buyable=False
* if it also contains "keine Neuauflage", set unavailable_hint accordingly
If drop_unbuyable=True, exclude non-buyable results from the returned list.
"""
enriched: List[LehmannsSearchResult] = []
for r in results:
try:
html = self._get(r.url)
if not html:
# Can't verify; keep as-is when not dropping, else skip
if not drop_unbuyable:
enriched.append(r)
continue
soup = BeautifulSoup(html, "html.parser")
# Pages
pages_node = soup.select_one(
"span.book-meta.meta-seiten[itemprop='numberOfPages'], "
"span.book-meta.meta-seiten[itemprop='numberofpages'], "
".meta-seiten [itemprop='numberOfPages'], "
".meta-seiten[itemprop='numberOfPages'], "
".book-meta.meta-seiten"
)
if pages_node:
text = pages_node.get_text(" ", strip=True)
m = re.search(r"\d+", text)
if m:
r.pages = f"{m.group(0)} Seiten"
# Availability via li.availability-3
avail_li = soup.select_one("li.availability-3")
if avail_li:
avail_text = " ".join(avail_li.get_text(" ", strip=True).split()).lower()
if "titel ist leider vergriffen" in avail_text:
r.buyable = False
if "keine neuauflage" in avail_text:
r.unavailable_hint = "Titel ist leider vergriffen; keine Neuauflage"
else:
r.unavailable_hint = "Titel ist leider vergriffen"
# Append or drop
if (not drop_unbuyable) or r.buyable:
enriched.append(r)
except Exception:
# On any per-item error, keep the record if not dropping; else skip
if not drop_unbuyable:
enriched.append(r)
continue
return enriched
# ------------------- Internals -------------------
def _get(self, url: str) -> Optional[str]:
try:
r = self.client.get(url)
r.encoding = "utf-8"
if r.status_code == 200 and "text/html" in (r.headers.get("content-type") or ""):
return r.text
except httpx.HTTPError:
pass
return None
def _parse_results(self, html: str) -> List[LehmannsSearchResult]:
soup = BeautifulSoup(html, "html.parser")
results: list[LehmannsSearchResult] = []
for block in soup.select("div.info-block"):
a = block.select_one(".title a[href]")
if not a:
continue
url = urljoin(BASE, a["href"].strip())
base_title = (block.select_one(".title [itemprop='name']") or a).get_text(strip=True)
# Alternative headline => extend title
alt_tag = block.select_one(".description[itemprop='alternativeHeadline']")
alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
title = f"{base_title} : {alternative_headline}" if alternative_headline else base_title
description = alternative_headline
# Authors from .author
authors: list[str] = []
author_div = block.select_one("div.author")
if author_div:
t = author_div.get_text(" ", strip=True)
t = re.sub(r"^\s*von\s+", "", t, flags=re.I)
for part in re.split(r"\s*;\s*|\s*&\s*|\s+und\s+", t):
name = " ".join(part.split())
if name:
authors.append(name)
# Media + format
media_type = None
book_format = None
type_text = block.select_one(".type")
if type_text:
t = type_text.get_text(" ", strip=True)
m = re.search(r"\b(Buch|eBook|Hörbuch)\b", t)
if m:
media_type = m.group(1)
fm = re.search(r"\(([^)]+)\)", t)
if fm:
book_format = fm.group(1).strip().upper()
# Year
year = None
y = block.select_one("[itemprop='copyrightYear']")
if y:
try:
year = int(y.get_text(strip=True))
except ValueError:
pass
# Edition
edition = None
ed = block.select_one("[itemprop='bookEdition']")
if ed:
m = re.search(r"\d+", ed.get_text(strip=True))
if m:
edition = int(m.group())
# Publisher
publisher = None
pub = block.select_one(".publisherprop [itemprop='name']") or block.select_one(".publisher [itemprop='name']")
if pub:
publisher = pub.get_text(strip=True)
# ISBN-13
isbn13 = None
isbn_tag = block.select_one(".isbn [itemprop='isbn'], [itemprop='isbn']")
if isbn_tag:
digits = re.sub(r"[^0-9Xx]", "", isbn_tag.get_text(strip=True))
m = re.search(r"(97[89]\d{10})", digits)
if m:
isbn13 = m.group(1)
# Price (best effort)
price_eur = None
txt = block.get_text(" ", strip=True)
mprice = re.search(r"(\d{1,3}(?:\.\d{3})*,\d{2})\s*€", txt)
if not mprice and block.parent:
sib = block.parent.get_text(" ", strip=True)
mprice = re.search(r"(\d{1,3}(?:\.\d{3})*,\d{2})\s*€", sib)
if mprice:
num = mprice.group(1).replace(".", "").replace(",", ".")
try:
price_eur = float(num)
except ValueError:
pass
# Image (best-effort)
image = None
left_img = block.find_previous("img")
if left_img and left_img.get("src"):
image = urljoin(BASE, left_img["src"])
results.append(
LehmannsSearchResult(
title=title,
url=url,
description=description,
authors=authors,
media_type=media_type,
book_format=book_format,
year=year,
edition=edition,
publisher=publisher,
isbn13=isbn13,
price_eur=price_eur,
image=image,
)
)
return results