minor and major reworks: rename swb to SRU; add a test for PDF parsing
major: rework mail to send messages as plaintext instead of HTML, preventing HTML text from bleeding into the rendered message
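Note: the mail module itself is outside this file's diff, so the plaintext change is not shown below. As a reference, a minimal sketch of the text/plain approach using only the Python standard library (the function name, host, and addresses are hypothetical, not the repo's actual mail code):

import smtplib
from email.message import EmailMessage


def send_plaintext(subject: str, body: str, sender: str, recipient: str,
                   host: str = "localhost") -> None:
    # Build a message with a text/plain payload only -- no HTML alternative,
    # so markup inside `body` is shown verbatim instead of being rendered.
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = sender
    msg["To"] = recipient
    msg.set_content(body)  # defaults to Content-Type: text/plain; charset="utf-8"

    with smtplib.SMTP(host) as smtp:
        smtp.send_message(msg)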
@@ -1,13 +1,15 @@
 from __future__ import annotations
 
 import re
-from dataclasses import dataclass, asdict, field
-from typing import Optional, List, Iterable
-from urllib.parse import urljoin, quote_plus
+from dataclasses import asdict, dataclass, field
+from typing import Iterable, List, Optional
+from urllib.parse import quote_plus, urljoin
 
 import httpx
 from bs4 import BeautifulSoup
 
+from src.logic.dataclass import BookData
+
 BASE = "https://www.lehmanns.de"
 SEARCH_URL = "https://www.lehmanns.de/search/quick?mediatype_id=&q="
 
@@ -33,9 +35,11 @@ class LehmannsSearchResult:
     image: Optional[str] = None
 
     # From detail page:
     pages: Optional[str] = None  # "<N> Seiten"
     buyable: bool = True  # set in enrich_pages (detail page)
-    unavailable_hint: Optional[str] = None  # e.g. "Titel ist leider vergriffen; keine Neuauflage"
+    unavailable_hint: Optional[str] = (
+        None  # e.g. "Titel ist leider vergriffen; keine Neuauflage"
+    )
 
     def to_dict(self) -> dict:
         return asdict(self)
@@ -73,31 +77,45 @@ class LehmannsClient:
         # spaces -> '+'
         return SEARCH_URL + quote_plus(title)
 
-    def search_by_title(self, title: str, limit: Optional[int] = None, strict: bool = False) -> List[LehmannsSearchResult]:
+    def search_by_title(
+        self,
+        title: str,
+        limit: Optional[int] = None,
+        strict: bool = False,
+        only_latest: bool = True,
+    ) -> List[BookData]:
         """
         Parse the listing page only (no availability check here).
         Use enrich_pages(...) afterwards to fetch detail pages, add 'pages',
         and drop unbuyable items.
         """
-        url = self.build_search_url(title)
+        url = self.build_search_url(title=title)
         html = self._get(url)
         if not html:
             return []
         results = self._parse_results(html)
+        self.enrich_pages(results)
+
+        results = [BookData().from_LehmannsSearchResult(r) for r in results]
         if strict:
             # filter results to only those with exact title match (case-insensitive)
             title_lower = title.lower()
             results = [r for r in results if r.title and r.title.lower() == title_lower]
-            results = [r for r in results if r.buyable]
+            # results = [r for r in results if r.buyable]
             return results
         if limit is not None:
-            results = results[:max(0, limit)]
+            results = results[: max(0, limit)]
+        if only_latest and len(results) > 1:
+            # keep only the latest edition (highest edition number)
+            results.sort(key=lambda r: (r.edition_number or 0), reverse=True)
+            results = [results[0]]
         return results
 
     # ------------------- Detail enrichment & filtering -------------------
 
-    def enrich_pages(self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True) -> List[LehmannsSearchResult]:
+    def enrich_pages(
+        self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True
+    ) -> List[LehmannsSearchResult]:
         """
         Fetch each result.url, extract:
         - pages: from <span class="book-meta meta-seiten" itemprop="numberOfPages">...</span>
@@ -135,11 +153,15 @@ class LehmannsClient:
             # Availability via li.availability-3
             avail_li = soup.select_one("li.availability-3")
             if avail_li:
-                avail_text = " ".join(avail_li.get_text(" ", strip=True).split()).lower()
+                avail_text = " ".join(
+                    avail_li.get_text(" ", strip=True).split()
+                ).lower()
                 if "titel ist leider vergriffen" in avail_text:
                     r.buyable = False
                     if "keine neuauflage" in avail_text:
-                        r.unavailable_hint = "Titel ist leider vergriffen; keine Neuauflage"
+                        r.unavailable_hint = (
+                            "Titel ist leider vergriffen; keine Neuauflage"
+                        )
                     else:
                         r.unavailable_hint = "Titel ist leider vergriffen"
 
@@ -161,7 +183,9 @@ class LehmannsClient:
         try:
             r = self.client.get(url)
            r.encoding = "utf-8"
-            if r.status_code == 200 and "text/html" in (r.headers.get("content-type") or ""):
+            if r.status_code == 200 and "text/html" in (
+                r.headers.get("content-type") or ""
+            ):
                 return r.text
         except httpx.HTTPError:
             pass
@@ -176,12 +200,18 @@ class LehmannsClient:
             if not a:
                 continue
             url = urljoin(BASE, a["href"].strip())
-            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(strip=True)
+            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(
+                strip=True
+            )
 
             # Alternative headline => extend title
             alt_tag = block.select_one(".description[itemprop='alternativeHeadline']")
             alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
-            title = f"{base_title} : {alternative_headline}" if alternative_headline else base_title
+            title = (
+                f"{base_title} : {alternative_headline}"
+                if alternative_headline
+                else base_title
+            )
             description = alternative_headline
 
             # Authors from .author
@@ -227,7 +257,9 @@ class LehmannsClient:
 
             # Publisher
             publisher = None
-            pub = block.select_one(".publisherprop [itemprop='name']") or block.select_one(".publisher [itemprop='name']")
+            pub = block.select_one(
+                ".publisherprop [itemprop='name']"
+            ) or block.select_one(".publisher [itemprop='name']")
             if pub:
                 publisher = pub.get_text(strip=True)
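For context on the reworked search_by_title above (it now enriches pages inline, converts to BookData, and defaults to only_latest=True), a hedged usage sketch; the module path, client constructor, and BookData attributes are assumptions, since the diff shows only part of the file:

from src.logic.lehmanns import LehmannsClient  # module path is an assumption

client = LehmannsClient()  # constructor arguments, if any, are not shown here
# strict=True keeps only exact (case-insensitive) title matches;
# only_latest=True (the new default) keeps the highest edition_number.
books = client.search_by_title("Taschenbuch der Mathematik", strict=True)
for book in books:
    # BookData fields assumed to mirror the search result (title, pages, ...)
    print(book.title, book.pages)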
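The PDF-parsing test named in the commit message is also not part of this diff. A sketch of the shape such a test could take, assuming pytest and pypdf; the fixture path and assertion are hypothetical:

from pathlib import Path

from pypdf import PdfReader  # assumes pypdf; the repo's actual parser is not shown

FIXTURE = Path("tests/fixtures/sample.pdf")  # hypothetical fixture


def test_pdf_parsing_extracts_text() -> None:
    reader = PdfReader(FIXTURE)
    text = "".join(page.extract_text() or "" for page in reader.pages)
    # A real test would assert on known fixture content; non-emptiness is a stand-in.
    assert text.strip()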