Refactor and enhance type hints across multiple modules

- Updated the `from_tuple` method in the `Prof` class to specify its return type.
- Added type hints for various methods in `LehmannsClient`, `OpenAI`, `WebRequest`, and `ZoteroController` classes to improve code clarity and type safety.
- Modified `pdf_to_csv` function to return a string instead of a DataFrame.
- Enhanced error handling and type hints in `wordparser` and `xmlparser` modules.
- Removed unused UI file `Ui_medianadder.ts`.
- Improved the layout and structure of the `semesterapparat_ui` to enhance user experience.
- Updated file picker to support `.doc` files in addition to `.docx`.
- Added unique item handling in the `Ui` class to prevent duplicates in the apparat list.
- General code cleanup and consistency improvements across various files.
This commit is contained in:
2025-10-21 09:09:54 +02:00
parent 560d8285b5
commit 0406fe4f6f
26 changed files with 437 additions and 396 deletions

View File

@@ -134,10 +134,10 @@ class LehmannsClient:
enriched.append(r)
continue
soup = BeautifulSoup(html, "html.parser")
soup = BeautifulSoup(html, "html.parser") # type: ignore
# Pages
pages_node = soup.select_one(
pages_node = soup.select_one( # type: ignore
"span.book-meta.meta-seiten[itemprop='numberOfPages'], "
"span.book-meta.meta-seiten[itemprop='numberofpages'], "
".meta-seiten [itemprop='numberOfPages'], "
@@ -151,7 +151,7 @@ class LehmannsClient:
r.pages = f"{m.group(0)} Seiten"
# Availability via li.availability-3
avail_li = soup.select_one("li.availability-3")
avail_li = soup.select_one("li.availability-3") # type: ignore
if avail_li:
avail_text = " ".join(
avail_li.get_text(" ", strip=True).split()
@@ -200,12 +200,12 @@ class LehmannsClient:
if not a:
continue
url = urljoin(BASE, a["href"].strip())
base_title = (block.select_one(".title [itemprop='name']") or a).get_text(
base_title = (block.select_one(".title [itemprop='name']") or a).get_text( # type: ignore
strip=True
)
# Alternative headline => extend title
alt_tag = block.select_one(".description[itemprop='alternativeHeadline']")
alt_tag = block.select_one(".description[itemprop='alternativeHeadline']") # type: ignore
alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
title = (
f"{base_title} : {alternative_headline}"
@@ -216,7 +216,7 @@ class LehmannsClient:
# Authors from .author
authors: list[str] = []
author_div = block.select_one("div.author")
author_div = block.select_one("div.author") # type: ignore
if author_div:
t = author_div.get_text(" ", strip=True)
t = re.sub(r"^\s*von\s+", "", t, flags=re.I)
@@ -228,7 +228,7 @@ class LehmannsClient:
# Media + format
media_type = None
book_format = None
type_text = block.select_one(".type")
type_text = block.select_one(".type") # type: ignore
if type_text:
t = type_text.get_text(" ", strip=True)
m = re.search(r"\b(Buch|eBook|Hörbuch)\b", t)
@@ -240,7 +240,7 @@ class LehmannsClient:
# Year
year = None
y = block.select_one("[itemprop='copyrightYear']")
y = block.select_one("[itemprop='copyrightYear']") # type: ignore
if y:
try:
year = int(y.get_text(strip=True))
@@ -249,7 +249,7 @@ class LehmannsClient:
# Edition
edition = None
ed = block.select_one("[itemprop='bookEdition']")
ed = block.select_one("[itemprop='bookEdition']") # type: ignore
if ed:
m = re.search(r"\d+", ed.get_text(strip=True))
if m:
@@ -257,15 +257,15 @@ class LehmannsClient:
# Publisher
publisher = None
pub = block.select_one(
pub = block.select_one( # type: ignore
".publisherprop [itemprop='name']"
) or block.select_one(".publisher [itemprop='name']")
) or block.select_one(".publisher [itemprop='name']") # type: ignore
if pub:
publisher = pub.get_text(strip=True)
# ISBN-13
isbn13 = None
isbn_tag = block.select_one(".isbn [itemprop='isbn'], [itemprop='isbn']")
isbn_tag = block.select_one(".isbn [itemprop='isbn'], [itemprop='isbn']") # type: ignore
if isbn_tag:
digits = re.sub(r"[^0-9Xx]", "", isbn_tag.get_text(strip=True))
m = re.search(r"(97[89]\d{10})", digits)
@@ -288,7 +288,7 @@ class LehmannsClient:
# Image (best-effort)
image = None
left_img = block.find_previous("img")
left_img = block.find_previous("img") # type: ignore
if left_img and left_img.get("src"):
image = urljoin(BASE, left_img["src"])