chore: restructured project, updated readme

This commit is contained in:
2025-10-29 09:31:40 +01:00
parent a4460ec17b
commit ee62c65ae7
70 changed files with 8518 additions and 100 deletions

View File

@@ -0,0 +1,345 @@
import os
import re
from concurrent.futures import ThreadPoolExecutor
from math import ceil
from queue import Empty, Queue
from time import monotonic # <-- NEW
from typing import List, Optional
from PySide6.QtCore import QThread, Signal
# from src.services.webrequest import BibTextTransformer, WebRequest
from src.services.catalogue import Catalogue
from src.core.models import BookData
from src.services.sru import SWB
from src.shared.logging import log
# use all available cores - 2, but at least 1
THREAD_COUNT = max(os.cpu_count() - 2, 1)
THREAD_MIN_ITEMS = 5
# Logger configured centrally in main; use shared `log`
swb = SWB()
dnb = SWB()
cat = Catalogue()
RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative RVK character set
def find_newer_edition(
swb_result: BookData, dnb_result: List[BookData]
) -> Optional[List[BookData]]:
"""
New edition if:
- year > swb.year OR
- edition_number > swb.edition_number
BUT: discard any candidate with year < swb.year (if both years are known).
Same-work check:
- Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
- If both have signatures and RVKs differ -> skip.
Preferences (in order):
1) RVK matches SWB
2) Print over Online-Ressource
3) Has signature
4) Newer: (year desc, edition_number desc)
"""
def strip_copy_and_edition(s: str) -> str:
s = re.sub(r"\(\s*\d+\s*\)", "", s) # remove '(N)'
s = re.sub(r"\s*\+\s*\d+\s*$", "", s) # remove trailing '+N'
return s
def extract_rvk_root(sig: Optional[str]) -> str:
if not sig:
return ""
t = strip_copy_and_edition(sig.upper())
t = re.sub(r"\s+", " ", t).strip()
m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
if not m:
cleaned = re.sub(rf"[^{RVK_ALLOWED} ]+", "", t).strip()
return cleaned.split(" ")[0] if cleaned else ""
return re.sub(r"\s+", " ", m.group(1)).strip()
def has_sig(b: BookData) -> bool:
return bool(getattr(b, "signature", None))
def is_online(b: BookData) -> bool:
return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
def is_print(b: BookData) -> bool:
return not is_online(b)
def rvk_matches_swb(b: BookData) -> bool:
if not has_sig(b) or not has_sig(swb_result):
return False
return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
def strictly_newer(b: BookData) -> bool:
# Hard guard: if both years are known and candidate is older, discard
if (
b.year is not None
and swb_result.year is not None
and b.year < swb_result.year
):
return False
newer_by_year = (
b.year is not None
and swb_result.year is not None
and b.year > swb_result.year
)
newer_by_edition = (
b.edition_number is not None
and swb_result.edition_number is not None
and b.edition_number > swb_result.edition_number
)
# Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
return newer_by_year or newer_by_edition
swb_has_sig = has_sig(swb_result)
swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
# 1) Filter: same work (by RVK if both have sigs) AND strictly newer
candidates: List[BookData] = []
for b in dnb_result:
if has_sig(b) and swb_has_sig:
if extract_rvk_root(b.signature) != swb_rvk:
continue # different work
if strictly_newer(b):
candidates.append(b)
if not candidates:
return None
# 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
def pref_score(x: BookData) -> tuple[int, int, int]:
return (
1 if rvk_matches_swb(x) else 0,
1 if is_print(x) else 0,
1 if has_sig(x) else 0,
)
by_ppn: dict[Optional[str], BookData] = {}
for b in candidates:
key = getattr(b, "ppn", None)
prev = by_ppn.get(key)
if prev is None or pref_score(b) > pref_score(prev):
by_ppn[key] = b
deduped = list(by_ppn.values())
if not deduped:
return None
# 3) Preserve all qualifying newer editions, but order by preference
def sort_key(b: BookData):
year = b.year if b.year is not None else -1
ed = b.edition_number if b.edition_number is not None else -1
return (
1 if rvk_matches_swb(b) else 0,
1 if is_print(b) else 0,
1 if has_sig(b) else 0,
year,
ed,
)
deduped.sort(key=sort_key, reverse=True)
return deduped
class NewEditionCheckerThread(QThread):
updateSignal = Signal(int, int) # (processed, total)
updateProgress = Signal(int, int) # (processed, total)
total_entries_signal = Signal(int)
resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]]
# NEW: metrics signals
rateSignal = Signal(float) # items per second ("it/s")
etaSignal = Signal(int) # seconds remaining (-1 when unknown)
def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
super().__init__(parent)
self.entries: list["BookData"] = entries if entries is not None else []
self.results: list[tuple["BookData", list["BookData"]]] = []
def reset(self):
self.entries = []
self.results = []
# ---------- internal helpers ----------
@staticmethod
def _split_evenly(items: list, parts: int) -> list[list]:
"""Split items as evenly as possible into `parts` chunks (no empty tails)."""
if parts <= 1 or len(items) <= 1:
return [items]
n = len(items)
base = n // parts
extra = n % parts
chunks = []
i = 0
for k in range(parts):
size = base + (1 if k < extra else 0)
if size == 0:
continue
chunks.append(items[i : i + size])
i += size
return chunks
@staticmethod
def _clean_title(raw: str) -> str:
title = raw.rstrip(" .:,;!?")
title = re.sub(r"\s*\(.*\)", "", title)
return title.strip()
@classmethod
def _process_book(
cls, book: "BookData"
) -> tuple["BookData", list["BookData"]] | None:
"""Process one book; returns (original, [found editions]) or None on failure."""
if not book.title:
return None
response: list["BookData"] = []
query = [
f"pica.tit={book.title}",
f"pica.vlg={book.publisher}",
]
swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
dnb_results = swb.getBooks(query)
new_editions = find_newer_edition(swb_result, dnb_results)
if new_editions is not None:
for new_edition in new_editions:
new_edition.library_location = cat.get_location(new_edition.ppn)
try:
isbn = (
str(new_edition.isbn[0])
if isinstance(new_edition.isbn, list)
else str(new_edition.isbn)
)
new_edition.link = (
f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
)
except (IndexError, TypeError):
isbn = None
new_edition.in_library = cat.in_library(new_edition.ppn)
response = new_editions
# client = SWB()
# response: list["BookData"] = []
# # First, search by title only
# results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
# lehmanns = LehmannsClient()
# results = lehmanns.search_by_title(title)
# for result in results:
# if "(eBook)" in result.title:
# result.title = result.title.replace("(eBook)", "").strip()
# swb_results = client.getBooks(
# [
# f"pica.tit={result.title}",
# f"pica.vlg={result.publisher.split(',')[0]}",
# ]
# )
# for swb in swb_results:
# if swb.isbn == result.isbn:
# result.ppn = swb.ppn
# result.signature = swb.signature
# response.append(result)
# if (result.edition_number < swb.edition_number) and (
# swb.year > result.year
# ):
# response.append(result)
if response == []:
return None
# Remove duplicates based on ppn
return (book, response)
@classmethod
def _worker(cls, items: list["BookData"], q: Queue) -> None:
"""Worker for one chunk; pushes ('result', ...), ('progress', 1), and ('done', None)."""
try:
for book in items:
try:
result = cls._process_book(book)
except Exception:
result = None
if result is not None:
q.put(("result", result))
q.put(("progress", 1))
finally:
q.put(("done", None))
# ---------- thread entry point ----------
def run(self):
total = len(self.entries)
self.total_entries_signal.emit(total)
# start timer for metrics
t0 = monotonic()
if total == 0:
log.debug("No entries to process.")
# emit metrics (zero work)
self.rateSignal.emit(0.0)
self.etaSignal.emit(0)
self.resultsSignal.emit([])
return
# Up to 4 workers; ~20 items per worker
num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
chunks = self._split_evenly(self.entries, num_workers)
sizes = [len(ch) for ch in chunks]
q: Queue = Queue()
processed = 0
finished_workers = 0
with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
futures = [ex.submit(self._worker, ch, q) for ch in chunks]
log.info(
f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
)
for idx, sz in enumerate(sizes, 1):
log.debug(f"Thread {idx}: {sz} entries")
# Aggregate progress/results
while finished_workers < len(chunks):
try:
kind, payload = q.get(timeout=0.1)
except Empty:
continue
if kind == "progress":
processed += int(payload)
self.updateSignal.emit(processed, total)
self.updateProgress.emit(processed, total)
# ---- NEW: compute & emit metrics ----
elapsed = max(1e-9, monotonic() - t0)
rate = processed / elapsed # items per second
remaining = max(0, total - processed)
eta_sec = int(round(remaining / rate)) if rate > 0 else -1
self.rateSignal.emit(rate)
# clamp negative just in case
self.etaSignal.emit(max(0, eta_sec) if eta_sec >= 0 else -1)
# -------------------------------------
elif kind == "result":
self.results.append(payload)
elif kind == "done":
finished_workers += 1
# Final metrics on completion
elapsed_total = max(1e-9, monotonic() - t0)
final_rate = total / elapsed_total
self.rateSignal.emit(final_rate)
self.etaSignal.emit(0)
self.resultsSignal.emit(self.results)