chore: restructured project, updated readme

2025-10-29 09:31:40 +01:00
parent a4460ec17b
commit ee62c65ae7
70 changed files with 8518 additions and 100 deletions
--- a/src/background/new_editions.py
+++ b/src/background/new_editions.py
@@ -0,0 +1,345 @@
+import os
+import re
+from concurrent.futures import ThreadPoolExecutor
+from math import ceil
+from queue import Empty, Queue
+from time import monotonic  # <-- NEW
+from typing import List, Optional
+
+from PySide6.QtCore import QThread, Signal
+
+# from src.services.webrequest import BibTextTransformer, WebRequest
+from src.services.catalogue import Catalogue
+from src.core.models import BookData
+from src.services.sru import SWB
+from src.shared.logging import log
+
+# use all available cores - 2, but at least 1
+THREAD_COUNT = max(os.cpu_count() - 2, 1)
+THREAD_MIN_ITEMS = 5
+
+# Logger configured centrally in main; use shared `log`
+
+swb = SWB()
+dnb = SWB()
+cat = Catalogue()
+
+RVK_ALLOWED = r"[A-Z0-9.\-\/]"  # conservative RVK character set
+
+
+def find_newer_edition(
+    swb_result: BookData, dnb_result: List[BookData]
+) -> Optional[List[BookData]]:
+    """
+    New edition if:
+      - year > swb.year OR
+      - edition_number > swb.edition_number
+    BUT: discard any candidate with year < swb.year (if both years are known).
+
+    Same-work check:
+      - Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
+      - If both have signatures and RVKs differ -> skip.
+
+    Preferences (in order):
+      1) RVK matches SWB
+      2) Print over Online-Ressource
+      3) Has signature
+      4) Newer: (year desc, edition_number desc)
+    """
+
+    def strip_copy_and_edition(s: str) -> str:
+        s = re.sub(r"\(\s*\d+\s*\)", "", s)  # remove '(N)'
+        s = re.sub(r"\s*\+\s*\d+\s*$", "", s)  # remove trailing '+N'
+        return s
+
+    def extract_rvk_root(sig: Optional[str]) -> str:
+        if not sig:
+            return ""
+        t = strip_copy_and_edition(sig.upper())
+        t = re.sub(r"\s+", " ", t).strip()
+        m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
+        if not m:
+            cleaned = re.sub(rf"[^{RVK_ALLOWED} ]+", "", t).strip()
+            return cleaned.split(" ")[0] if cleaned else ""
+        return re.sub(r"\s+", " ", m.group(1)).strip()
+
+    def has_sig(b: BookData) -> bool:
+        return bool(getattr(b, "signature", None))
+
+    def is_online(b: BookData) -> bool:
+        return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
+
+    def is_print(b: BookData) -> bool:
+        return not is_online(b)
+
+    def rvk_matches_swb(b: BookData) -> bool:
+        if not has_sig(b) or not has_sig(swb_result):
+            return False
+        return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
+
+    def strictly_newer(b: BookData) -> bool:
+        # Hard guard: if both years are known and candidate is older, discard
+        if (
+            b.year is not None
+            and swb_result.year is not None
+            and b.year < swb_result.year
+        ):
+            return False
+
+        newer_by_year = (
+            b.year is not None
+            and swb_result.year is not None
+            and b.year > swb_result.year
+        )
+        newer_by_edition = (
+            b.edition_number is not None
+            and swb_result.edition_number is not None
+            and b.edition_number > swb_result.edition_number
+        )
+        # Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
+        return newer_by_year or newer_by_edition
+
+    swb_has_sig = has_sig(swb_result)
+    swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
+
+    # 1) Filter: same work (by RVK if both have sigs) AND strictly newer
+    candidates: List[BookData] = []
+    for b in dnb_result:
+        if has_sig(b) and swb_has_sig:
+            if extract_rvk_root(b.signature) != swb_rvk:
+                continue  # different work
+        if strictly_newer(b):
+            candidates.append(b)
+
+    if not candidates:
+        return None
+
+    # 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
+    def pref_score(x: BookData) -> tuple[int, int, int]:
+        return (
+            1 if rvk_matches_swb(x) else 0,
+            1 if is_print(x) else 0,
+            1 if has_sig(x) else 0,
+        )
+
+    by_ppn: dict[Optional[str], BookData] = {}
+    for b in candidates:
+        key = getattr(b, "ppn", None)
+        prev = by_ppn.get(key)
+        if prev is None or pref_score(b) > pref_score(prev):
+            by_ppn[key] = b
+
+    deduped = list(by_ppn.values())
+    if not deduped:
+        return None
+
+    # 3) Preserve all qualifying newer editions, but order by preference
+    def sort_key(b: BookData):
+        year = b.year if b.year is not None else -1
+        ed = b.edition_number if b.edition_number is not None else -1
+        return (
+            1 if rvk_matches_swb(b) else 0,
+            1 if is_print(b) else 0,
+            1 if has_sig(b) else 0,
+            year,
+            ed,
+        )
+
+    deduped.sort(key=sort_key, reverse=True)
+    return deduped
+
+
+class NewEditionCheckerThread(QThread):
+    updateSignal = Signal(int, int)  # (processed, total)
+    updateProgress = Signal(int, int)  # (processed, total)
+    total_entries_signal = Signal(int)
+    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]
+
+    # NEW: metrics signals
+    rateSignal = Signal(float)  # items per second ("it/s")
+    etaSignal = Signal(int)  # seconds remaining (-1 when unknown)
+
+    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
+        super().__init__(parent)
+        self.entries: list["BookData"] = entries if entries is not None else []
+        self.results: list[tuple["BookData", list["BookData"]]] = []
+
+    def reset(self):
+        self.entries = []
+        self.results = []
+
+    # ---------- internal helpers ----------
+
+    @staticmethod
+    def _split_evenly(items: list, parts: int) -> list[list]:
+        """Split items as evenly as possible into `parts` chunks (no empty tails)."""
+        if parts <= 1 or len(items) <= 1:
+            return [items]
+        n = len(items)
+        base = n // parts
+        extra = n % parts
+        chunks = []
+        i = 0
+        for k in range(parts):
+            size = base + (1 if k < extra else 0)
+            if size == 0:
+                continue
+            chunks.append(items[i : i + size])
+            i += size
+        return chunks
+
+    @staticmethod
+    def _clean_title(raw: str) -> str:
+        title = raw.rstrip(" .:,;!?")
+        title = re.sub(r"\s*\(.*\)", "", title)
+        return title.strip()
+
+    @classmethod
+    def _process_book(
+        cls, book: "BookData"
+    ) -> tuple["BookData", list["BookData"]] | None:
+        """Process one book; returns (original, [found editions]) or None on failure."""
+        if not book.title:
+            return None
+        response: list["BookData"] = []
+        query = [
+            f"pica.tit={book.title}",
+            f"pica.vlg={book.publisher}",
+        ]
+
+        swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
+        dnb_results = swb.getBooks(query)
+        new_editions = find_newer_edition(swb_result, dnb_results)
+
+        if new_editions is not None:
+            for new_edition in new_editions:
+                new_edition.library_location = cat.get_location(new_edition.ppn)
+                try:
+                    isbn = (
+                        str(new_edition.isbn[0])
+                        if isinstance(new_edition.isbn, list)
+                        else str(new_edition.isbn)
+                    )
+                    new_edition.link = (
+                        f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
+                    )
+                except (IndexError, TypeError):
+                    isbn = None
+                new_edition.in_library = cat.in_library(new_edition.ppn)
+            response = new_editions
+
+        # client = SWB()
+        # response: list["BookData"] = []
+        # # First, search by title only
+        # results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
+
+        # lehmanns = LehmannsClient()
+        # results = lehmanns.search_by_title(title)
+        # for result in results:
+        #     if "(eBook)" in result.title:
+        #         result.title = result.title.replace("(eBook)", "").strip()
+        #     swb_results = client.getBooks(
+        #         [
+        #             f"pica.tit={result.title}",
+        #             f"pica.vlg={result.publisher.split(',')[0]}",
+        #         ]
+        #     )
+        #     for swb in swb_results:
+        #         if swb.isbn == result.isbn:
+        #             result.ppn = swb.ppn
+        #             result.signature = swb.signature
+        #             response.append(result)
+        #         if (result.edition_number < swb.edition_number) and (
+        #             swb.year > result.year
+        #         ):
+        #             response.append(result)
+        if response == []:
+            return None
+        # Remove duplicates based on ppn
+        return (book, response)
+
+    @classmethod
+    def _worker(cls, items: list["BookData"], q: Queue) -> None:
+        """Worker for one chunk; pushes ('result', ...), ('progress', 1), and ('done', None)."""
+        try:
+            for book in items:
+                try:
+                    result = cls._process_book(book)
+                except Exception:
+                    result = None
+                if result is not None:
+                    q.put(("result", result))
+                q.put(("progress", 1))
+        finally:
+            q.put(("done", None))
+
+    # ---------- thread entry point ----------
+
+    def run(self):
+        total = len(self.entries)
+        self.total_entries_signal.emit(total)
+
+        # start timer for metrics
+        t0 = monotonic()
+
+        if total == 0:
+            log.debug("No entries to process.")
+            # emit metrics (zero work)
+            self.rateSignal.emit(0.0)
+            self.etaSignal.emit(0)
+            self.resultsSignal.emit([])
+            return
+
+        # Up to 4 workers; ~20 items per worker
+        num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
+        chunks = self._split_evenly(self.entries, num_workers)
+        sizes = [len(ch) for ch in chunks]
+
+        q: Queue = Queue()
+        processed = 0
+        finished_workers = 0
+
+        with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
+            futures = [ex.submit(self._worker, ch, q) for ch in chunks]
+
+            log.info(
+                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
+            )
+            for idx, sz in enumerate(sizes, 1):
+                log.debug(f"Thread {idx}: {sz} entries")
+
+            # Aggregate progress/results
+            while finished_workers < len(chunks):
+                try:
+                    kind, payload = q.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if kind == "progress":
+                    processed += int(payload)
+                    self.updateSignal.emit(processed, total)
+                    self.updateProgress.emit(processed, total)
+
+                    # ---- NEW: compute & emit metrics ----
+                    elapsed = max(1e-9, monotonic() - t0)
+                    rate = processed / elapsed  # items per second
+                    remaining = max(0, total - processed)
+                    eta_sec = int(round(remaining / rate)) if rate > 0 else -1
+
+                    self.rateSignal.emit(rate)
+                    # clamp negative just in case
+                    self.etaSignal.emit(max(0, eta_sec) if eta_sec >= 0 else -1)
+                    # -------------------------------------
+
+                elif kind == "result":
+                    self.results.append(payload)
+                elif kind == "done":
+                    finished_workers += 1
+
+        # Final metrics on completion
+        elapsed_total = max(1e-9, monotonic() - t0)
+        final_rate = total / elapsed_total
+        self.rateSignal.emit(final_rate)
+        self.etaSignal.emit(0)
+
+        self.resultsSignal.emit(self.results)