rework threads and also use app_ids where applicable

2025-10-07 14:11:14 +02:00
parent 8e9eff4f3a
commit e061c1f5a9
4 changed files with 225 additions and 157 deletions


@@ -1,19 +1,26 @@
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from math import ceil
from queue import Empty, Queue
from typing import List, Optional, Set, Union
from time import monotonic # <-- NEW
from typing import List, Optional
import loguru
from PySide6.QtCore import QThread, Signal
from src import LOG_DIR
# from src.logic.webrequest import BibTextTransformer, WebRequest
from src.backend.catalogue import Catalogue
from src.logic import BookData
from src.logic.lehmannsapi import LehmannsClient
from src.logic.swb import SWB
from src.logic.SRU import SWB
# use all available cores - 2, but at least 1 (os.cpu_count() may return None)
THREAD_COUNT = max((os.cpu_count() or 1) - 2, 1)
THREAD_MIN_ITEMS = 5
log = loguru.logger
log.remove()
@@ -23,89 +30,136 @@ log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
log.add(
f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
rotation="1 day",
retention="1 month",
retention="7 days",
)
swb = SWB()
dnb = SWB()
cat = Catalogue()
def _norm_text(s: Optional[str]) -> str:
if not s:
return ""
# lowercase, collapse whitespace, drop some punctuation
s = s.lower()
s = re.sub(r"[\s\-\u2013\u2014]+", " ", s) # spaces/dashes
s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
return s.strip()
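# Quick illustration (hypothetical input) of what the normalization yields:
#   _norm_text("Foo - Bar: Baz!")  # -> "foo bar baz"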
RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative RVK character set
def _same_book(a: BookData, b: BookData) -> bool:
"""Heuristic: same if ISBNs intersect; fallback to (title, author, year) normalized."""
isbns_a = _norm_isbns(a.isbn)
isbns_b = _norm_isbns(b.isbn)
if isbns_a and isbns_b and (isbns_a & isbns_b):
return True
ta, tb = _norm_text(a.title), _norm_text(b.title)
aa, ab = _norm_text(a.author), _norm_text(b.author)
ya, yb = (a.year or "").strip(), (b.year or "").strip()
# strong title match required; then author if available; then year if available
if ta and tb and ta == tb:
if aa and ab and aa == ab:
if ya and yb:
return ya == yb
return True
if ya and yb:
return ya == yb
return True
return False
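# Illustration with hypothetical records: an ISBN overlap matches outright;
# otherwise the titles must match exactly, and author/year are only compared
# when both sides provide them.
#   a.title = b.title = "Clean Architecture"
#   a.author, b.author = "Martin, Robert C.", ""   # author missing on one side
#   a.year = b.year = "2017"
#   _same_book(a, b)  # -> True (title and year match, author comparison skipped)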
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
if value is None:
return set()
vals = value if isinstance(value, list) else [value]
out: Set[str] = set()
for v in vals:
s = str(v)
digits = re.sub(r"[^0-9Xx]", "", s)
# keep 13-digit or 10-digit tokens
m13 = re.findall(r"97[89]\d{10}", digits)
if m13:
out.update(m13)
else:
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
out.update(x.upper() for x in m10)
return out
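# Doctest-style illustration (hypothetical ISBNs; no checksum validation is done):
#   _norm_isbns("978-3-16-148410-0")  # -> {"9783161484100"}
#   _norm_isbns(["3-16-148410-x"])    # -> {"316148410X"}  (ISBN-10, X upper-cased)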
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
def find_newer_edition(
swb_result: BookData, dnb_result: List[BookData]
) -> Optional[List[BookData]]:
"""
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
Returns a NEW list (does not mutate the input).
New edition if:
- year > swb.year OR
- edition_number > swb.edition_number
BUT: discard any candidate with year < swb.year (if both years are known).
Same-work check:
- Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
- If both have signatures and RVKs differ -> skip.
Preferences (in order):
1) RVK matches SWB
2) Print over Online-Ressource
3) Has signature
4) Newer: (year desc, edition_number desc)
"""
swb_with_sig = [
r
for r in records
if (r.link == "SWB") and (r.signature and r.signature.strip())
]
if not swb_with_sig:
return list(records)
to_remove: Set[int] = set()
def strip_copy_and_edition(s: str) -> str:
s = re.sub(r"\(\s*\d+\s*\)", "", s) # remove '(N)'
s = re.sub(r"\s*\+\s*\d+\s*$", "", s) # remove trailing '+N'
return s
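# e.g. (hypothetical signature): strip_copy_and_edition("ST 250 P99(2)+1") -> "ST 250 P99"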
# For each URL entry, see if it matches any SWB-with-signature entry
for idx, rec in enumerate(records):
if not rec.link or not rec.link.lower().startswith("http"):
continue
for swb in swb_with_sig:
if _same_book(swb, rec):
to_remove.add(idx)
break
def extract_rvk_root(sig: Optional[str]) -> str:
if not sig:
return ""
t = strip_copy_and_edition(sig.upper())
t = re.sub(r"\s+", " ", t).strip()
m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
if not m:
# RVK_ALLOWED is itself a bracketed class, so splice its members into the
# negated class rather than nesting brackets (which would end the class early)
cleaned = re.sub(rf"[^{RVK_ALLOWED[1:-1]} ]+", "", t).strip()
return cleaned.split(" ")[0] if cleaned else ""
return re.sub(r"\s+", " ", m.group(1)).strip()
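# e.g. (hypothetical signatures): copies and editions map to the same root, so
# "ST 250 P99" and "ST 250 P99+2" both yield "ST 250".
#   extract_rvk_root("ST 250 P99+2")  # -> "ST 250"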
# Build filtered list
return [rec for i, rec in enumerate(records) if i not in to_remove]
def has_sig(b: BookData) -> bool:
return bool(getattr(b, "signature", None))
def is_online(b: BookData) -> bool:
return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
def is_print(b: BookData) -> bool:
return not is_online(b)
def rvk_matches_swb(b: BookData) -> bool:
if not has_sig(b) or not has_sig(swb_result):
return False
return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
def strictly_newer(b: BookData) -> bool:
# Hard guard: if both years are known and candidate is older, discard
if (
b.year is not None
and swb_result.year is not None
and b.year < swb_result.year
):
return False
newer_by_year = (
b.year is not None
and swb_result.year is not None
and b.year > swb_result.year
)
newer_by_edition = (
b.edition_number is not None
and swb_result.edition_number is not None
and b.edition_number > swb_result.edition_number
)
# Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
return newer_by_year or newer_by_edition
swb_has_sig = has_sig(swb_result)
swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
# 1) Filter: same work (by RVK if both have sigs) AND strictly newer
candidates: List[BookData] = []
for b in dnb_result:
if has_sig(b) and swb_has_sig:
if extract_rvk_root(b.signature) != swb_rvk:
continue # different work
if strictly_newer(b):
candidates.append(b)
if not candidates:
return None
# 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
def pref_score(x: BookData) -> tuple[int, int, int]:
return (
1 if rvk_matches_swb(x) else 0,
1 if is_print(x) else 0,
1 if has_sig(x) else 0,
)
by_ppn: dict[Optional[str], BookData] = {}
for b in candidates:
key = getattr(b, "ppn", None)
prev = by_ppn.get(key)
if prev is None or pref_score(b) > pref_score(prev):
by_ppn[key] = b
deduped = list(by_ppn.values())
if not deduped:
return None
# 3) Final pick (single best)
def sort_key(b: BookData):
year = b.year if b.year is not None else -1
ed = b.edition_number if b.edition_number is not None else -1
return (
1 if rvk_matches_swb(b) else 0,
1 if is_print(b) else 0,
1 if has_sig(b) else 0,
year,
ed,
)
best = max(deduped, key=sort_key)
return [best] if best else None
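# Minimal usage sketch (hypothetical records; assumes BookData fields are plain
# assignable attributes, as they are used throughout this module):
#   held = BookData()
#   held.year, held.edition_number, held.signature = 2018, 3, "ST 250 P99"
#   cand = BookData()
#   cand.year, cand.edition_number, cand.signature = 2021, 4, "ST 250 P99+2"
#   find_newer_edition(held, [cand])  # -> [cand]: same RVK root, strictly newer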
class NewEditionCheckerThread(QThread):
@@ -115,8 +169,8 @@ class NewEditionCheckerThread(QThread):
resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]]
# NEW: metrics signals
rateSignal = Signal(float)  # items per second ("it/s")
etaSignal = Signal(int)  # seconds remaining (-1 when unknown)
def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
super().__init__(parent)
@@ -157,54 +211,64 @@ class NewEditionCheckerThread(QThread):
def _process_book(
cls, book: "BookData"
) -> tuple["BookData", list["BookData"]] | None:
author = (
book.author.split(";")[0].replace(" ", "")
if (book.author and ";" in book.author)
else (book.author or "").replace(" ", "")
)
title = cls._clean_title(book.title or "")
# Query SWB
response: list[BookData] = SWB().getBooks(
[
"pica.bib=20735",
f"pica.tit={title.split(':')[0].strip()}",
# f"pica.per={author}",
]
)
# Remove same PPN
response = [entry for entry in response if entry.ppn != book.ppn]
for respo in response:
respo.link = "SWB"
# Query Lehmanns
with LehmannsClient() as client:
results = client.search_by_title(title, strict=True)
if results:
for res in results:
response.append(BookData().from_LehmannsSearchResult(res))
if not response:
"""Process one book; returns (original, [found editions]) or None on failure."""
if not book.title:
return None
response = filter_prefer_swb(response)
# Remove entries matching the same ISBN as the current book
response = [
entry
for entry in response
if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
]
response = [
entry
for entry in response
if book.publisher in entry.publisher
response: list["BookData"] = []
query = [
f"pica.tit={book.title}",
f"pica.vlg={book.publisher}",
]
if not response:
return None
swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
dnb_results = swb.getBooks(query)
new_editions = find_newer_edition(swb_result, dnb_results)
if new_editions is not None:
for new_edition in new_editions:
new_edition.library_location = cat.get_location(new_edition.ppn)
try:
isbn = (
str(new_edition.isbn[0])
if isinstance(new_edition.isbn, list)
else str(new_edition.isbn)
)
new_edition.link = (
f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
)
except (IndexError, TypeError):
isbn = None
new_edition.in_library = cat.in_library(new_edition.ppn)
response = new_editions
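# e.g. for a hypothetical new edition with ISBN 9783161484100 the generated
# link is: https://www.lehmanns.de/search/quick?mediatype_id=2&q=9783161484100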
# client = SWB()
# response: list["BookData"] = []
# # First, search by title only
# results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
# lehmanns = LehmannsClient()
# results = lehmanns.search_by_title(title)
# for result in results:
# if "(eBook)" in result.title:
# result.title = result.title.replace("(eBook)", "").strip()
# swb_results = client.getBooks(
# [
# f"pica.tit={result.title}",
# f"pica.vlg={result.publisher.split(',')[0]}",
# ]
# )
# for swb in swb_results:
# if swb.isbn == result.isbn:
# result.ppn = swb.ppn
# result.signature = swb.signature
# response.append(result)
# if (result.edition_number < swb.edition_number) and (
# swb.year > result.year
# ):
# response.append(result)
if not response:
return None
return (book, response)
@classmethod
@@ -240,7 +304,7 @@ class NewEditionCheckerThread(QThread):
return
# Up to 4 workers; ~20 items per worker
num_workers = min(4, max(1, ceil(total / 20)))
num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
chunks = self._split_evenly(self.entries, num_workers)
sizes = [len(ch) for ch in chunks]
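# Worked example (assuming an 8-core machine): THREAD_COUNT = max(8 - 2, 1) = 6;
# with total = 17 entries, ceil(17 / THREAD_MIN_ITEMS) = ceil(17 / 5) = 4, so
# num_workers = min(6, 4) = 4 and _split_evenly hands each worker 4-5 items.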