feat: implement NewEditionCheckerThread and related utility functions for book data processing

2025-09-01 14:31:23 +02:00
parent 7da2b3f65d
commit 5f15352401
2 changed files with 169 additions and 6 deletions
--- a/src/backend/thread_neweditions.py
+++ b/src/backend/thread_neweditions.py
@@ -0,0 +1,149 @@
+import re
+from typing import List, Optional, Set, Union
+
+from PySide6.QtCore import QThread
+from PySide6.QtCore import Signal as Signal
+
+from src.logic import BookData
+from src.logic.lehmannsapi import LehmannsClient
+from src.logic.swb import SWB
+
+
+# Separators that may appear between the digit groups of a printed ISBN.
+# ISBN-13: "978"/"979" prefix plus ten more digits, not embedded in a longer number.
+_ISBN13_RE = re.compile(
+    r"(?<![0-9])97[89](?:[\s\-\u2013\u2014]?[0-9]){10}(?![0-9])"
+)
+# ISBN-10: nine digits plus a check character (digit or X). The trailing
+# lookahead stops an "x" that merely starts a following word from being
+# taken as the check character.
+_ISBN10_RE = re.compile(
+    r"(?<![0-9])(?:[0-9][\s\-\u2013\u2014]?){9}[0-9Xx](?![0-9A-Za-z])"
+)
+_NON_ISBN_CHARS_RE = re.compile(r"[^0-9Xx]")
+
+
+def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
+    """Return the set of normalized ISBNs found in *value*.
+
+    ``value`` may be ``None``, a single value, or a list/tuple/set of values;
+    every item is converted with ``str()`` before scanning.
+
+    ISBN-13 results contain digits only; ISBN-10 results keep an upper-cased
+    ``X`` check character. ISBNs are matched inside the raw text (hyphens,
+    dashes, or single whitespace characters are allowed between digits) so
+    that neighbouring numbers are never fused into a spurious ISBN, and an
+    ISBN-10 that shares a string with an ISBN-13 is still reported.
+    """
+    if value is None:
+        return set()
+    if isinstance(value, (list, tuple, set, frozenset)):
+        candidates = value
+    else:
+        candidates = [value]
+
+    found: Set[str] = set()
+    for candidate in candidates:
+        text = str(candidate)
+        for match in _ISBN13_RE.finditer(text):
+            found.add(_NON_ISBN_CHARS_RE.sub("", match.group()))
+        # Blank out the ISBN-13 spans first: the tail of a hyphenated ISBN-13
+        # (e.g. "3-16-148410-0") would otherwise look like an ISBN-10.
+        remainder = _ISBN13_RE.sub("|", text)
+        for match in _ISBN10_RE.finditer(remainder):
+            found.add(_NON_ISBN_CHARS_RE.sub("", match.group()).upper())
+    return found
+
+
+# Runs of whitespace, hyphens, and en/em dashes collapse to a single space.
+_SEPARATOR_RUN_RE = re.compile(r"[\s\-\u2013\u2014]+")
+# Punctuation that is dropped entirely before comparing texts.
+_DROPPED_PUNCTUATION_RE = re.compile(r"[\"'`:.,;!?()\[\]{}]")
+
+
+def _norm_text(s: Optional[str]) -> str:
+    """Return *s* reduced to a comparison key.
+
+    The text is lower-cased, whitespace/dash runs become one space, a fixed
+    set of punctuation characters is removed, and the result is stripped.
+    ``None`` and the empty string both yield ``""``.
+    """
+    if not s:
+        return ""
+    spaced = _SEPARATOR_RUN_RE.sub(" ", s.lower())
+    return _DROPPED_PUNCTUATION_RE.sub("", spaced).strip()
+
+
+def _same_book(a: BookData, b: BookData) -> bool:
+    """Heuristically decide whether *a* and *b* describe the same book.
+
+    Two records match when their normalized ISBN sets intersect. Otherwise
+    the normalized titles must be non-empty and equal, and then:
+
+    * if both records carry an author, the authors must agree;
+    * if both records carry a year, the years must agree.
+
+    A field that is missing on either side is not held against the match.
+    """
+    if _norm_isbns(a.isbn) & _norm_isbns(b.isbn):
+        return True
+
+    title_a, title_b = _norm_text(a.title), _norm_text(b.title)
+    if not title_a or title_a != title_b:
+        return False
+
+    author_a, author_b = _norm_text(a.author), _norm_text(b.author)
+    # Conflicting authors rule the match out. Tokens are compared regardless
+    # of order; presumably sources differ between "Last, First" and
+    # "First Last" -- NOTE(review): confirm against real SWB/Lehmanns data.
+    if author_a and author_b:
+        if sorted(author_a.split()) != sorted(author_b.split()):
+            return False
+
+    # str() guards against a year that is stored as a number.
+    year_a = str(a.year or "").strip()
+    year_b = str(b.year or "").strip()
+    if year_a and year_b:
+        return year_a == year_b
+    return True
+
+
+def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
+    """
+    Drop HTTP(S) records that duplicate an SWB record carrying a signature.
+
+    A record counts as an SWB record when its ``link`` equals ``"SWB"`` and
+    its ``signature`` is non-blank. Any record whose ``link`` starts with
+    ``http`` (case-insensitive) and that ``_same_book`` matches against one
+    of those SWB records is left out. The input list is never mutated; a new
+    list in the original order is returned.
+    """
+    signed_swb = [
+        candidate
+        for candidate in records
+        if candidate.link == "SWB"
+        and candidate.signature
+        and candidate.signature.strip()
+    ]
+    if not signed_swb:
+        return list(records)
+
+    def _is_shadowed(record: BookData) -> bool:
+        # Only URL-backed records can be displaced by an SWB entry.
+        link = record.link
+        if not link or not link.lower().startswith("http"):
+            return False
+        return any(_same_book(swb, record) for swb in signed_swb)
+
+    return [record for record in records if not _is_shadowed(record)]
+
+
+class NewEditionCheckerThread(QThread):
+    """Worker thread that looks up other editions for a list of books.
+
+    For every entry the SWB catalogue and the Lehmanns search are queried by
+    title. The combined hits (minus the entry itself and minus Lehmanns
+    duplicates of signed SWB records) are collected in ``self.results`` as
+    ``(book, candidates)`` tuples.
+    """
+
+    # (1-based position of the entry being processed, total entries)
+    updateSignal = Signal(int, int)
+    # Declared for consumers; not emitted by this class.
+    updateProgress = Signal(int, int)
+    # Total number of entries, emitted once when run() starts.
+    total_entries_signal = Signal(int)
+    # NOTE(review): declared but never emitted here; results are only exposed
+    # through ``self.results``. Confirm whether run() should emit it.
+    resultsSignal = Signal(list)
+
+    def __init__(self, entries: list[BookData], parent=None):
+        """Store *entries*; no lookups happen until the thread is started."""
+        super().__init__(parent)
+        self.entries: list[BookData] = entries
+        # The total is announced from run(): a signal emitted inside the
+        # constructor can never reach a slot, because nothing can be
+        # connected before the object exists.
+        self.results: list[tuple[BookData, list[BookData]]] = []
+
+    @staticmethod
+    def _search_title(raw_title: Optional[str]) -> str:
+        """Return the title cleaned up for searching ("" if there is none)."""
+        # Remove trailing punctuation, then any parenthesised text.
+        title = (raw_title or "").rstrip(" .:,;!?")
+        title = re.sub(r"\s*\(.*\)", "", title)
+        return title.strip()
+
+    def run(self):
+        """Query SWB and Lehmanns for every entry and fill ``self.results``."""
+        total = len(self.entries)
+        self.total_entries_signal.emit(total)
+
+        # enumerate() gives the true position even when entries compare equal,
+        # and avoids a linear list.index() scan per entry.
+        for position, book in enumerate(self.entries, start=1):
+            self.updateSignal.emit(position, total)
+
+            title = self._search_title(book.title)
+            if not title:
+                # Nothing to search for.
+                continue
+
+            response: list[BookData] = SWB().getBooks(
+                [
+                    "pica.bib=20735",
+                    # Only the main title (before any ":") is sent to SWB.
+                    f"pica.tit={title.split(':')[0].strip()}",
+                ]
+            )
+
+            # Drop the record of the book itself (same PPN) and tag the rest
+            # as SWB hits.
+            response = [entry for entry in response if entry.ppn != book.ppn]
+            for entry in response:
+                entry.link = "SWB"
+
+            # An empty Lehmanns result must not discard the SWB hits gathered
+            # above, so there is no early `continue` here.
+            with LehmannsClient() as client:
+                hits = client.search_by_title(title, strict=True)
+                for hit in hits or []:
+                    response.append(BookData().from_LehmannsSearchResult(hit))
+
+            if not response:
+                continue
+
+            # Prefer signed SWB records over matching Lehmanns (HTTP) records.
+            self.results.append((book, filter_prefer_swb(response)))