From 5f15352401655978386ae35f965f0b25416beb9d Mon Sep 17 00:00:00 2001
From: WorldTeacher
Date: Mon, 1 Sep 2025 14:31:23 +0200
Subject: [PATCH] feat: implement NewEditionCheckerThread and related utility functions for book data processing

---
 src/backend/__init__.py           |  26 ++++--
 src/backend/thread_neweditions.py | 172 ++++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 6 deletions(-)
 create mode 100644 src/backend/thread_neweditions.py

diff --git a/src/backend/__init__.py b/src/backend/__init__.py
index 8c548c9..7321e96 100644
--- a/src/backend/__init__.py
+++ b/src/backend/__init__.py
@@ -1,8 +1,22 @@
-from .semester import Semester
-from .database import Database
+__all__ = [
+    "AdminCommands",
+    "Semester",
+    "AutoAdder",
+    "AvailChecker",
+    "BookGrabber",
+    "Database",
+    "DocumentationThread",
+    "NewEditionCheckerThread",
+    "recreateElsaFile",
+    "recreateFile",
+]
+
 from .admin_console import AdminCommands
-from .thread_bookgrabber import BookGrabber
-from .threads_availchecker import AvailChecker
-from .threads_autoadder import AutoAdder
+from .create_file import recreateElsaFile, recreateFile
+from .database import Database
 from .documentation_thread import DocumentationThread
-from .create_file import recreateFile, recreateElsaFile
+from .semester import Semester
+from .thread_bookgrabber import BookGrabber
+from .thread_neweditions import NewEditionCheckerThread
+from .threads_autoadder import AutoAdder
+from .threads_availchecker import AvailChecker
diff --git a/src/backend/thread_neweditions.py b/src/backend/thread_neweditions.py
new file mode 100644
index 0000000..f3dfe77
--- /dev/null
+++ b/src/backend/thread_neweditions.py
@@ -0,0 +1,172 @@
+"""Helpers and worker thread for collecting candidate editions of books.
+
+Each book is searched by title in the SWB catalogue and in the Lehmanns shop;
+the helpers normalise ISBNs and text so that records describing the same book
+can be recognised across both sources.
+"""
+
+import re
+from typing import List, Optional, Set, Union
+
+from PySide6.QtCore import QThread
+from PySide6.QtCore import Signal as Signal
+
+from src.logic import BookData
+from src.logic.lehmannsapi import LehmannsClient
+from src.logic.swb import SWB
+
+
+def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
+    """Return the ISBNs found in *value* as a set of bare strings.
+
+    Every entry is reduced to digits plus ``X``/``x`` before matching, so
+    hyphens and spaces are irrelevant. When an entry contains an ISBN-13
+    (``978``/``979`` prefix) only its ISBN-13 matches are kept; otherwise its
+    10-character matches are kept, upper-cased.
+
+    Args:
+        value: One ISBN string, a collection of them, or ``None``.
+
+    Returns:
+        The normalised ISBNs; empty when nothing matched.
+    """
+    if value is None:
+        return set()
+    # Accept any common collection, not only ``list``.
+    vals = value if isinstance(value, (list, tuple, set, frozenset)) else [value]
+    out: Set[str] = set()
+    for v in vals:
+        digits = re.sub(r"[^0-9Xx]", "", str(v))
+        m13 = re.findall(r"97[89]\d{10}", digits)
+        if m13:
+            out.update(m13)
+        else:
+            out.update(x.upper() for x in re.findall(r"\d{9}[0-9Xx]", digits))
+    return out
+
+
+def _norm_text(s: Optional[str]) -> str:
+    """Return *s* lower-cased, whitespace-collapsed and without punctuation."""
+    if not s:
+        return ""
+    s = s.lower()
+    # Runs of whitespace, hyphens and en/em dashes become a single space.
+    s = re.sub(r"[\s\-\u2013\u2014]+", " ", s)
+    s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
+    return s.strip()
+
+
+def _same_book(a: BookData, b: BookData) -> bool:
+    """Heuristically decide whether *a* and *b* describe the same book.
+
+    Records match when their ISBN sets intersect. Otherwise the normalised
+    titles must be equal and non-empty, and author and year must each agree
+    whenever both records provide them.
+    """
+    if _norm_isbns(a.isbn) & _norm_isbns(b.isbn):
+        return True
+
+    ta, tb = _norm_text(a.title), _norm_text(b.title)
+    if not ta or ta != tb:
+        return False
+
+    # Two known but different authors rule out a match; an author missing on
+    # either side is not held against the pair.
+    aa, ab = _norm_text(a.author), _norm_text(b.author)
+    if aa and ab and aa != ab:
+        return False
+
+    # ``str()`` keeps the comparison working if a year is stored as a number.
+    ya, yb = str(a.year or "").strip(), str(b.year or "").strip()
+    if ya and yb:
+        return ya == yb
+    return True
+
+
+def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
+    """Drop HTTP(S) records duplicated by an SWB record that has a signature.
+
+    A record is an SWB record when its ``link`` equals ``"SWB"`` and a URL
+    record when its ``link`` starts with ``http`` (case-insensitive). The
+    input list is not mutated.
+
+    Args:
+        records: Mixed SWB and URL records.
+
+    Returns:
+        A new list without the URL records that ``_same_book`` matches to an
+        SWB record with a non-blank signature.
+    """
+    swb_with_sig = [
+        r for r in records if r.link == "SWB" and r.signature and r.signature.strip()
+    ]
+    if not swb_with_sig:
+        return list(records)
+
+    def is_duplicate(rec: BookData) -> bool:
+        if not rec.link or not rec.link.lower().startswith("http"):
+            return False
+        return any(_same_book(swb, rec) for swb in swb_with_sig)
+
+    return [rec for rec in records if not is_duplicate(rec)]
+
+
+class NewEditionCheckerThread(QThread):
+    """Collect SWB and Lehmanns records found by title for each entry.
+
+    Signals:
+        updateSignal(int, int): 1-based position of the current book and the
+            total number of books.
+        updateProgress(int, int): Declared but not emitted by this class.
+        total_entries_signal(int): Number of books, emitted when ``run`` starts.
+        resultsSignal(list): ``self.results``, emitted when ``run`` finishes.
+    """
+
+    updateSignal = Signal(int, int)
+    updateProgress = Signal(int, int)
+    total_entries_signal = Signal(int)
+    resultsSignal = Signal(list)
+
+    def __init__(self, entries: list[BookData], parent=None):
+        """Store *entries*; all lookups happen in ``run``."""
+        super().__init__(parent)
+        self.entries: list[BookData] = entries
+        # (book, candidate records) pairs, filled by ``run``.
+        self.results: list[tuple[BookData, list[BookData]]] = []
+
+    def run(self):
+        """Search SWB and Lehmanns for every entry and collect the records."""
+        total = len(self.entries)
+        # Emitted here rather than in ``__init__``: no slot can be connected
+        # before the constructor has returned, so that emission was lost.
+        self.total_entries_signal.emit(total)
+        # ``enumerate`` replaces ``list.index``, which is O(n) per book and
+        # reports the first position for entries that compare equal.
+        for position, book in enumerate(self.entries, start=1):
+            self.updateSignal.emit(position, total)
+            # Strip trailing punctuation, then drop parenthesised text.
+            title = (book.title or "").rstrip(" .:,;!?")
+            title = re.sub(r"\s*\(.*\)", "", title).strip()
+            if not title:
+                # Missing or punctuation-only title: nothing to search for.
+                continue
+            # Only the part before the first colon is sent to SWB.
+            response: list[BookData] = SWB().getBooks(
+                ["pica.bib=20735", f"pica.tit={title.split(':')[0].strip()}"]
+            )
+            # The book itself is no candidate; the remaining catalogue hits
+            # are tagged so ``filter_prefer_swb`` can recognise them.
+            response = [entry for entry in response if entry.ppn != book.ppn]
+            for entry in response:
+                entry.link = "SWB"
+            with LehmannsClient() as client:
+                results = client.search_by_title(title, strict=True)
+                if not results:
+                    # NOTE(review): books without Lehmanns hits are skipped
+                    # even when SWB returned records -- confirm intent.
+                    continue
+                response.extend(
+                    BookData().from_LehmannsSearchResult(res) for res in results
+                )
+            self.results.append((book, filter_prefer_swb(response)))
+        self.resultsSignal.emit(self.results)