refactor: reorganize imports and enhance logging setup; improve book processing logic in NewEditionCheckerThread

2025-09-03 10:33:15 +02:00
parent 0e3199e289
commit b344d806e2
1 changed files with 177 additions and 63 deletions
--- a/src/backend/thread_neweditions.py
+++ b/src/backend/thread_neweditions.py
@@ -1,31 +1,29 @@
 import re
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from math import ceil
 from queue import Empty, Queue
 from typing import List, Optional, Set, Union
-from PySide6.QtCore import QThread
+import loguru
-from PySide6.QtCore import Signal as Signal
+from PySide6.QtCore import QThread, Signal
 from src import LOG_DIR
 from src.logic import BookData
 from src.logic.lehmannsapi import LehmannsClient
 from src.logic.swb import SWB
 # Module-level logging setup; `log` is loguru's logger object.
 # NOTE(review): loguru.logger is presumably shared by every module that
 # imports it, so the configuration below likely applies process-wide — confirm.
 log = loguru.logger
 # Called without a handler id; per loguru's API this removes all handlers
 # registered so far, leaving only the sinks added below.
 log.remove()
 # Console sink: INFO and above.
 log.add(sys.stdout, level="INFO")
 # File sink inside LOG_DIR: rotated at 1 MB, rotated files kept for 10 days.
 # No explicit level is passed for this sink.
 log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
-def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
+log.add(
-    """Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
+    f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
-    if value is None:
+    rotation="1 day",
-        return set()
+    retention="1 month",
-    vals = value if isinstance(value, list) else [value]
+)
    out: Set[str] = set()
    for v in vals:
        s = str(v)
        digits = re.sub(r"[^0-9Xx]", "", s)
        # keep 13-digit or 10-digit tokens
        m13 = re.findall(r"97[89]\d{10}", digits)
        if m13:
            out.update(m13)
        else:
            m10 = re.findall(r"\d{9}[0-9Xx]", digits)
            out.update(x.upper() for x in m10)
    return out
 def _norm_text(s: Optional[str]) -> str:
@@ -65,6 +63,25 @@ def _same_book(a: BookData, b: BookData) -> bool:
    return False
 def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
    """Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
    if value is None:
        return set()
    vals = value if isinstance(value, list) else [value]
    out: Set[str] = set()
    for v in vals:
        s = str(v)
        digits = re.sub(r"[^0-9Xx]", "", s)
        # keep 13-digit or 10-digit tokens
        m13 = re.findall(r"97[89]\d{10}", digits)
        if m13:
            out.update(m13)
        else:
            m10 = re.findall(r"\d{9}[0-9Xx]", digits)
            out.update(x.upper() for x in m10)
    return out
 def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
    """
    If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
@@ -94,56 +111,153 @@ def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
 class NewEditionCheckerThread(QThread):
    """Background thread that searches for other editions of a list of books.

    For every entry, SWB and Lehmanns are queried by title. Hits that carry
    the entry's own PPN or share an ISBN with it are discarded. The entries
    are spread over a small pool of worker threads; progress and results are
    reported through the Qt signals declared below.
    """

    updateSignal = Signal(int, int)  # (processed, total)
    updateProgress = Signal(int, int)  # (processed, total)
    total_entries_signal = Signal(int)  # number of entries, emitted when run() starts
    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]

    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
        """Store the entries to check; ``None`` is treated as an empty list."""
        super().__init__(parent)
        self.entries: list["BookData"] = entries if entries is not None else []
        self.results: list[tuple["BookData", list["BookData"]]] = []

    def reset(self):
        """Drop the stored entries and any accumulated results."""
        self.entries = []
        self.results = []

    # ---------- internal helpers ----------

    @staticmethod
    def _split_evenly(items: list, parts: int) -> list[list]:
        """Split ``items`` into at most ``parts`` contiguous, non-empty chunks.

        Chunk sizes differ by at most one, with the larger chunks first.
        When ``parts <= 1`` or ``items`` has at most one element, the result
        is ``[items]`` (the very same list object, even if it is empty).
        """
        if parts <= 1 or len(items) <= 1:
            return [items]
        base, extra = divmod(len(items), parts)
        chunks = []
        start = 0
        for k in range(parts):
            # The first `extra` chunks take one additional item.
            size = base + (1 if k < extra else 0)
            if size == 0:
                # More parts than items: skip the chunks that would be empty.
                continue
            chunks.append(items[start : start + size])
            start += size
        return chunks

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Strip trailing punctuation and parenthesized text from a title.

        The parenthesis pattern is greedy: everything from the first ``(``
        to the last ``)`` is removed.
        """
        title = raw.rstrip(" .:,;!?")
        title = re.sub(r"\s*\(.*\)", "", title)
        return title.strip()

    @classmethod
    def _process_book(
        cls, book: "BookData"
    ) -> Optional[tuple["BookData", list["BookData"]]]:
        """Look up candidate editions for one book.

        Returns ``(book, candidates)``, or ``None`` when nothing remains
        after filtering. Exceptions raised by the SWB or Lehmanns clients
        are not handled here and propagate to the caller.
        """
        title = cls._clean_title(book.title or "")

        # Query SWB with the part of the title before the first colon.
        # The author is not part of the query.
        response: list[BookData] = SWB().getBooks(
            [
                "pica.bib=20735",
                f"pica.tit={title.split(':')[0].strip()}",
            ]
        )
        # Remove the record with the same PPN as the book itself.
        response = [entry for entry in response if entry.ppn != book.ppn]
        for respo in response:
            respo.link = "SWB"

        # Query Lehmanns with the full cleaned title.
        with LehmannsClient() as client:
            results = client.search_by_title(title, strict=True)
            if results:
                for res in results:
                    response.append(BookData().from_LehmannsSearchResult(res))

        if not response:
            return None

        response = filter_prefer_swb(response)

        # Remove entries matching the same ISBN as the current book; the
        # book's own ISBN set is normalized once, outside the comprehension.
        own_isbns = _norm_isbns(book.isbn)
        response = [
            entry for entry in response if not (_norm_isbns(entry.isbn) & own_isbns)
        ]
        if not response:
            return None
        return (book, response)

    @classmethod
    def _worker(cls, items: list["BookData"], q: Queue) -> None:
        """Process one chunk and report through ``q``.

        Pushes ``("result", (book, candidates))`` for every book that
        produced candidates, ``("progress", 1)`` after every book, and a
        final ``("done", None)`` even if the loop is aborted. A book whose
        lookup raises is logged and skipped so the remaining books are
        still processed.
        """
        try:
            for book in items:
                try:
                    result = cls._process_book(book)
                except Exception:
                    # Best effort: keep going, but leave a trace in the log
                    # instead of discarding the error silently.
                    log.exception("Failed to process book; skipping entry.")
                    result = None
                if result is not None:
                    q.put(("result", result))
                q.put(("progress", 1))
        finally:
            q.put(("done", None))

    # ---------- thread entry point ----------

    def run(self):
        """Process all entries and emit progress and result signals.

        Emits ``total_entries_signal`` first. With no entries,
        ``resultsSignal`` is emitted with an empty list and the method
        returns. Otherwise the entries are split into chunks, each chunk is
        handled by ``_worker`` on a pool thread, and the queue messages are
        relayed as ``updateSignal``/``updateProgress``. ``resultsSignal`` is
        emitted with ``self.results`` at the end; results are appended to
        that list and are not cleared here (see ``reset``).
        """
        total = len(self.entries)
        self.total_entries_signal.emit(total)

        if total == 0:
            log.debug("No entries to process.")
            self.resultsSignal.emit([])
            return

        # Up to 4 workers; ~20 items per worker
        num_workers = min(4, max(1, ceil(total / 20)))
        chunks = self._split_evenly(self.entries, num_workers)
        sizes = [len(ch) for ch in chunks]

        q: Queue = Queue()
        processed = 0
        finished_workers = 0

        with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
            futures = [ex.submit(self._worker, ch, q) for ch in chunks]

            log.info(
                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
            )
            for idx, sz in enumerate(sizes, 1):
                log.debug(f"Thread {idx}: {sz} entries")

            # Aggregate progress/results until every worker has sent "done".
            while finished_workers < len(chunks):
                try:
                    kind, payload = q.get(timeout=0.1)
                except Empty:
                    continue

                if kind == "progress":
                    processed += int(payload)
                    self.updateSignal.emit(processed, total)
                    self.updateProgress.emit(processed, total)
                elif kind == "result":
                    self.results.append(payload)
                elif kind == "done":
                    finished_workers += 1

        self.resultsSignal.emit(self.results)