refactor: reorganize imports and enhance logging setup; improve book processing logic in NewEditionCheckerThread

2025-09-03 10:33:15 +02:00
parent 0e3199e289
commit b344d806e2
1 changed files with 177 additions and 63 deletions
--- a/src/backend/thread_neweditions.py
+++ b/src/backend/thread_neweditions.py
@@ -1,31 +1,29 @@
 import re
+import sys
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+from math import ceil
+from queue import Empty, Queue
 from typing import List, Optional, Set, Union

-from PySide6.QtCore import QThread
-from PySide6.QtCore import Signal as Signal
+import loguru
+from PySide6.QtCore import QThread, Signal

+from src import LOG_DIR
 from src.logic import BookData
 from src.logic.lehmannsapi import LehmannsClient
 from src.logic.swb import SWB

+log = loguru.logger  # module-level alias for loguru's logger object
+log.remove()  # NOTE(review): no handler id is passed, so per the loguru docs every handler added so far is removed — confirm this is intended for handlers set up by other modules
+log.add(sys.stdout, level="INFO")  # console sink, INFO and above
+log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")  # file sink rotated by size

-def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
-    """Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
-    if value is None:
-        return set()
-    vals = value if isinstance(value, list) else [value]
-    out: Set[str] = set()
-    for v in vals:
-        s = str(v)
-        digits = re.sub(r"[^0-9Xx]", "", s)
-        # keep 13-digit or 10-digit tokens
-        m13 = re.findall(r"97[89]\d{10}", digits)
-        if m13:
-            out.update(m13)
-        else:
-            m10 = re.findall(r"\d{9}[0-9Xx]", digits)
-            out.update(x.upper() for x in m10)
-    return out
+log.add(
+    f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",  # date is evaluated once, when this module-level statement runs
+    rotation="1 day",
+    retention="1 month",
+)


 def _norm_text(s: Optional[str]) -> str:
@@ -65,6 +63,25 @@ def _same_book(a: BookData, b: BookData) -> bool:
    return False


+def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
+    """Collect the ISBN-13 or ISBN-10 tokens found in a string or a list of strings."""
+    found: Set[str] = set()
+    if value is None:
+        return found
+    candidates = value if isinstance(value, list) else [value]
+    for raw in candidates:
+        # Keep only digits and X/x; hyphens, spaces and other text are dropped.
+        compact = re.sub(r"[^0-9Xx]", "", str(raw))
+        isbn13 = re.findall(r"97[89]\d{10}", compact)
+        if isbn13:
+            # ISBN-13 matches win; the ISBN-10 pattern is not tried for this value.
+            found.update(isbn13)
+            continue
+        # ISBN-10 fallback; a lower-case check character 'x' is upper-cased.
+        found.update(tok.upper() for tok in re.findall(r"\d{9}[0-9Xx]", compact))
+    return found
+
+
 def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
    """
    If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
@@ -94,56 +111,153 @@ def filter_prefer_swb(records: List[BookData]) -> List[BookData]:


 class NewEditionCheckerThread(QThread):
-    updateSignal = Signal(int, int)
-    updateProgress = Signal(int, int)
+    updateSignal = Signal(int, int)  # (processed, total)
+    updateProgress = Signal(int, int)  # (processed, total)
    total_entries_signal = Signal(int)
-    resultsSignal = Signal(list)
+    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]

-    def __init__(self, entries: list[BookData], parent=None):
+    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
         super().__init__(parent)
-        self.entries: list[BookData] = entries
-        self.total_entries_signal.emit(len(entries))
-        self.results: list[tuple[BookData, list[BookData]]] = []
+        self.entries: list["BookData"] = entries if entries is not None else []  # None becomes a new empty list
+        self.results: list[tuple["BookData", list["BookData"]]] = []  # (book, matches) tuples appended by run()

-    def run(self):
-        for book in self.entries:
-            self.updateSignal.emit(self.entries.index(book) + 1, len(self.entries))
-            author = (
-                book.author.split(";")[0].replace(" ", "")
-                if ";" in book.author
-                else book.author.replace(" ", "")
-            )
-            # title = book.title.split(":")[0].strip()
-            # remove trailing punctuation from title
-            title = book.title.rstrip(" .:,;!?")
-            # remove trailing text in parentheses
-            title = re.sub(r"\s*\(.*\)", "", title)
-            title = title.strip()
-            response: list[BookData] = []
-            response = SWB().getBooks(
-                [
-                    "pica.bib=20735",
-                    f"pica.tit={title.split(':')[0].strip()}",
-                    # f"pica.per={author}",
-                ]
-            )
+    def reset(self):
+        """Replace the stored entries and the collected results with new empty lists."""
+        self.entries, self.results = [], []

-            # in the response, remove the entry with the same ppn
-            response = [entry for entry in response if entry.ppn != book.ppn]
-            for respo in response:
-                respo.link = "SWB"
-            with LehmannsClient() as client:
-                results = client.search_by_title(title, strict=True)
-                # client.enrich_pages(results)
-                if not results:
-                    continue
+    # ---------- internal helpers ----------
+
+    @staticmethod
+    def _split_evenly(items: list, parts: int) -> list[list]:
+        """Cut `items` into at most `parts` consecutive chunks whose sizes differ by at most one."""
+        count = len(items)
+        if parts <= 1 or count <= 1:
+            # Nothing to split: the original list is returned as the only chunk.
+            return [items]
+        quotient, remainder = divmod(count, parts)
+        # The first `remainder` chunks get one extra item; zero-sized chunks are skipped below.
+        widths = [quotient + 1] * remainder + [quotient] * (parts - remainder)
+        pieces = []
+        start = 0
+        for width in widths:
+            if width:
+                pieces.append(items[start : start + width])
+                start += width
+        return pieces
+
+    @staticmethod
+    def _clean_title(raw: str) -> str:
+        """Return `raw` without parenthesised text and without trailing punctuation or surrounding whitespace."""
+        without_parens = re.sub(r"\s*\(.*\)", "", raw)  # removed first, so punctuation in front of it is stripped too
+        return without_parens.strip().rstrip(" .:,;!?")
+
+    @classmethod
+    def _process_book(
+        cls, book: "BookData"
+    ) -> tuple["BookData", list["BookData"]] | None:
+        """Search SWB and Lehmanns for other records matching the title of `book`.
+
+        Returns `(book, candidates)`, or None when no candidate is left after
+        the SWB-preference filter and the removal of records sharing an ISBN with `book`.
+        """
+        title = cls._clean_title(book.title or "")
+
+        # Query SWB
+        response: list[BookData] = SWB().getBooks(
+            [
+                "pica.bib=20735",
+                f"pica.tit={title.split(':')[0].strip()}",
+                # NOTE(review): the author filter (pica.per) is not part of the query.
+            ]
+        )
+
+        # Remove same PPN
+        response = [entry for entry in response if entry.ppn != book.ppn]
+        for respo in response:
+            respo.link = "SWB"
+
+        # Query Lehmanns
+        with LehmannsClient() as client:
+            results = client.search_by_title(title, strict=True)
+            if results:
                 for res in results:
                     response.append(BookData().from_LehmannsSearchResult(res))
-            if response == []:
-                continue
-            # check results if lehmanns has a result with the same isbn from the results of swb. if so, if we have a signature, remove, else keep
-            response = filter_prefer_swb(response)

-            result = (book, response)
+        if not response:
+            return None

-            self.results.append(result)
+        response = filter_prefer_swb(response)
+        # Drop candidates that share an ISBN with the checked book itself.
+        own_isbns = _norm_isbns(book.isbn)  # normalised once, reused for every candidate
+        response = [
+            entry
+            for entry in response
+            if not (_norm_isbns(entry.isbn) & own_isbns)
+        ]
+
+        if not response:
+            return None
+
+        return (book, response)
+
+    @classmethod
+    def _worker(cls, items: list["BookData"], q: Queue) -> None:
+        """Process one chunk; queue ('result', r) per hit, ('progress', 1) per book, ('done', None) last."""
+        try:
+            for book in items:
+                try:
+                    if (result := cls._process_book(book)) is not None:
+                        q.put(("result", result))
+                except Exception:  # best effort: one failing book must not end the chunk
+                    log.exception("Processing failed for book with PPN {}", getattr(book, "ppn", None))
+                # The book counts as processed whether it matched, missed or failed.
+                q.put(("progress", 1))
+        finally:
+            q.put(("done", None))
+
+    # ---------- thread entry point ----------
+
+    def run(self):
+        """Split the entries across worker threads, relay their progress, then emit the results."""
+        total = len(self.entries)
+        self.total_entries_signal.emit(total)
+        if not total:
+            log.debug("No entries to process.")
+            self.resultsSignal.emit([])
+            return
+
+        # One worker per started block of 20 entries, capped at 4 workers.
+        wanted_workers = min(4, max(1, ceil(total / 20)))
+        batches = self._split_evenly(self.entries, wanted_workers)
+        sizes = list(map(len, batches))
+        messages: Queue = Queue()
+        done_markers = 0
+        handled = 0
+
+        with ThreadPoolExecutor(max_workers=len(batches)) as pool:
+            futures = [pool.submit(self._worker, batch, messages) for batch in batches]
+            log.info(
+                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
+            )
+            for idx, sz in enumerate(sizes, start=1):
+                log.debug(f"Thread {idx}: {sz} entries")
+
+            # Drain the queue until every worker has posted its 'done' marker.
+            while done_markers < len(batches):
+                try:
+                    tag, data = messages.get(timeout=0.1)
+                except Empty:
+                    # Nothing arrived within the 0.1 s poll window; try again.
+                    continue
+                if tag == "done":
+                    done_markers += 1
+                elif tag == "result":
+                    self.results.append(data)
+                elif tag == "progress":
+                    handled += int(data)
+                    # Both signals receive the same (processed, total) pair.
+                    self.updateSignal.emit(handled, total)
+                    self.updateProgress.emit(handled, total)
+
+        # The executor block has been left before the final results are emitted.
+        self.resultsSignal.emit(self.results)