diff --git a/src/backend/thread_neweditions.py b/src/backend/thread_neweditions.py
index f3dfe77..45e662f 100644
--- a/src/backend/thread_neweditions.py
+++ b/src/backend/thread_neweditions.py
@@ -1,31 +1,29 @@
 import re
+import sys
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+from math import ceil
+from queue import Empty, Queue
 from typing import List, Optional, Set, Union
 
-from PySide6.QtCore import QThread
-from PySide6.QtCore import Signal as Signal
+import loguru
+from PySide6.QtCore import QThread, Signal
 
+from src import LOG_DIR
 from src.logic import BookData
 from src.logic.lehmannsapi import LehmannsClient
 from src.logic.swb import SWB
 
+log = loguru.logger
+log.remove()
+log.add(sys.stdout, level="INFO")
+log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
 
-def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
-    """Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
-    if value is None:
-        return set()
-    vals = value if isinstance(value, list) else [value]
-    out: Set[str] = set()
-    for v in vals:
-        s = str(v)
-        digits = re.sub(r"[^0-9Xx]", "", s)
-        # keep 13-digit or 10-digit tokens
-        m13 = re.findall(r"97[89]\d{10}", digits)
-        if m13:
-            out.update(m13)
-        else:
-            m10 = re.findall(r"\d{9}[0-9Xx]", digits)
-            out.update(x.upper() for x in m10)
-    return out
+log.add(
+    f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
+    rotation="1 day",
+    retention="1 month",
+)
 
 
 def _norm_text(s: Optional[str]) -> str:
@@ -65,6 +63,25 @@ def _same_book(a: BookData, b: BookData) -> bool:
     return False
 
 
+def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
+    """Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
+    if value is None:
+        return set()
+    vals = value if isinstance(value, list) else [value]
+    out: Set[str] = set()
+    for v in vals:
+        s = str(v)
+        digits = re.sub(r"[^0-9Xx]", "", s)
+        # keep 13-digit or 10-digit tokens
+        m13 = re.findall(r"97[89]\d{10}", digits)
+        if m13:
+            out.update(m13)
+        else:
+            m10 = re.findall(r"\d{9}[0-9Xx]", digits)
+            out.update(x.upper() for x in m10)
+    return out
+
+
 def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
     """
     If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
@@ -94,56 +111,153 @@ def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
 
 
 class NewEditionCheckerThread(QThread):
-    updateSignal = Signal(int, int)
-    updateProgress = Signal(int, int)
+    updateSignal = Signal(int, int)  # (processed, total)
+    updateProgress = Signal(int, int)  # (processed, total)
     total_entries_signal = Signal(int)
-    resultsSignal = Signal(list)
+    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]
 
-    def __init__(self, entries: list[BookData], parent=None):
+    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
         super().__init__(parent)
-        self.entries: list[BookData] = entries
-        self.total_entries_signal.emit(len(entries))
-        self.results: list[tuple[BookData, list[BookData]]] = []
+        self.entries: list["BookData"] = entries if entries is not None else []
+        self.results: list[tuple["BookData", list["BookData"]]] = []
 
-    def run(self):
-        for book in self.entries:
-            self.updateSignal.emit(self.entries.index(book) + 1, len(self.entries))
-            author = (
-                book.author.split(";")[0].replace(" ", "")
-                if ";" in book.author
-                else book.author.replace(" ", "")
-            )
-            # title = book.title.split(":")[0].strip()
-            # remove trailing punctuation from title
-            title = book.title.rstrip(" .:,;!?")
-            # remove trailing text in parentheses
-            title = re.sub(r"\s*\(.*\)", "", title)
-            title = title.strip()
-            response: list[BookData] = []
-            response = SWB().getBooks(
-                [
-                    "pica.bib=20735",
-                    f"pica.tit={title.split(':')[0].strip()}",
-                    # f"pica.per={author}",
-                ]
-            )
+    def reset(self):
+        self.entries = []
+        self.results = []
 
-            # in the response, remove the entry with the same ppn
-            response = [entry for entry in response if entry.ppn != book.ppn]
-            for respo in response:
-                respo.link = "SWB"
-            with LehmannsClient() as client:
-                results = client.search_by_title(title, strict=True)
-                # client.enrich_pages(results)
-                if not results:
-                    continue
+    # ---------- internal helpers ----------
+
+    @staticmethod
+    def _split_evenly(items: list, parts: int) -> list[list]:
+        """Split items as evenly as possible into `parts` chunks (no empty tails)."""
+        if parts <= 1 or len(items) <= 1:
+            return [items]
+        n = len(items)
+        base = n // parts
+        extra = n % parts
+        chunks = []
+        i = 0
+        for k in range(parts):
+            size = base + (1 if k < extra else 0)
+            if size == 0:
+                continue
+            chunks.append(items[i : i + size])
+            i += size
+        return chunks
+
+    @staticmethod
+    def _clean_title(raw: str) -> str:
+        title = raw.rstrip(" .:,;!?")
+        title = re.sub(r"\s*\(.*\)", "", title)
+        return title.strip()
+
+    @classmethod
+    def _process_book(
+        cls, book: "BookData"
+    ) -> tuple["BookData", list["BookData"]] | None:
+        author = (
+            book.author.split(";")[0].replace(" ", "")
+            if (book.author and ";" in book.author)
+            else (book.author or "").replace(" ", "")
+        )
+        title = cls._clean_title(book.title or "")
+
+        # Query SWB
+        response: list[BookData] = SWB().getBooks(
+            [
+                "pica.bib=20735",
+                f"pica.tit={title.split(':')[0].strip()}",
+                # f"pica.per={author}",
+            ]
+        )
+
+        # Remove same PPN
+        response = [entry for entry in response if entry.ppn != book.ppn]
+        for respo in response:
+            respo.link = "SWB"
+
+        # Query Lehmanns
+        with LehmannsClient() as client:
+            results = client.search_by_title(title, strict=True)
+            if results:
                 for res in results:
                     response.append(BookData().from_LehmannsSearchResult(res))
 
-            if response == []:
-                continue
-            # check results if lehmanns has a result with the same isbn from the results of swb. if so, if we have a signature, remove, else keep
-            response = filter_prefer_swb(response)
-            result = (book, response)
+        if not response:
+            return None
 
-            self.results.append(result)
+        response = filter_prefer_swb(response)
+
+        # Remove entries matching the same ISBN as the current book
+        response = [
+            entry
+            for entry in response
+            if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
+        ]
+
+        if not response:
+            return None
+
+        return (book, response)
+
+    @classmethod
+    def _worker(cls, items: list["BookData"], q: Queue) -> None:
+        """Worker for one chunk; pushes ('result', ...), ('progress', 1), and ('done', None)."""
+        try:
+            for book in items:
+                try:
+                    result = cls._process_book(book)
+                except Exception:
+                    result = None
+                if result is not None:
+                    q.put(("result", result))
+                q.put(("progress", 1))
+        finally:
+            q.put(("done", None))
+
+    # ---------- thread entry point ----------
+
+    def run(self):
+        total = len(self.entries)
+        self.total_entries_signal.emit(total)
+
+        if total == 0:
+            log.debug("No entries to process.")
+            self.resultsSignal.emit([])
+            return
+
+        # Up to 4 workers; ~20 items per worker
+        num_workers = min(4, max(1, ceil(total / 20)))
+        chunks = self._split_evenly(self.entries, num_workers)
+        sizes = [len(ch) for ch in chunks]
+
+        q: Queue = Queue()
+        processed = 0
+        finished_workers = 0
+
+        with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
+            futures = [ex.submit(self._worker, ch, q) for ch in chunks]
+
+            log.info(
+                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
+            )
+            for idx, sz in enumerate(sizes, 1):
+                log.debug(f"Thread {idx}: {sz} entries")
+
+            # Aggregate progress/results
+            while finished_workers < len(chunks):
+                try:
+                    kind, payload = q.get(timeout=0.1)
+                except Empty:
+                    continue
+
+                if kind == "progress":
+                    processed += int(payload)
+                    self.updateSignal.emit(processed, total)
+                    self.updateProgress.emit(processed, total)
+                elif kind == "result":
+                    self.results.append(payload)
+                elif kind == "done":
+                    finished_workers += 1
+
+        self.resultsSignal.emit(self.results)
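
Usage sketch (not part of the patch): one way a caller might drive the reworked NewEditionCheckerThread and its signals, assuming the import path from the diff header above. The bare QProgressBar, the main() wrapper, and the empty entries list are placeholders for whatever the real UI supplies.

import sys

from PySide6.QtWidgets import QApplication, QProgressBar

from src.backend.thread_neweditions import NewEditionCheckerThread
from src.logic import BookData


def main() -> None:
    app = QApplication(sys.argv)

    bar = QProgressBar()
    bar.show()

    # Placeholder: in the real application the entries would come from the catalogue/search view.
    entries: list[BookData] = []

    checker = NewEditionCheckerThread(entries)

    # total_entries_signal fires once at the start of run();
    # updateProgress reports per-book progress aggregated across the worker chunks.
    checker.total_entries_signal.connect(lambda total: bar.setRange(0, total))
    checker.updateProgress.connect(lambda done, _total: bar.setValue(done))

    # resultsSignal delivers list[tuple[BookData, list[BookData]]] after all
    # worker chunks have reported "done".
    checker.resultsSignal.connect(
        lambda results: print(f"{len(results)} books with candidate new editions")
    )
    checker.finished.connect(app.quit)

    checker.start()
    sys.exit(app.exec())


if __name__ == "__main__":
    main()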