"""Background check for newer editions of catalogued books.

Provides `find_newer_edition`, which filters and ranks candidate records
against a reference record, and `NewEditionCheckerThread`, a `QThread` that
runs the lookup for many books on a small thread pool and reports progress,
throughput, ETA, and results through Qt signals.
"""

import os
import re
from concurrent.futures import ThreadPoolExecutor
from math import ceil
from queue import Empty, Queue
from time import monotonic
from typing import List, Optional

from PySide6.QtCore import QThread, Signal

# from src.services.webrequest import BibTextTransformer, WebRequest
from src.services.catalogue import Catalogue
from src.core.models import BookData
from src.services.sru import SWB
from src.shared.logging import log

# Use all available cores minus 2, but at least 1.
# os.cpu_count() may return None when the count cannot be determined.
THREAD_COUNT = max((os.cpu_count() or 1) - 2, 1)
# Target number of entries per worker when deciding how many workers to start.
THREAD_MIN_ITEMS = 5

# Logger configured centrally in main; use shared `log`.

swb = SWB()
dnb = SWB()  # NOTE(review): not referenced in this module's visible code — confirm external use
cat = Catalogue()

# Characters allowed in an RVK notation (conservative set), without brackets so
# the set can be embedded in both positive and negated character classes.
_RVK_CHARS = r"A-Z0-9.\-\/"
RVK_ALLOWED = rf"[{_RVK_CHARS}]"  # conservative RVK character set

_COPY_MARKER_RE = re.compile(r"\(\s*\d+\s*\)")  # '(N)'
_TRAILING_PLUS_RE = re.compile(r"\s*\+\s*\d+\s*$")  # trailing '+N'
_WHITESPACE_RE = re.compile(r"\s+")
_RVK_ROOT_RE = re.compile(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)")
_NON_RVK_RE = re.compile(rf"[^{_RVK_CHARS} ]+")


def _strip_copy_and_edition(s: str) -> str:
    """Remove every '(N)' group and a trailing '+N' suffix from a signature."""
    s = _COPY_MARKER_RE.sub("", s)
    return _TRAILING_PLUS_RE.sub("", s)


def _extract_rvk_root(sig: Optional[str]) -> str:
    """Return the normalized RVK root of a signature, or "" if there is none.

    The signature is upper-cased, stripped of '(N)' / trailing '+N' markers
    and whitespace-normalized. If it starts with 1-3 letters, those letters
    plus the directly following run of RVK characters form the root.
    Otherwise all non-RVK characters are dropped and the first
    space-separated token of the remainder is returned.
    """
    if not sig:
        return ""
    text = _strip_copy_and_edition(sig.upper())
    text = _WHITESPACE_RE.sub(" ", text).strip()
    match = _RVK_ROOT_RE.match(text)
    if match:
        return _WHITESPACE_RE.sub(" ", match.group(1)).strip()
    # Fallback: the negated class is built from the bare character set.
    # Interpolating the bracketed RVK_ALLOWED here would produce a malformed
    # nested class that never strips anything.
    cleaned = _NON_RVK_RE.sub("", text).strip()
    return cleaned.split(" ")[0] if cleaned else ""


def find_newer_edition(
    swb_result: BookData, dnb_result: List[BookData]
) -> Optional[List[BookData]]:
    """Return the candidates that are newer editions of `swb_result`.

    A candidate counts as a newer edition if:
      - year > swb.year OR
      - edition_number > swb.edition_number
    BUT any candidate with year < swb.year is discarded (if both years are
    known).

    Same-work check:
      - Compare RVK roots of signatures (after stripping trailing '+N' and
        '(N)').
      - If both have signatures and the RVK roots differ -> skip.

    Candidates sharing a PPN are reduced to the most preferred one;
    candidates without a PPN cannot be identified as duplicates and are all
    kept.

    Ordering of the result (most preferred first):
      1) RVK matches SWB
      2) Print over Online-Ressource
      3) Has signature
      4) Newer: (year desc, edition_number desc)

    Args:
        swb_result: The reference record to compare against.
        dnb_result: Candidate records.

    Returns:
        The qualifying candidates in preference order, or None if there are
        none.
    """

    def has_sig(b: BookData) -> bool:
        return bool(getattr(b, "signature", None))

    def is_print(b: BookData) -> bool:
        media_type = (getattr(b, "media_type", None) or "").strip()
        return media_type != "Online-Ressource"

    swb_has_sig = has_sig(swb_result)
    swb_rvk = _extract_rvk_root(getattr(swb_result, "signature", None))

    def rvk_matches_swb(b: BookData) -> bool:
        return (
            swb_has_sig
            and has_sig(b)
            and _extract_rvk_root(b.signature) == swb_rvk
        )

    def strictly_newer(b: BookData) -> bool:
        years_known = b.year is not None and swb_result.year is not None
        # Hard guard: a candidate known to be older is never a newer edition,
        # even if its edition number is higher.
        if years_known and b.year < swb_result.year:
            return False
        newer_by_year = years_known and b.year > swb_result.year
        newer_by_edition = (
            b.edition_number is not None
            and swb_result.edition_number is not None
            and b.edition_number > swb_result.edition_number
        )
        return newer_by_year or newer_by_edition

    def pref_score(b: BookData) -> tuple[int, int, int]:
        return (int(rvk_matches_swb(b)), int(is_print(b)), int(has_sig(b)))

    # 1) Filter: same work (by RVK if both have signatures) AND strictly newer.
    candidates: List[BookData] = []
    for b in dnb_result:
        if swb_has_sig and has_sig(b) and not rvk_matches_swb(b):
            continue  # different work
        if strictly_newer(b):
            candidates.append(b)

    if not candidates:
        return None

    # 2) Dedupe by PPN, preferring (rvk-match, is-print, has-signature).
    by_ppn: dict[str, BookData] = {}
    without_ppn: List[BookData] = []
    for b in candidates:
        ppn = getattr(b, "ppn", None)
        if ppn is None:
            # No identifier to dedupe on; keep instead of collapsing all
            # PPN-less records into a single entry.
            without_ppn.append(b)
            continue
        prev = by_ppn.get(ppn)
        if prev is None or pref_score(b) > pref_score(prev):
            by_ppn[ppn] = b
    deduped = list(by_ppn.values()) + without_ppn

    # 3) Preserve all qualifying newer editions, ordered by preference.
    def sort_key(b: BookData):
        year = b.year if b.year is not None else -1
        edition = b.edition_number if b.edition_number is not None else -1
        return (*pref_score(b), year, edition)

    deduped.sort(key=sort_key, reverse=True)
    return deduped


class NewEditionCheckerThread(QThread):
    """Check a list of books for newer editions on a pool of worker threads.

    `run` splits `entries` into chunks, processes each chunk in a
    `ThreadPoolExecutor` worker, and aggregates progress and results from a
    shared queue, emitting the signals below.
    """

    updateSignal = Signal(int, int)  # (processed, total)
    updateProgress = Signal(int, int)  # (processed, total)
    total_entries_signal = Signal(int)
    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]

    # Metrics signals
    rateSignal = Signal(float)  # items per second ("it/s")
    etaSignal = Signal(int)  # seconds remaining (-1 when unknown)

    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
        super().__init__(parent)
        self.entries: list["BookData"] = entries if entries is not None else []
        self.results: list[tuple["BookData", list["BookData"]]] = []

    def reset(self):
        """Clear the queued entries and any collected results."""
        self.entries = []
        self.results = []

    # ---------- internal helpers ----------

    @staticmethod
    def _split_evenly(items: list, parts: int) -> list[list]:
        """Split items as evenly as possible into `parts` chunks (no empty tails).

        Returns `[items]` unchanged when `parts <= 1` or there is at most one
        item. Chunk sizes differ by at most one; empty chunks are omitted, so
        fewer than `parts` chunks are returned when there are fewer items
        than parts.
        """
        if parts <= 1 or len(items) <= 1:
            return [items]
        base, extra = divmod(len(items), parts)
        chunks = []
        start = 0
        for k in range(parts):
            # The first `extra` chunks take one additional item.
            size = base + (1 if k < extra else 0)
            if size == 0:
                continue
            chunks.append(items[start : start + size])
            start += size
        return chunks

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Strip trailing punctuation and any parenthesized part from a title."""
        title = raw.rstrip(" .:,;!?")
        # Greedy on purpose: removes from the first '(' to the last ')'.
        title = re.sub(r"\s*\(.*\)", "", title)
        return title.strip()

    @classmethod
    def _process_book(
        cls, book: "BookData"
    ) -> tuple["BookData", list["BookData"]] | None:
        """Process one book; returns (original, [found editions]) or None.

        Looks up the book's own record by PPN, searches by title and
        publisher, and enriches every newer edition found with its library
        location, in-library flag and, when an ISBN is present, a shop link.

        Returns None when the book has no title, when its own record cannot
        be found, or when no newer edition qualifies.
        """
        if not book.title:
            return None

        # NOTE(review): "pica.bib=20735" presumably restricts the lookup to
        # this library's holdings — confirm.
        swb_hits = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])
        if not swb_hits:
            log.debug(f"No catalogue record found for PPN {book.ppn}; skipping.")
            return None
        swb_result = swb_hits[0]

        query = [
            f"pica.tit={book.title}",
            f"pica.vlg={book.publisher}",
        ]
        dnb_results = swb.getBooks(query)

        new_editions = find_newer_edition(swb_result, dnb_results)
        if not new_editions:
            return None

        for new_edition in new_editions:
            new_edition.library_location = cat.get_location(new_edition.ppn)
            isbn = new_edition.isbn
            if isinstance(isbn, list):
                isbn = isbn[0] if isbn else None
            # Only build a shop link when there is an ISBN to search for;
            # otherwise the query would literally be "None".
            if isbn:
                new_edition.link = (
                    f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
                )
            new_edition.in_library = cat.in_library(new_edition.ppn)

        return (book, new_editions)

    @classmethod
    def _worker(cls, items: list["BookData"], q: Queue) -> None:
        """Worker for one chunk.

        Pushes ('result', ...) for every book with newer editions,
        ('progress', 1) after every book, and ('done', None) exactly once
        when the chunk is finished, even if processing aborts.
        """
        try:
            for book in items:
                try:
                    result = cls._process_book(book)
                except Exception:
                    # Best effort: one failing book must not abort the chunk,
                    # but the failure is logged rather than silently dropped.
                    log.exception(
                        f"Failed to check book (ppn={getattr(book, 'ppn', None)})"
                    )
                    result = None
                if result is not None:
                    q.put(("result", result))
                q.put(("progress", 1))
        finally:
            q.put(("done", None))

    # ---------- thread entry point ----------

    def run(self):
        """Process all entries and emit progress, metrics and results."""
        # Start from a clean slate so results of an earlier run are not
        # emitted again.
        self.results = []

        total = len(self.entries)
        self.total_entries_signal.emit(total)

        # Start timer for metrics.
        t0 = monotonic()

        if total == 0:
            log.debug("No entries to process.")
            # Emit metrics (zero work).
            self.rateSignal.emit(0.0)
            self.etaSignal.emit(0)
            self.resultsSignal.emit([])
            return

        # At most THREAD_COUNT workers, aiming for about THREAD_MIN_ITEMS
        # entries per worker.
        num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
        chunks = self._split_evenly(self.entries, num_workers)
        sizes = [len(ch) for ch in chunks]

        q: Queue = Queue()
        processed = 0
        finished_workers = 0

        with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
            futures = [ex.submit(self._worker, ch, q) for ch in chunks]

            log.info(
                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
            )
            for idx, sz in enumerate(sizes, 1):
                log.debug(f"Thread {idx}: {sz} entries")

            # Aggregate progress/results until every worker reported 'done'.
            while finished_workers < len(chunks):
                try:
                    kind, payload = q.get(timeout=0.1)
                except Empty:
                    continue

                if kind == "progress":
                    processed += int(payload)
                    self.updateSignal.emit(processed, total)
                    self.updateProgress.emit(processed, total)

                    # Throughput and ETA based on the elapsed wall time.
                    elapsed = max(1e-9, monotonic() - t0)
                    rate = processed / elapsed  # items per second
                    remaining = max(0, total - processed)
                    eta_sec = int(round(remaining / rate)) if rate > 0 else -1
                    self.rateSignal.emit(rate)
                    self.etaSignal.emit(eta_sec)
                elif kind == "result":
                    self.results.append(payload)
                elif kind == "done":
                    finished_workers += 1

        # Final metrics on completion.
        elapsed_total = max(1e-9, monotonic() - t0)
        self.rateSignal.emit(total / elapsed_total)
        self.etaSignal.emit(0)

        self.resultsSignal.emit(self.results)