# SemesterapparatsManager/src/backend/thread_neweditions.py

import os
import re
from concurrent.futures import ThreadPoolExecutor
from math import ceil
from queue import Empty, Queue
from time import monotonic  # <-- NEW
from typing import List, Optional

from PySide6.QtCore import QThread, Signal

# from src.logic.webrequest import BibTextTransformer, WebRequest
from src.backend.catalogue import Catalogue
from src.logic import BookData
from src.logic.SRU import SWB
from src.shared.logging import log

# Use all available cores minus 2, but at least 1.
# os.cpu_count() returns None when the core count cannot be determined; fall
# back to a single core in that case instead of failing with a TypeError on
# import.
THREAD_COUNT = max((os.cpu_count() or 1) - 2, 1)
# Divisor used when sizing the worker pool: one worker is requested per
# THREAD_MIN_ITEMS entries (rounded up), capped at THREAD_COUNT.
THREAD_MIN_ITEMS = 5

# Logger configured centrally in main; use shared `log`

# Module-level clients, created once at import time and used by the code below.
swb = SWB()
# NOTE(review): despite its name, `dnb` is also an SWB client, and it is not
# referenced by the code visible in this file section - confirm it is needed.
dnb = SWB()
cat = Catalogue()

# Regex character class for a single character allowed in an RVK notation.
# This is a complete bracketed class ("[...]"): it can be followed by a
# quantifier, but it must not be embedded inside another "[...]" set.
RVK_ALLOWED = r"[A-Z0-9.\-\/]"  # conservative RVK character set


def find_newer_edition(
    swb_result: "BookData", dnb_result: List["BookData"]
) -> Optional[List["BookData"]]:
    """
    Return all candidates from `dnb_result` that are newer editions of `swb_result`.

    New edition if:
      - year > swb.year OR
      - edition_number > swb.edition_number
    BUT: discard any candidate with year < swb.year (if both years are known).

    Same-work check:
      - Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
      - If both have signatures and RVKs differ -> skip.

    Candidates sharing a PPN are deduplicated (the most preferred one wins);
    candidates without a PPN cannot be identified as duplicates and are all kept.

    Preferences (in order):
      1) RVK matches SWB
      2) Print over Online-Ressource
      3) Has signature
      4) Newer: (year desc, edition_number desc)

    Args:
        swb_result: The reference record the candidates are compared against.
        dnb_result: Candidate records to filter and rank.

    Returns:
        The qualifying candidates ordered by preference (best first), or
        None when no candidate qualifies.
    """

    def strip_copy_and_edition(s: str) -> str:
        s = re.sub(r"\(\s*\d+\s*\)", "", s)  # remove '(N)'
        s = re.sub(r"\s*\+\s*\d+\s*$", "", s)  # remove trailing '+N'
        return s

    def extract_rvk_root(sig: Optional[str]) -> str:
        if not sig:
            return ""
        t = strip_copy_and_edition(sig.upper())
        t = re.sub(r"\s+", " ", t).strip()
        m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
        if m:
            return re.sub(r"\s+", " ", m.group(1)).strip()
        # Fallback: drop every character that is neither a space nor an allowed
        # RVK character, then take the first token. RVK_ALLOWED is already a
        # bracketed class, so it is matched per character here rather than
        # being nested inside another "[^...]" set (which would not negate it).
        cleaned = "".join(
            ch for ch in t if ch == " " or re.fullmatch(RVK_ALLOWED, ch)
        ).strip()
        return cleaned.split(" ")[0] if cleaned else ""

    def has_sig(b: "BookData") -> bool:
        return bool(getattr(b, "signature", None))

    def is_online(b: "BookData") -> bool:
        return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"

    def is_print(b: "BookData") -> bool:
        return not is_online(b)

    # The reference record never changes, so compute its RVK root only once.
    swb_has_sig = has_sig(swb_result)
    swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))

    def rvk_matches_swb(b: "BookData") -> bool:
        if not has_sig(b) or not swb_has_sig:
            return False
        return extract_rvk_root(b.signature) == swb_rvk

    def strictly_newer(b: "BookData") -> bool:
        both_years_known = b.year is not None and swb_result.year is not None
        # Hard guard: if both years are known and candidate is older, discard
        if both_years_known and b.year < swb_result.year:
            return False

        newer_by_year = both_years_known and b.year > swb_result.year
        newer_by_edition = (
            b.edition_number is not None
            and swb_result.edition_number is not None
            and b.edition_number > swb_result.edition_number
        )
        # Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
        return newer_by_year or newer_by_edition

    def pref_score(x: "BookData") -> tuple[int, int, int]:
        return (
            1 if rvk_matches_swb(x) else 0,
            1 if is_print(x) else 0,
            1 if has_sig(x) else 0,
        )

    # 1) Filter: same work (by RVK if both have sigs) AND strictly newer
    candidates = []
    for b in dnb_result:
        if has_sig(b) and swb_has_sig and extract_rvk_root(b.signature) != swb_rvk:
            continue  # different work
        if strictly_newer(b):
            candidates.append(b)

    if not candidates:
        return None

    # 2) Dedupe by PPN -> prefer (rvk-match, is-print, has-signature).
    #    Records without a PPN get a unique key each so they are not collapsed
    #    into a single arbitrary entry.
    by_ppn: dict = {}
    for idx, b in enumerate(candidates):
        ppn = getattr(b, "ppn", None)
        key = ("ppn", ppn) if ppn else ("no-ppn", idx)
        prev = by_ppn.get(key)
        if prev is None or pref_score(b) > pref_score(prev):
            by_ppn[key] = b

    # Non-empty here because `candidates` is non-empty.
    deduped = list(by_ppn.values())

    # 3) Preserve all qualifying newer editions, but order by preference
    def sort_key(b: "BookData"):
        year = b.year if b.year is not None else -1
        ed = b.edition_number if b.edition_number is not None else -1
        return (*pref_score(b), year, ed)

    deduped.sort(key=sort_key, reverse=True)
    return deduped


class NewEditionCheckerThread(QThread):
    """Background thread that searches for newer editions of a list of books.

    The entries are split into chunks, each chunk is handled by one worker of
    a thread pool, and the workers report back through a queue. Progress,
    throughput metrics and the collected results are published via Qt signals.
    """

    updateSignal = Signal(int, int)  # (processed, total)
    updateProgress = Signal(int, int)  # (processed, total)
    total_entries_signal = Signal(int)
    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]

    # metrics signals
    rateSignal = Signal(float)  # items per second ("it/s")
    etaSignal = Signal(int)  # seconds remaining (-1 when unknown)

    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
        super().__init__(parent)
        self.entries: list["BookData"] = entries if entries is not None else []
        self.results: list[tuple["BookData", list["BookData"]]] = []

    def reset(self):
        """Drop the current entries and any collected results."""
        self.entries = []
        self.results = []

    # ---------- internal helpers ----------

    @staticmethod
    def _split_evenly(items: list, parts: int) -> list[list]:
        """Split items as evenly as possible into `parts` chunks (no empty tails)."""
        if parts <= 1 or len(items) <= 1:
            return [items]
        base, extra = divmod(len(items), parts)
        chunks = []
        start = 0
        for k in range(parts):
            # The first `extra` chunks take one additional item each.
            size = base + (1 if k < extra else 0)
            if size == 0:
                continue
            chunks.append(items[start : start + size])
            start += size
        return chunks

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Strip trailing punctuation and a parenthesised part from a title."""
        title = raw.rstrip(" .:,;!?")
        title = re.sub(r"\s*\(.*\)", "", title)
        return title.strip()

    @staticmethod
    def _first_isbn(isbn) -> Optional[str]:
        """Return the first usable ISBN as a string, or None if there is none."""
        if isinstance(isbn, (list, tuple)):
            isbn = isbn[0] if isbn else None
        if isbn is None:
            return None
        text = str(isbn).strip()
        return text or None

    @classmethod
    def _process_book(
        cls, book: "BookData"
    ) -> tuple["BookData", list["BookData"]] | None:
        """Process one book; returns (original, [found editions]) or None.

        None is returned when the book has no title, when its own record
        cannot be looked up, or when no newer edition is found.
        """
        if not book.title:
            return None

        # Reference record of the book itself (lookup by PPN, pica.bib=20735).
        own_records = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])
        if not own_records:
            log.debug(f"No reference record found for PPN {book.ppn}; skipping.")
            return None
        swb_result = own_records[0]

        # Candidate search by title; only constrain by publisher when one is
        # known, otherwise the query would contain the literal text "None".
        query = [f"pica.tit={book.title}"]
        if book.publisher:
            query.append(f"pica.vlg={book.publisher}")
        candidates = swb.getBooks(query)

        new_editions = find_newer_edition(swb_result, candidates)
        if not new_editions:
            return None

        for edition in new_editions:
            edition.library_location = cat.get_location(edition.ppn)
            # Only build a shop link when there is a real ISBN to search for.
            isbn = cls._first_isbn(edition.isbn)
            if isbn:
                edition.link = (
                    f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
                )
            edition.in_library = cat.in_library(edition.ppn)

        return (book, new_editions)

    @classmethod
    def _worker(cls, items: list["BookData"], q: Queue) -> None:
        """Worker for one chunk; pushes ('result', ...), ('progress', 1), and ('done', None)."""
        try:
            for book in items:
                try:
                    result = cls._process_book(book)
                except Exception as exc:
                    # Best effort: one failing book must not abort the chunk,
                    # but the failure should be visible in the log.
                    log.error(
                        f"Failed to check new editions for PPN "
                        f"{getattr(book, 'ppn', None)}: {exc!r}"
                    )
                    result = None
                if result is not None:
                    q.put(("result", result))
                q.put(("progress", 1))
        finally:
            # Always signal completion so run() cannot wait forever.
            q.put(("done", None))

    # ---------- thread entry point ----------

    def run(self):
        """Process all entries in parallel and emit progress, metrics and results."""
        total = len(self.entries)
        self.total_entries_signal.emit(total)

        # start timer for metrics
        t0 = monotonic()

        if total == 0:
            log.debug("No entries to process.")
            # emit metrics (zero work)
            self.rateSignal.emit(0.0)
            self.etaSignal.emit(0)
            self.resultsSignal.emit([])
            return

        # One worker per THREAD_MIN_ITEMS entries (rounded up), capped at THREAD_COUNT
        num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
        chunks = self._split_evenly(self.entries, num_workers)
        sizes = [len(ch) for ch in chunks]

        q: Queue = Queue()
        processed = 0
        finished_workers = 0

        with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
            futures = [ex.submit(self._worker, ch, q) for ch in chunks]

            log.info(
                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
            )
            for idx, sz in enumerate(sizes, 1):
                log.debug(f"Thread {idx}: {sz} entries")

            # Aggregate progress/results until every worker reported 'done'
            while finished_workers < len(chunks):
                try:
                    kind, payload = q.get(timeout=0.1)
                except Empty:
                    continue

                if kind == "progress":
                    processed += int(payload)
                    self.updateSignal.emit(processed, total)
                    self.updateProgress.emit(processed, total)

                    # Throughput so far and the remaining time derived from it
                    elapsed = max(1e-9, monotonic() - t0)
                    rate = processed / elapsed  # items per second
                    remaining = max(0, total - processed)
                    eta_sec = int(round(remaining / rate)) if rate > 0 else -1

                    self.rateSignal.emit(rate)
                    self.etaSignal.emit(eta_sec)

                elif kind == "result":
                    self.results.append(payload)
                elif kind == "done":
                    finished_workers += 1

        # Final metrics on completion
        elapsed_total = max(1e-9, monotonic() - t0)
        final_rate = total / elapsed_total
        self.rateSignal.emit(final_rate)
        self.etaSignal.emit(0)

        self.resultsSignal.emit(self.results)