346 lines
12 KiB
Python
346 lines
12 KiB
Python
import os
|
|
import re
|
|
from concurrent.futures import ThreadPoolExecutor
|
|
from math import ceil
|
|
from queue import Empty, Queue
|
|
from time import monotonic # <-- NEW
|
|
from typing import List, Optional
|
|
|
|
from PySide6.QtCore import QThread, Signal
|
|
|
|
# from src.logic.webrequest import BibTextTransformer, WebRequest
|
|
from src.backend.catalogue import Catalogue
|
|
from src.logic import BookData
|
|
from src.logic.SRU import SWB
|
|
from src.shared.logging import log
|
|
|
|
# Use all available cores minus two, but never fewer than one worker.
# os.cpu_count() returns None when the core count cannot be determined, so
# fall back to a single core instead of failing at import time with
# "TypeError: unsupported operand type(s) for -: 'NoneType' and 'int'".
THREAD_COUNT = max((os.cpu_count() or 1) - 2, 1)

# Number of entries that justify one additional worker thread; the thread
# entry point divides the workload by this value to pick a worker count.
THREAD_MIN_ITEMS = 5
|
|
|
|
# Logging is configured centrally in main; this module only uses the shared
# `log` object.

# Service clients are created once at import time and reused by all workers.
# NOTE(review): `dnb` is a second SWB client instance, not a distinct DNB
# client — confirm this is intended.
swb, dnb = SWB(), SWB()
cat = Catalogue()
|
|
|
|
RVK_ALLOWED = r"[A-Z0-9.\-\/]"  # conservative RVK character set


def find_newer_edition(
    swb_result: "BookData", dnb_result: List["BookData"]
) -> Optional[List["BookData"]]:
    """Return the candidates that are newer editions of ``swb_result``.

    A candidate counts as a new edition if:
      - year > swb.year OR
      - edition_number > swb.edition_number
    BUT any candidate with year < swb.year is discarded (if both years are
    known).

    Same-work check:
      - Compare RVK roots of signatures (after stripping trailing '+N' and
        '(N)').
      - If both have signatures and the RVK roots differ -> skip.

    Candidates sharing a PPN are reduced to the preferred record; candidates
    without a PPN cannot be identified as duplicates and are all kept.

    Preferences (in order):
      1) RVK matches SWB
      2) Print over Online-Ressource
      3) Has signature
      4) Newer: (year desc, edition_number desc)

    Args:
        swb_result: The reference record. Reads ``signature``, ``year`` and
            ``edition_number``.
        dnb_result: Candidate records. Additionally reads ``media_type`` and
            ``ppn`` from each one.

    Returns:
        All qualifying candidates ordered by preference (best first), or
        ``None`` when no candidate qualifies.
    """

    def strip_copy_and_edition(s: str) -> str:
        s = re.sub(r"\(\s*\d+\s*\)", "", s)  # remove '(N)'
        s = re.sub(r"\s*\+\s*\d+\s*$", "", s)  # remove trailing '+N'
        return s

    def extract_rvk_root(sig: Optional[str]) -> str:
        if not sig:
            return ""
        t = strip_copy_and_edition(sig.upper())
        t = re.sub(r"\s+", " ", t).strip()
        m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
        if m:
            return re.sub(r"\s+", " ", m.group(1)).strip()
        # Fallback: drop every character that is neither an allowed RVK
        # character nor a space, then take the first token. RVK_ALLOWED is
        # already a bracketed character class, so it must not be embedded in
        # another "[^...]" set (that builds a nested set which does not match
        # what is intended); a negative lookahead expresses the negation.
        cleaned = re.sub(rf"(?:(?!{RVK_ALLOWED}| ).)+", "", t).strip()
        return cleaned.split(" ")[0] if cleaned else ""

    def has_sig(b: "BookData") -> bool:
        return bool(getattr(b, "signature", None))

    def is_online(b: "BookData") -> bool:
        return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"

    def is_print(b: "BookData") -> bool:
        return not is_online(b)

    # The reference record never changes, so derive its values only once.
    swb_has_sig = has_sig(swb_result)
    swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))

    def rvk_matches_swb(b: "BookData") -> bool:
        if not has_sig(b) or not swb_has_sig:
            return False
        return extract_rvk_root(b.signature) == swb_rvk

    def strictly_newer(b: "BookData") -> bool:
        both_years_known = b.year is not None and swb_result.year is not None
        # Hard guard: if both years are known and candidate is older, discard.
        if both_years_known and b.year < swb_result.year:
            return False

        newer_by_year = both_years_known and b.year > swb_result.year
        newer_by_edition = (
            b.edition_number is not None
            and swb_result.edition_number is not None
            and b.edition_number > swb_result.edition_number
        )
        # Thanks to the guard above, newer_by_edition can't pick something
        # with a smaller year.
        return newer_by_year or newer_by_edition

    # 1) Filter: same work (by RVK if both have sigs) AND strictly newer.
    candidates: List["BookData"] = []
    for b in dnb_result:
        if has_sig(b) and swb_has_sig and extract_rvk_root(b.signature) != swb_rvk:
            continue  # different work
        if strictly_newer(b):
            candidates.append(b)

    if not candidates:
        return None

    # 2) Dedupe by PPN -> prefer (rvk-match, is-print, has-signature).
    def pref_score(x: "BookData") -> tuple[int, int, int]:
        return (
            1 if rvk_matches_swb(x) else 0,
            1 if is_print(x) else 0,
            1 if has_sig(x) else 0,
        )

    by_ppn: dict[str, "BookData"] = {}
    without_ppn: List["BookData"] = []
    for b in candidates:
        key = getattr(b, "ppn", None)
        if not key:
            # Without a PPN there is nothing to dedupe on; keeping the record
            # avoids collapsing unrelated editions into a single entry.
            without_ppn.append(b)
            continue
        prev = by_ppn.get(key)
        if prev is None or pref_score(b) > pref_score(prev):
            by_ppn[key] = b

    deduped = list(by_ppn.values()) + without_ppn

    # 3) Preserve all qualifying newer editions, but order by preference.
    def sort_key(b: "BookData"):
        year = b.year if b.year is not None else -1
        ed = b.edition_number if b.edition_number is not None else -1
        return (*pref_score(b), year, ed)

    deduped.sort(key=sort_key, reverse=True)
    return deduped
|
|
|
|
|
|
class NewEditionCheckerThread(QThread):
    """Background thread that searches for newer editions of the given entries.

    The entries are split into chunks that are processed in parallel by a
    ``ThreadPoolExecutor``; progress, throughput and the final results are
    reported through Qt signals.

    NOTE(review): all workers share the module-level ``swb`` and ``cat``
    clients — this assumes those clients are safe to use from several threads
    at once; confirm.
    """

    updateSignal = Signal(int, int)  # (processed, total)
    updateProgress = Signal(int, int)  # (processed, total)
    total_entries_signal = Signal(int)
    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]

    # Metrics signals
    rateSignal = Signal(float)  # items per second ("it/s")
    etaSignal = Signal(int)  # seconds remaining (-1 when unknown)

    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
        """Store the entries to check; ``None`` means an empty work list."""
        super().__init__(parent)
        self.entries: list["BookData"] = entries if entries is not None else []
        self.results: list[tuple["BookData", list["BookData"]]] = []

    def reset(self):
        """Drop the pending entries and any collected results."""
        self.entries = []
        self.results = []

    # ---------- internal helpers ----------

    @staticmethod
    def _split_evenly(items: list, parts: int) -> list[list]:
        """Split items as evenly as possible into `parts` chunks (no empty tails)."""
        if parts <= 1 or len(items) <= 1:
            return [items]
        base, extra = divmod(len(items), parts)
        chunks = []
        start = 0
        for k in range(parts):
            # The first `extra` chunks take one additional item each.
            size = base + (1 if k < extra else 0)
            if size == 0:
                continue
            chunks.append(items[start : start + size])
            start += size
        return chunks

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Strip trailing punctuation and any parenthesised part from a title."""
        title = raw.rstrip(" .:,;!?")
        title = re.sub(r"\s*\(.*\)", "", title)
        return title.strip()

    @classmethod
    def _process_book(
        cls, book: "BookData"
    ) -> tuple["BookData", list["BookData"]] | None:
        """Process one book; returns (original, [found editions]) or None.

        ``None`` is returned when the book has no title, when its own record
        cannot be fetched, or when no newer edition was found.
        """
        if not book.title:
            return None

        own_records = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])
        if not own_records:
            # Without the reference record there is nothing to compare with;
            # indexing [0] here would raise IndexError.
            log.debug(f"No catalogue record found for PPN {book.ppn}; skipping.")
            return None
        swb_result = own_records[0]

        # NOTE(review): a missing publisher is sent as "pica.vlg=None" —
        # confirm whether the publisher term should be omitted in that case.
        query = [
            f"pica.tit={book.title}",
            f"pica.vlg={book.publisher}",
        ]
        dnb_results = swb.getBooks(query)

        new_editions = find_newer_edition(swb_result, dnb_results)
        if not new_editions:
            return None

        for new_edition in new_editions:
            new_edition.library_location = cat.get_location(new_edition.ppn)

            # The ISBN may be a single value or a list; use the first entry.
            raw_isbn = new_edition.isbn
            if isinstance(raw_isbn, list):
                raw_isbn = raw_isbn[0] if raw_isbn else None
            if raw_isbn:
                # Only build a shop link when there is an ISBN to search for;
                # otherwise the link would search for the literal "None".
                isbn = str(raw_isbn)
                new_edition.link = (
                    f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
                )

            new_edition.in_library = cat.in_library(new_edition.ppn)

        return (book, new_editions)

    @classmethod
    def _worker(cls, items: list["BookData"], q: Queue) -> None:
        """Worker for one chunk; pushes ('result', ...), ('progress', 1), and ('done', None)."""
        try:
            for book in items:
                try:
                    result = cls._process_book(book)
                except Exception as exc:
                    # Best effort: one failing book must not abort the chunk,
                    # but the failure should be visible in the log.
                    log.error(
                        f"Edition check failed for PPN {getattr(book, 'ppn', None)!r}: {exc!r}"
                    )
                    result = None
                if result is not None:
                    q.put(("result", result))
                q.put(("progress", 1))
        finally:
            # Always signal completion so the aggregation loop can terminate.
            q.put(("done", None))

    # ---------- thread entry point ----------

    def run(self):
        """Process all entries in parallel and emit progress, metrics and results."""
        total = len(self.entries)
        self.total_entries_signal.emit(total)

        # start timer for metrics
        t0 = monotonic()

        if total == 0:
            log.debug("No entries to process.")
            # emit metrics (zero work)
            self.rateSignal.emit(0.0)
            self.etaSignal.emit(0)
            self.resultsSignal.emit([])
            return

        # One worker per THREAD_MIN_ITEMS entries, capped at THREAD_COUNT.
        num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
        chunks = self._split_evenly(self.entries, num_workers)
        sizes = [len(ch) for ch in chunks]

        q: Queue = Queue()
        processed = 0
        finished_workers = 0

        with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
            futures = [ex.submit(self._worker, ch, q) for ch in chunks]

            log.info(
                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
            )
            for idx, sz in enumerate(sizes, 1):
                log.debug(f"Thread {idx}: {sz} entries")

            # Aggregate progress/results until every worker reported 'done'.
            while finished_workers < len(chunks):
                try:
                    kind, payload = q.get(timeout=0.1)
                except Empty:
                    continue

                if kind == "progress":
                    processed += int(payload)
                    self.updateSignal.emit(processed, total)
                    self.updateProgress.emit(processed, total)

                    # Throughput so far and the remaining time derived from it.
                    elapsed = max(1e-9, monotonic() - t0)
                    rate = processed / elapsed  # items per second
                    remaining = max(0, total - processed)
                    eta_sec = int(round(remaining / rate)) if rate > 0 else -1

                    self.rateSignal.emit(rate)
                    self.etaSignal.emit(eta_sec)
                elif kind == "result":
                    self.results.append(payload)
                elif kind == "done":
                    finished_workers += 1

        # Final metrics on completion
        elapsed_total = max(1e-9, monotonic() - t0)
        final_rate = total / elapsed_total
        self.rateSignal.emit(final_rate)
        self.etaSignal.emit(0)

        self.resultsSignal.emit(self.results)
|