chore: restructured project, updated readme
This commit is contained in:
345
src/background/new_editions.py
Normal file
345
src/background/new_editions.py
Normal file
@@ -0,0 +1,345 @@
|
||||
import os
|
||||
import re
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from math import ceil
|
||||
from queue import Empty, Queue
|
||||
from time import monotonic # <-- NEW
|
||||
from typing import List, Optional
|
||||
|
||||
from PySide6.QtCore import QThread, Signal
|
||||
|
||||
# from src.services.webrequest import BibTextTransformer, WebRequest
|
||||
from src.services.catalogue import Catalogue
|
||||
from src.core.models import BookData
|
||||
from src.services.sru import SWB
|
||||
from src.shared.logging import log
|
||||
|
||||
# use all available cores - 2, but at least 1
|
||||
THREAD_COUNT = max(os.cpu_count() - 2, 1)
|
||||
THREAD_MIN_ITEMS = 5
|
||||
|
||||
# Logger configured centrally in main; use shared `log`
|
||||
|
||||
swb = SWB()
|
||||
dnb = SWB()
|
||||
cat = Catalogue()
|
||||
|
||||
RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative RVK character set
|
||||
|
||||
|
||||
def find_newer_edition(
|
||||
swb_result: BookData, dnb_result: List[BookData]
|
||||
) -> Optional[List[BookData]]:
|
||||
"""
|
||||
New edition if:
|
||||
- year > swb.year OR
|
||||
- edition_number > swb.edition_number
|
||||
BUT: discard any candidate with year < swb.year (if both years are known).
|
||||
|
||||
Same-work check:
|
||||
- Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
|
||||
- If both have signatures and RVKs differ -> skip.
|
||||
|
||||
Preferences (in order):
|
||||
1) RVK matches SWB
|
||||
2) Print over Online-Ressource
|
||||
3) Has signature
|
||||
4) Newer: (year desc, edition_number desc)
|
||||
"""
|
||||
|
||||
def strip_copy_and_edition(s: str) -> str:
|
||||
s = re.sub(r"\(\s*\d+\s*\)", "", s) # remove '(N)'
|
||||
s = re.sub(r"\s*\+\s*\d+\s*$", "", s) # remove trailing '+N'
|
||||
return s
|
||||
|
||||
def extract_rvk_root(sig: Optional[str]) -> str:
|
||||
if not sig:
|
||||
return ""
|
||||
t = strip_copy_and_edition(sig.upper())
|
||||
t = re.sub(r"\s+", " ", t).strip()
|
||||
m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
|
||||
if not m:
|
||||
cleaned = re.sub(rf"[^{RVK_ALLOWED} ]+", "", t).strip()
|
||||
return cleaned.split(" ")[0] if cleaned else ""
|
||||
return re.sub(r"\s+", " ", m.group(1)).strip()
|
||||
|
||||
def has_sig(b: BookData) -> bool:
|
||||
return bool(getattr(b, "signature", None))
|
||||
|
||||
def is_online(b: BookData) -> bool:
|
||||
return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
|
||||
|
||||
def is_print(b: BookData) -> bool:
|
||||
return not is_online(b)
|
||||
|
||||
def rvk_matches_swb(b: BookData) -> bool:
|
||||
if not has_sig(b) or not has_sig(swb_result):
|
||||
return False
|
||||
return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
|
||||
|
||||
def strictly_newer(b: BookData) -> bool:
|
||||
# Hard guard: if both years are known and candidate is older, discard
|
||||
if (
|
||||
b.year is not None
|
||||
and swb_result.year is not None
|
||||
and b.year < swb_result.year
|
||||
):
|
||||
return False
|
||||
|
||||
newer_by_year = (
|
||||
b.year is not None
|
||||
and swb_result.year is not None
|
||||
and b.year > swb_result.year
|
||||
)
|
||||
newer_by_edition = (
|
||||
b.edition_number is not None
|
||||
and swb_result.edition_number is not None
|
||||
and b.edition_number > swb_result.edition_number
|
||||
)
|
||||
# Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
|
||||
return newer_by_year or newer_by_edition
|
||||
|
||||
swb_has_sig = has_sig(swb_result)
|
||||
swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
|
||||
|
||||
# 1) Filter: same work (by RVK if both have sigs) AND strictly newer
|
||||
candidates: List[BookData] = []
|
||||
for b in dnb_result:
|
||||
if has_sig(b) and swb_has_sig:
|
||||
if extract_rvk_root(b.signature) != swb_rvk:
|
||||
continue # different work
|
||||
if strictly_newer(b):
|
||||
candidates.append(b)
|
||||
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
# 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
|
||||
def pref_score(x: BookData) -> tuple[int, int, int]:
|
||||
return (
|
||||
1 if rvk_matches_swb(x) else 0,
|
||||
1 if is_print(x) else 0,
|
||||
1 if has_sig(x) else 0,
|
||||
)
|
||||
|
||||
by_ppn: dict[Optional[str], BookData] = {}
|
||||
for b in candidates:
|
||||
key = getattr(b, "ppn", None)
|
||||
prev = by_ppn.get(key)
|
||||
if prev is None or pref_score(b) > pref_score(prev):
|
||||
by_ppn[key] = b
|
||||
|
||||
deduped = list(by_ppn.values())
|
||||
if not deduped:
|
||||
return None
|
||||
|
||||
# 3) Preserve all qualifying newer editions, but order by preference
|
||||
def sort_key(b: BookData):
|
||||
year = b.year if b.year is not None else -1
|
||||
ed = b.edition_number if b.edition_number is not None else -1
|
||||
return (
|
||||
1 if rvk_matches_swb(b) else 0,
|
||||
1 if is_print(b) else 0,
|
||||
1 if has_sig(b) else 0,
|
||||
year,
|
||||
ed,
|
||||
)
|
||||
|
||||
deduped.sort(key=sort_key, reverse=True)
|
||||
return deduped
|
||||
|
||||
|
||||
class NewEditionCheckerThread(QThread):
|
||||
updateSignal = Signal(int, int) # (processed, total)
|
||||
updateProgress = Signal(int, int) # (processed, total)
|
||||
total_entries_signal = Signal(int)
|
||||
resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]]
|
||||
|
||||
# NEW: metrics signals
|
||||
rateSignal = Signal(float) # items per second ("it/s")
|
||||
etaSignal = Signal(int) # seconds remaining (-1 when unknown)
|
||||
|
||||
def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
|
||||
super().__init__(parent)
|
||||
self.entries: list["BookData"] = entries if entries is not None else []
|
||||
self.results: list[tuple["BookData", list["BookData"]]] = []
|
||||
|
||||
def reset(self):
|
||||
self.entries = []
|
||||
self.results = []
|
||||
|
||||
# ---------- internal helpers ----------
|
||||
|
||||
@staticmethod
|
||||
def _split_evenly(items: list, parts: int) -> list[list]:
|
||||
"""Split items as evenly as possible into `parts` chunks (no empty tails)."""
|
||||
if parts <= 1 or len(items) <= 1:
|
||||
return [items]
|
||||
n = len(items)
|
||||
base = n // parts
|
||||
extra = n % parts
|
||||
chunks = []
|
||||
i = 0
|
||||
for k in range(parts):
|
||||
size = base + (1 if k < extra else 0)
|
||||
if size == 0:
|
||||
continue
|
||||
chunks.append(items[i : i + size])
|
||||
i += size
|
||||
return chunks
|
||||
|
||||
@staticmethod
|
||||
def _clean_title(raw: str) -> str:
|
||||
title = raw.rstrip(" .:,;!?")
|
||||
title = re.sub(r"\s*\(.*\)", "", title)
|
||||
return title.strip()
|
||||
|
||||
@classmethod
|
||||
def _process_book(
|
||||
cls, book: "BookData"
|
||||
) -> tuple["BookData", list["BookData"]] | None:
|
||||
"""Process one book; returns (original, [found editions]) or None on failure."""
|
||||
if not book.title:
|
||||
return None
|
||||
response: list["BookData"] = []
|
||||
query = [
|
||||
f"pica.tit={book.title}",
|
||||
f"pica.vlg={book.publisher}",
|
||||
]
|
||||
|
||||
swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
|
||||
dnb_results = swb.getBooks(query)
|
||||
new_editions = find_newer_edition(swb_result, dnb_results)
|
||||
|
||||
if new_editions is not None:
|
||||
for new_edition in new_editions:
|
||||
new_edition.library_location = cat.get_location(new_edition.ppn)
|
||||
try:
|
||||
isbn = (
|
||||
str(new_edition.isbn[0])
|
||||
if isinstance(new_edition.isbn, list)
|
||||
else str(new_edition.isbn)
|
||||
)
|
||||
new_edition.link = (
|
||||
f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
|
||||
)
|
||||
except (IndexError, TypeError):
|
||||
isbn = None
|
||||
new_edition.in_library = cat.in_library(new_edition.ppn)
|
||||
response = new_editions
|
||||
|
||||
# client = SWB()
|
||||
# response: list["BookData"] = []
|
||||
# # First, search by title only
|
||||
# results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
|
||||
|
||||
# lehmanns = LehmannsClient()
|
||||
# results = lehmanns.search_by_title(title)
|
||||
# for result in results:
|
||||
# if "(eBook)" in result.title:
|
||||
# result.title = result.title.replace("(eBook)", "").strip()
|
||||
# swb_results = client.getBooks(
|
||||
# [
|
||||
# f"pica.tit={result.title}",
|
||||
# f"pica.vlg={result.publisher.split(',')[0]}",
|
||||
# ]
|
||||
# )
|
||||
# for swb in swb_results:
|
||||
# if swb.isbn == result.isbn:
|
||||
# result.ppn = swb.ppn
|
||||
# result.signature = swb.signature
|
||||
# response.append(result)
|
||||
# if (result.edition_number < swb.edition_number) and (
|
||||
# swb.year > result.year
|
||||
# ):
|
||||
# response.append(result)
|
||||
if response == []:
|
||||
return None
|
||||
# Remove duplicates based on ppn
|
||||
return (book, response)
|
||||
|
||||
@classmethod
|
||||
def _worker(cls, items: list["BookData"], q: Queue) -> None:
|
||||
"""Worker for one chunk; pushes ('result', ...), ('progress', 1), and ('done', None)."""
|
||||
try:
|
||||
for book in items:
|
||||
try:
|
||||
result = cls._process_book(book)
|
||||
except Exception:
|
||||
result = None
|
||||
if result is not None:
|
||||
q.put(("result", result))
|
||||
q.put(("progress", 1))
|
||||
finally:
|
||||
q.put(("done", None))
|
||||
|
||||
# ---------- thread entry point ----------
|
||||
|
||||
def run(self):
|
||||
total = len(self.entries)
|
||||
self.total_entries_signal.emit(total)
|
||||
|
||||
# start timer for metrics
|
||||
t0 = monotonic()
|
||||
|
||||
if total == 0:
|
||||
log.debug("No entries to process.")
|
||||
# emit metrics (zero work)
|
||||
self.rateSignal.emit(0.0)
|
||||
self.etaSignal.emit(0)
|
||||
self.resultsSignal.emit([])
|
||||
return
|
||||
|
||||
# Up to 4 workers; ~20 items per worker
|
||||
num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
|
||||
chunks = self._split_evenly(self.entries, num_workers)
|
||||
sizes = [len(ch) for ch in chunks]
|
||||
|
||||
q: Queue = Queue()
|
||||
processed = 0
|
||||
finished_workers = 0
|
||||
|
||||
with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
|
||||
futures = [ex.submit(self._worker, ch, q) for ch in chunks]
|
||||
|
||||
log.info(
|
||||
f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
|
||||
)
|
||||
for idx, sz in enumerate(sizes, 1):
|
||||
log.debug(f"Thread {idx}: {sz} entries")
|
||||
|
||||
# Aggregate progress/results
|
||||
while finished_workers < len(chunks):
|
||||
try:
|
||||
kind, payload = q.get(timeout=0.1)
|
||||
except Empty:
|
||||
continue
|
||||
|
||||
if kind == "progress":
|
||||
processed += int(payload)
|
||||
self.updateSignal.emit(processed, total)
|
||||
self.updateProgress.emit(processed, total)
|
||||
|
||||
# ---- NEW: compute & emit metrics ----
|
||||
elapsed = max(1e-9, monotonic() - t0)
|
||||
rate = processed / elapsed # items per second
|
||||
remaining = max(0, total - processed)
|
||||
eta_sec = int(round(remaining / rate)) if rate > 0 else -1
|
||||
|
||||
self.rateSignal.emit(rate)
|
||||
# clamp negative just in case
|
||||
self.etaSignal.emit(max(0, eta_sec) if eta_sec >= 0 else -1)
|
||||
# -------------------------------------
|
||||
|
||||
elif kind == "result":
|
||||
self.results.append(payload)
|
||||
elif kind == "done":
|
||||
finished_workers += 1
|
||||
|
||||
# Final metrics on completion
|
||||
elapsed_total = max(1e-9, monotonic() - t0)
|
||||
final_rate = total / elapsed_total
|
||||
self.rateSignal.emit(final_rate)
|
||||
self.etaSignal.emit(0)
|
||||
|
||||
self.resultsSignal.emit(self.results)
|
||||
Reference in New Issue
Block a user