rework threads and also use app_ids where applicable

2025-10-07 14:11:14 +02:00
parent 8e9eff4f3a
commit e061c1f5a9
4 changed files with 225 additions and 157 deletions


@@ -1,19 +1,26 @@
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from math import ceil
from queue import Empty, Queue
from typing import List, Optional, Set, Union
from time import monotonic # <-- NEW
from typing import List, Optional
import loguru
from PySide6.QtCore import QThread, Signal
from src import LOG_DIR
# from src.logic.webrequest import BibTextTransformer, WebRequest
from src.backend.catalogue import Catalogue
from src.logic import BookData
from src.logic.lehmannsapi import LehmannsClient
from src.logic.swb import SWB
from src.logic.SRU import SWB
# use all available cores - 2, but at least 1 (os.cpu_count() may return None)
THREAD_COUNT = max((os.cpu_count() or 1) - 2, 1)
THREAD_MIN_ITEMS = 5
log = loguru.logger
log.remove()
@@ -23,89 +30,136 @@ log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
log.add(
f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
rotation="1 day",
retention="1 month",
retention="7 days",
)
swb = SWB()
dnb = SWB()
cat = Catalogue()
def _norm_text(s: Optional[str]) -> str:
if not s:
return ""
# lowercase, collapse whitespace, drop some punctuation
s = s.lower()
s = re.sub(r"[\s\-\u2013\u2014]+", " ", s) # spaces/dashes
s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
return s.strip()
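# Quick illustration (hypothetical input) of what the normalization yields:
#   _norm_text("Foo - Bar: Baz!")  # -> "foo bar baz"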
RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative RVK character set
def _same_book(a: BookData, b: BookData) -> bool:
"""Heuristic: same if ISBNs intersect; fallback to (title, author, year) normalized."""
isbns_a = _norm_isbns(a.isbn)
isbns_b = _norm_isbns(b.isbn)
if isbns_a and isbns_b and (isbns_a & isbns_b):
return True
ta, tb = _norm_text(a.title), _norm_text(b.title)
aa, ab = _norm_text(a.author), _norm_text(b.author)
ya, yb = (a.year or "").strip(), (b.year or "").strip()
# strong title match required; then author if available; then year if available
if ta and tb and ta == tb:
if aa and ab and aa == ab:
if ya and yb:
return ya == yb
return True
if ya and yb:
return ya == yb
return True
return False
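# Illustration with hypothetical records: an ISBN overlap matches outright;
# otherwise the titles must match exactly, and author/year are only compared
# when both sides provide them.
#   a.title = b.title = "Clean Architecture"
#   a.author, b.author = "Martin, Robert C.", ""   # author missing on one side
#   a.year = b.year = "2017"
#   _same_book(a, b)  # -> True (title and year match, author comparison skipped)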
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
if value is None:
return set()
vals = value if isinstance(value, list) else [value]
out: Set[str] = set()
for v in vals:
s = str(v)
digits = re.sub(r"[^0-9Xx]", "", s)
# keep 13-digit or 10-digit tokens
m13 = re.findall(r"97[89]\d{10}", digits)
if m13:
out.update(m13)
else:
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
out.update(x.upper() for x in m10)
return out
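# Doctest-style illustration (hypothetical ISBNs; no checksum validation is done):
#   _norm_isbns("978-3-16-148410-0")  # -> {"9783161484100"}
#   _norm_isbns(["3-16-148410-x"])    # -> {"316148410X"}  (ISBN-10, X upper-cased)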
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
def find_newer_edition(
swb_result: BookData, dnb_result: List[BookData]
) -> Optional[List[BookData]]:
"""
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
Returns a NEW list (does not mutate the input).
New edition if:
- year > swb.year OR
- edition_number > swb.edition_number
BUT: discard any candidate with year < swb.year (if both years are known).
Same-work check:
- Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
- If both have signatures and RVKs differ -> skip.
Preferences (in order):
1) RVK matches SWB
2) Print over Online-Ressource
3) Has signature
4) Newer: (year desc, edition_number desc)
"""
swb_with_sig = [
r
for r in records
if (r.link == "SWB") and (r.signature and r.signature.strip())
]
if not swb_with_sig:
return list(records)
to_remove: Set[int] = set()
def strip_copy_and_edition(s: str) -> str:
s = re.sub(r"\(\s*\d+\s*\)", "", s) # remove '(N)'
s = re.sub(r"\s*\+\s*\d+\s*$", "", s) # remove trailing '+N'
return s
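# e.g. (hypothetical signature): strip_copy_and_edition("ST 250 P99(2)+1") -> "ST 250 P99"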
# For each URL entry, see if it matches any SWB-with-signature entry
for idx, rec in enumerate(records):
if not rec.link or not rec.link.lower().startswith("http"):
continue
for swb in swb_with_sig:
if _same_book(swb, rec):
to_remove.add(idx)
break
def extract_rvk_root(sig: Optional[str]) -> str:
if not sig:
return ""
t = strip_copy_and_edition(sig.upper())
t = re.sub(r"\s+", " ", t).strip()
m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
if not m:
# RVK_ALLOWED is itself a bracketed class, so splice its members into the
# negated class rather than nesting brackets (which would end the class early)
cleaned = re.sub(rf"[^{RVK_ALLOWED[1:-1]} ]+", "", t).strip()
return cleaned.split(" ")[0] if cleaned else ""
return re.sub(r"\s+", " ", m.group(1)).strip()
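# e.g. (hypothetical signatures): copies and editions map to the same root, so
# "ST 250 P99" and "ST 250 P99+2" both yield "ST 250".
#   extract_rvk_root("ST 250 P99+2")  # -> "ST 250"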
# Build filtered list
return [rec for i, rec in enumerate(records) if i not in to_remove]
def has_sig(b: BookData) -> bool:
return bool(getattr(b, "signature", None))
def is_online(b: BookData) -> bool:
return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
def is_print(b: BookData) -> bool:
return not is_online(b)
def rvk_matches_swb(b: BookData) -> bool:
if not has_sig(b) or not has_sig(swb_result):
return False
return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
def strictly_newer(b: BookData) -> bool:
# Hard guard: if both years are known and candidate is older, discard
if (
b.year is not None
and swb_result.year is not None
and b.year < swb_result.year
):
return False
newer_by_year = (
b.year is not None
and swb_result.year is not None
and b.year > swb_result.year
)
newer_by_edition = (
b.edition_number is not None
and swb_result.edition_number is not None
and b.edition_number > swb_result.edition_number
)
# Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
return newer_by_year or newer_by_edition
swb_has_sig = has_sig(swb_result)
swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
# 1) Filter: same work (by RVK if both have sigs) AND strictly newer
candidates: List[BookData] = []
for b in dnb_result:
if has_sig(b) and swb_has_sig:
if extract_rvk_root(b.signature) != swb_rvk:
continue # different work
if strictly_newer(b):
candidates.append(b)
if not candidates:
return None
# 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
def pref_score(x: BookData) -> tuple[int, int, int]:
return (
1 if rvk_matches_swb(x) else 0,
1 if is_print(x) else 0,
1 if has_sig(x) else 0,
)
by_ppn: dict[Optional[str], BookData] = {}
for b in candidates:
key = getattr(b, "ppn", None)
prev = by_ppn.get(key)
if prev is None or pref_score(b) > pref_score(prev):
by_ppn[key] = b
deduped = list(by_ppn.values())
if not deduped:
return None
# 3) Final pick (single best)
def sort_key(b: BookData):
year = b.year if b.year is not None else -1
ed = b.edition_number if b.edition_number is not None else -1
return (
1 if rvk_matches_swb(b) else 0,
1 if is_print(b) else 0,
1 if has_sig(b) else 0,
year,
ed,
)
best = max(deduped, key=sort_key)
return [best] if best else None
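# Minimal usage sketch (hypothetical records; assumes BookData fields are plain
# assignable attributes, as they are used throughout this module):
#   held = BookData()
#   held.year, held.edition_number, held.signature = 2018, 3, "ST 250 P99"
#   cand = BookData()
#   cand.year, cand.edition_number, cand.signature = 2021, 4, "ST 250 P99+2"
#   find_newer_edition(held, [cand])  # -> [cand]: same RVK root, strictly newer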
class NewEditionCheckerThread(QThread):
@@ -115,8 +169,8 @@ class NewEditionCheckerThread(QThread):
resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]]
# NEW: metrics signals
rateSignal = Signal(float)  # items per second ("it/s")
etaSignal = Signal(int)  # seconds remaining (-1 when unknown)
def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
super().__init__(parent)
@@ -157,54 +211,64 @@ class NewEditionCheckerThread(QThread):
def _process_book(
cls, book: "BookData"
) -> tuple["BookData", list["BookData"]] | None:
author = (
book.author.split(";")[0].replace(" ", "")
if (book.author and ";" in book.author)
else (book.author or "").replace(" ", "")
)
title = cls._clean_title(book.title or "")
# Query SWB
response: list[BookData] = SWB().getBooks(
[
"pica.bib=20735",
f"pica.tit={title.split(':')[0].strip()}",
# f"pica.per={author}",
]
)
# Remove same PPN
response = [entry for entry in response if entry.ppn != book.ppn]
for respo in response:
respo.link = "SWB"
# Query Lehmanns
with LehmannsClient() as client:
results = client.search_by_title(title, strict=True)
if results:
for res in results:
response.append(BookData().from_LehmannsSearchResult(res))
if not response:
"""Process one book; returns (original, [found editions]) or None on failure."""
if not book.title:
return None
response = filter_prefer_swb(response)
# Remove entries matching the same ISBN as the current book
response = [
entry
for entry in response
if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
]
response = [
entry
for entry in response
if book.publisher in entry.publisher
response: list["BookData"] = []
query = [
f"pica.tit={book.title}",
f"pica.vlg={book.publisher}",
]
if not response:
return None
swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
dnb_results = swb.getBooks(query)
new_editions = find_newer_edition(swb_result, dnb_results)
if new_editions is not None:
for new_edition in new_editions:
new_edition.library_location = cat.get_location(new_edition.ppn)
try:
isbn = (
str(new_edition.isbn[0])
if isinstance(new_edition.isbn, list)
else str(new_edition.isbn)
)
new_edition.link = (
f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
)
except (IndexError, TypeError):
isbn = None
new_edition.in_library = cat.in_library(new_edition.ppn)
response = new_editions
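# e.g. for a hypothetical new edition with ISBN 9783161484100 the generated
# link is: https://www.lehmanns.de/search/quick?mediatype_id=2&q=9783161484100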
# client = SWB()
# response: list["BookData"] = []
# # First, search by title only
# results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
# lehmanns = LehmannsClient()
# results = lehmanns.search_by_title(title)
# for result in results:
# if "(eBook)" in result.title:
# result.title = result.title.replace("(eBook)", "").strip()
# swb_results = client.getBooks(
# [
# f"pica.tit={result.title}",
# f"pica.vlg={result.publisher.split(',')[0]}",
# ]
# )
# for swb in swb_results:
# if swb.isbn == result.isbn:
# result.ppn = swb.ppn
# result.signature = swb.signature
# response.append(result)
# if (result.edition_number < swb.edition_number) and (
# swb.year > result.year
# ):
# response.append(result)
if not response:
return None
return (book, response)
@classmethod
@@ -240,7 +304,7 @@ class NewEditionCheckerThread(QThread):
return
# Up to 4 workers; ~20 items per worker
num_workers = min(4, max(1, ceil(total / 20)))
num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
chunks = self._split_evenly(self.entries, num_workers)
sizes = [len(ch) for ch in chunks]
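# Worked example (assuming an 8-core machine): THREAD_COUNT = max(8 - 2, 1) = 6;
# with total = 17 entries, ceil(17 / THREAD_MIN_ITEMS) = ceil(17 / 5) = 4, so
# num_workers = min(6, 4) = 4 and _split_evenly hands each worker 4-5 items.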