rework threads and also use app_ids where applicable
This commit is contained in:
@@ -1,19 +1,26 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
from math import ceil
|
||||
from queue import Empty, Queue
|
||||
from typing import List, Optional, Set, Union
|
||||
from time import monotonic # <-- NEW
|
||||
from typing import List, Optional
|
||||
|
||||
import loguru
|
||||
from PySide6.QtCore import QThread, Signal
|
||||
|
||||
from src import LOG_DIR
|
||||
|
||||
# from src.logic.webrequest import BibTextTransformer, WebRequest
|
||||
from src.backend.catalogue import Catalogue
|
||||
from src.logic import BookData
|
||||
from src.logic.lehmannsapi import LehmannsClient
|
||||
from src.logic.swb import SWB
|
||||
from src.logic.SRU import SWB
|
||||
|
||||
# Use all available cores minus two, but never fewer than one worker.
# os.cpu_count() returns None when the core count cannot be determined;
# fall back to a single core in that case instead of failing on `None - 2`.
THREAD_COUNT = max((os.cpu_count() or 1) - 2, 1)
# Divisor used when sizing the worker pool: the worker count is derived from
# ceil(total / THREAD_MIN_ITEMS) and capped at THREAD_COUNT.
THREAD_MIN_ITEMS = 5
|
||||
|
||||
log = loguru.logger
|
||||
log.remove()
|
||||
@@ -23,89 +30,136 @@ log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
|
||||
log.add(
|
||||
f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
|
||||
rotation="1 day",
|
||||
retention="1 month",
|
||||
retention="7 days",
|
||||
)
|
||||
|
||||
swb = SWB()
|
||||
dnb = SWB()
|
||||
cat = Catalogue()
|
||||
|
||||
def _norm_text(s: Optional[str]) -> str:
|
||||
if not s:
|
||||
return ""
|
||||
# lowercase, collapse whitespace, drop some punctuation
|
||||
s = s.lower()
|
||||
s = re.sub(r"[\s\-\u2013\u2014]+", " ", s) # spaces/dashes
|
||||
s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
|
||||
return s.strip()
|
||||
RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative RVK character set
|
||||
|
||||
|
||||
def _same_book(a: BookData, b: BookData) -> bool:
    """Heuristically decide whether two records describe the same book.

    The records match when their normalised ISBN sets intersect. Otherwise
    the normalised titles must both be present and equal. Author and year
    are then compared only when both records provide them, and a known
    mismatch in either one rejects the pair.

    Args:
        a: First record; ``isbn``, ``title``, ``author`` and ``year`` are read.
        b: Second record; the same attributes are read.

    Returns:
        ``True`` if the records are considered the same book.
    """
    if _norm_isbns(a.isbn) & _norm_isbns(b.isbn):
        return True

    # A strong title match is mandatory for the fallback comparison.
    title_a, title_b = _norm_text(a.title), _norm_text(b.title)
    if not title_a or title_a != title_b:
        return False

    # Authors only count when both are known. Two known but different
    # authors mean different books; previously that case fell through and
    # was accepted.
    author_a, author_b = _norm_text(a.author), _norm_text(b.author)
    if author_a and author_b and author_a != author_b:
        return False

    # str() so that a numeric year does not fail on .strip().
    year_a = str(a.year or "").strip()
    year_b = str(b.year or "").strip()
    if year_a and year_b:
        return year_a == year_b
    return True
|
||||
|
||||
|
||||
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
|
||||
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
|
||||
if value is None:
|
||||
return set()
|
||||
vals = value if isinstance(value, list) else [value]
|
||||
out: Set[str] = set()
|
||||
for v in vals:
|
||||
s = str(v)
|
||||
digits = re.sub(r"[^0-9Xx]", "", s)
|
||||
# keep 13-digit or 10-digit tokens
|
||||
m13 = re.findall(r"97[89]\d{10}", digits)
|
||||
if m13:
|
||||
out.update(m13)
|
||||
else:
|
||||
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
|
||||
out.update(x.upper() for x in m10)
|
||||
return out
|
||||
|
||||
|
||||
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
|
||||
def find_newer_edition(
|
||||
swb_result: BookData, dnb_result: List[BookData]
|
||||
) -> Optional[List[BookData]]:
|
||||
"""
|
||||
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
|
||||
Returns a NEW list (does not mutate the input).
|
||||
New edition if:
|
||||
- year > swb.year OR
|
||||
- edition_number > swb.edition_number
|
||||
BUT: discard any candidate with year < swb.year (if both years are known).
|
||||
|
||||
Same-work check:
|
||||
- Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
|
||||
- If both have signatures and RVKs differ -> skip.
|
||||
|
||||
Preferences (in order):
|
||||
1) RVK matches SWB
|
||||
2) Print over Online-Ressource
|
||||
3) Has signature
|
||||
4) Newer: (year desc, edition_number desc)
|
||||
"""
|
||||
swb_with_sig = [
|
||||
r
|
||||
for r in records
|
||||
if (r.link == "SWB") and (r.signature and r.signature.strip())
|
||||
]
|
||||
if not swb_with_sig:
|
||||
return list(records)
|
||||
|
||||
to_remove: Set[int] = set()
|
||||
def strip_copy_and_edition(s: str) -> str:
    """Return *s* without any '(N)' group and without a trailing '+N' suffix."""
    without_parens = re.sub(r"\(\s*\d+\s*\)", "", s)  # remove '(N)'
    return re.sub(r"\s*\+\s*\d+\s*$", "", without_parens)  # remove trailing '+N'
|
||||
|
||||
# For each URL entry, see if it matches any SWB-with-signature entry
|
||||
for idx, rec in enumerate(records):
|
||||
if not rec.link or not rec.link.lower().startswith("http"):
|
||||
continue
|
||||
for swb in swb_with_sig:
|
||||
if _same_book(swb, rec):
|
||||
to_remove.add(idx)
|
||||
break
|
||||
def extract_rvk_root(sig: Optional[str]) -> str:
    """Return the RVK root of a signature string.

    The signature is upper-cased, stripped of '(N)' / trailing '+N' markers
    and whitespace-normalised. The root is the leading run of one to three
    letters plus the RVK characters that directly follow it. If the
    signature does not start with a letter, the first space-separated token
    left after removing all disallowed characters is used instead.

    Args:
        sig: Raw signature, or ``None``.

    Returns:
        The extracted root, or ``""`` when nothing usable is found.
    """
    if not sig:
        return ""
    t = strip_copy_and_edition(sig.upper())
    t = re.sub(r"\s+", " ", t).strip()
    m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
    if m:
        return re.sub(r"\s+", " ", m.group(1)).strip()
    # RVK_ALLOWED is itself a bracketed character class, so only its contents
    # may be spliced into the negated class. Embedding the whole "[...]"
    # produced a nested set that never removed the disallowed characters.
    allowed_chars = RVK_ALLOWED[1:-1]
    cleaned = re.sub(rf"[^{allowed_chars} ]+", "", t).strip()
    return cleaned.split(" ")[0] if cleaned else ""
|
||||
|
||||
# Build filtered list
|
||||
return [rec for i, rec in enumerate(records) if i not in to_remove]
|
||||
def has_sig(b: BookData) -> bool:
    """Tell whether *b* has a truthy ``signature``; a missing attribute counts as none."""
    signature = getattr(b, "signature", None)
    return bool(signature)
|
||||
|
||||
def is_online(b: BookData) -> bool:
    """Tell whether the stripped ``media_type`` of *b* is exactly "Online-Ressource"."""
    media_type = getattr(b, "media_type", None) or ""
    return media_type.strip() == "Online-Ressource"
|
||||
|
||||
def is_print(b: BookData) -> bool:
    """Treat every record that is not an online resource as print."""
    online = is_online(b)
    return not online
|
||||
|
||||
def rvk_matches_swb(b: BookData) -> bool:
    """Tell whether *b* and the SWB record share the same RVK root.

    Both records need a signature; otherwise the answer is ``False``.
    """
    if has_sig(b) and has_sig(swb_result):
        return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
    return False
|
||||
|
||||
def strictly_newer(b: BookData) -> bool:
    """Tell whether *b* is a newer edition than the SWB record.

    A candidate is newer when its year or its edition number is greater than
    the SWB record's, each compared only when both sides are known.
    """
    cand_year, ref_year = b.year, swb_result.year
    years_known = cand_year is not None and ref_year is not None

    # Hard guard: a candidate with a known, older year is never accepted,
    # so a higher edition number cannot select something older.
    if years_known and cand_year < ref_year:
        return False

    cand_ed, ref_ed = b.edition_number, swb_result.edition_number
    editions_known = cand_ed is not None and ref_ed is not None

    by_year = years_known and cand_year > ref_year
    by_edition = editions_known and cand_ed > ref_ed
    return by_year or by_edition
|
||||
|
||||
swb_has_sig = has_sig(swb_result)
|
||||
swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
|
||||
|
||||
# 1) Filter: same work (by RVK if both have sigs) AND strictly newer
|
||||
candidates: List[BookData] = []
|
||||
for b in dnb_result:
|
||||
if has_sig(b) and swb_has_sig:
|
||||
if extract_rvk_root(b.signature) != swb_rvk:
|
||||
continue # different work
|
||||
if strictly_newer(b):
|
||||
candidates.append(b)
|
||||
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
# 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
|
||||
def pref_score(x: BookData) -> tuple[int, int, int]:
    """Rank *x* as (RVK matches SWB, is print, has signature), each 1 or 0."""
    return (
        int(rvk_matches_swb(x)),
        int(is_print(x)),
        int(has_sig(x)),
    )
|
||||
|
||||
by_ppn: dict[Optional[str], BookData] = {}
|
||||
for b in candidates:
|
||||
key = getattr(b, "ppn", None)
|
||||
prev = by_ppn.get(key)
|
||||
if prev is None or pref_score(b) > pref_score(prev):
|
||||
by_ppn[key] = b
|
||||
|
||||
deduped = list(by_ppn.values())
|
||||
if not deduped:
|
||||
return None
|
||||
|
||||
# 3) Final pick (single best)
|
||||
def sort_key(b: BookData):
    """Build the key for the final pick: preference flags, then year, then edition.

    An unknown year or edition number is replaced by -1.
    """
    year = -1 if b.year is None else b.year
    ed = -1 if b.edition_number is None else b.edition_number
    flags = (
        int(rvk_matches_swb(b)),
        int(is_print(b)),
        int(has_sig(b)),
    )
    return flags + (year, ed)
|
||||
|
||||
best = max(deduped, key=sort_key)
|
||||
return [best] if best else None
|
||||
|
||||
|
||||
class NewEditionCheckerThread(QThread):
|
||||
@@ -115,8 +169,8 @@ class NewEditionCheckerThread(QThread):
|
||||
resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]]
|
||||
|
||||
# NEW: metrics signals
|
||||
rateSignal = Signal(float) # items per second ("it/s")
|
||||
etaSignal = Signal(int) # seconds remaining (-1 when unknown)
|
||||
rateSignal = Signal(float) # items per second ("it/s")
|
||||
etaSignal = Signal(int) # seconds remaining (-1 when unknown)
|
||||
|
||||
def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
|
||||
super().__init__(parent)
|
||||
@@ -157,54 +211,64 @@ class NewEditionCheckerThread(QThread):
|
||||
def _process_book(
|
||||
cls, book: "BookData"
|
||||
) -> tuple["BookData", list["BookData"]] | None:
|
||||
author = (
|
||||
book.author.split(";")[0].replace(" ", "")
|
||||
if (book.author and ";" in book.author)
|
||||
else (book.author or "").replace(" ", "")
|
||||
)
|
||||
title = cls._clean_title(book.title or "")
|
||||
|
||||
# Query SWB
|
||||
response: list[BookData] = SWB().getBooks(
|
||||
[
|
||||
"pica.bib=20735",
|
||||
f"pica.tit={title.split(':')[0].strip()}",
|
||||
# f"pica.per={author}",
|
||||
]
|
||||
)
|
||||
|
||||
# Remove same PPN
|
||||
response = [entry for entry in response if entry.ppn != book.ppn]
|
||||
for respo in response:
|
||||
respo.link = "SWB"
|
||||
|
||||
# Query Lehmanns
|
||||
with LehmannsClient() as client:
|
||||
results = client.search_by_title(title, strict=True)
|
||||
if results:
|
||||
for res in results:
|
||||
response.append(BookData().from_LehmannsSearchResult(res))
|
||||
|
||||
if not response:
|
||||
"""Process one book; returns (original, [found editions]) or None on failure."""
|
||||
if not book.title:
|
||||
return None
|
||||
|
||||
response = filter_prefer_swb(response)
|
||||
|
||||
# Remove entries matching the same ISBN as the current book
|
||||
response = [
|
||||
entry
|
||||
for entry in response
|
||||
if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
|
||||
]
|
||||
response = [
|
||||
entry
|
||||
for entry in response
|
||||
if book.publisher in entry.publisher
|
||||
response: list["BookData"] = []
|
||||
query = [
|
||||
f"pica.tit={book.title}",
|
||||
f"pica.vlg={book.publisher}",
|
||||
]
|
||||
|
||||
if not response:
|
||||
return None
|
||||
swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
|
||||
dnb_results = swb.getBooks(query)
|
||||
new_editions = find_newer_edition(swb_result, dnb_results)
|
||||
|
||||
if new_editions is not None:
|
||||
for new_edition in new_editions:
|
||||
new_edition.library_location = cat.get_location(new_edition.ppn)
|
||||
try:
|
||||
isbn = (
|
||||
str(new_edition.isbn[0])
|
||||
if isinstance(new_edition.isbn, list)
|
||||
else str(new_edition.isbn)
|
||||
)
|
||||
new_edition.link = (
|
||||
f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
|
||||
)
|
||||
except (IndexError, TypeError):
|
||||
isbn = None
|
||||
new_edition.in_library = cat.in_library(new_edition.ppn)
|
||||
response = new_editions
|
||||
|
||||
# client = SWB()
|
||||
# response: list["BookData"] = []
|
||||
# # First, search by title only
|
||||
# results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
|
||||
|
||||
# lehmanns = LehmannsClient()
|
||||
# results = lehmanns.search_by_title(title)
|
||||
# for result in results:
|
||||
# if "(eBook)" in result.title:
|
||||
# result.title = result.title.replace("(eBook)", "").strip()
|
||||
# swb_results = client.getBooks(
|
||||
# [
|
||||
# f"pica.tit={result.title}",
|
||||
# f"pica.vlg={result.publisher.split(',')[0]}",
|
||||
# ]
|
||||
# )
|
||||
# for swb in swb_results:
|
||||
# if swb.isbn == result.isbn:
|
||||
# result.ppn = swb.ppn
|
||||
# result.signature = swb.signature
|
||||
# response.append(result)
|
||||
# if (result.edition_number < swb.edition_number) and (
|
||||
# swb.year > result.year
|
||||
# ):
|
||||
# response.append(result)
|
||||
if response == []:
|
||||
return None
|
||||
# Remove duplicates based on ppn
|
||||
return (book, response)
|
||||
|
||||
@classmethod
|
||||
@@ -240,7 +304,7 @@ class NewEditionCheckerThread(QThread):
|
||||
return
|
||||
|
||||
# Up to 4 workers; ~20 items per worker
|
||||
num_workers = min(4, max(1, ceil(total / 20)))
|
||||
num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
|
||||
chunks = self._split_evenly(self.entries, num_workers)
|
||||
sizes = [len(ch) for ch in chunks]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user