feat: implement NewEditionCheckerThread and related utility functions for book data processing

This commit is contained in:
2025-09-01 14:31:23 +02:00
parent 7da2b3f65d
commit 5f15352401
2 changed files with 169 additions and 6 deletions

View File

@@ -1,8 +1,22 @@
from .semester import Semester
from .database import Database
# Public names of this package. Every entry listed here is bound by one of
# the relative imports in this module, so `from <package> import *` exposes
# exactly these classes and helper functions.
__all__ = [
"AdminCommands",
"Semester",
"AutoAdder",
"AvailChecker",
"BookGrabber",
"Database",
"DocumentationThread",
"NewEditionCheckerThread",
"recreateElsaFile",
"recreateFile",
]
from .admin_console import AdminCommands
from .thread_bookgrabber import BookGrabber
from .threads_availchecker import AvailChecker
from .threads_autoadder import AutoAdder
from .create_file import recreateElsaFile, recreateFile
from .database import Database
from .documentation_thread import DocumentationThread
from .create_file import recreateFile, recreateElsaFile
from .semester import Semester
from .thread_bookgrabber import BookGrabber
from .thread_neweditions import NewEditionCheckerThread
from .threads_autoadder import AutoAdder
from .threads_availchecker import AvailChecker

View File

@@ -0,0 +1,149 @@
import re
from typing import List, Optional, Set, Union
from PySide6.QtCore import QThread
from PySide6.QtCore import Signal as Signal
from src.logic import BookData
from src.logic.lehmannsapi import LehmannsClient
from src.logic.swb import SWB
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
    """Extract normalized ISBNs from a single value or a list of values.

    Each value is converted to ``str`` and every character other than the
    digits ``0-9`` and ``X``/``x`` is discarded. If the remaining text
    contains 13-character tokens starting with ``978`` or ``979``, those are
    collected. Only when a value yields no such token is it scanned for
    10-character tokens (nine digits plus a digit or ``X``), which are
    stored upper-cased.

    Args:
        value: One ISBN-bearing value, a list of them, or ``None``.

    Returns:
        The set of ISBN strings found; empty when ``value`` is ``None`` or
        nothing matches.
    """
    if value is None:
        return set()

    candidates = value if isinstance(value, list) else [value]
    found: Set[str] = set()
    for raw in candidates:
        # Separators (hyphens, spaces, labels) are dropped before matching.
        compact = re.sub(r"[^0-9Xx]", "", str(raw))
        isbn13 = re.findall(r"97[89]\d{10}", compact)
        if isbn13:
            found |= set(isbn13)
            continue
        # No ISBN-13 in this value: fall back to ISBN-10 shaped tokens.
        found |= {token.upper() for token in re.findall(r"\d{9}[0-9Xx]", compact)}
    return found
def _norm_text(s: Optional[str]) -> str:
    """Normalize free text (e.g. a title or author) for equality comparison.

    The text is lower-cased, a fixed set of punctuation characters is
    removed, every run of whitespace, hyphens, en dashes and em dashes is
    replaced by a single space, and leading/trailing spaces are trimmed.

    Punctuation is removed *before* whitespace is collapsed. Doing it the
    other way round leaves a double space wherever a punctuation mark was
    surrounded by blanks, so ``"Title : Subtitle"`` and ``"Title: Subtitle"``
    would not normalize to the same string.

    Args:
        s: Text to normalize; ``None`` and the empty string are accepted.

    Returns:
        The normalized text, or ``""`` for falsy input.
    """
    if not s:
        return ""
    s = s.lower()
    # Drop punctuation first so that no gap is left behind ...
    s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
    # ... then squeeze whitespace and dash variants into single spaces.
    s = re.sub(r"[\s\-\u2013\u2014]+", " ", s)
    return s.strip()
def _same_book(a: BookData, b: BookData) -> bool:
    """Heuristically decide whether two records describe the same book.

    The records are considered the same when their normalized ISBN sets
    intersect. Otherwise the decision falls back to normalized metadata:

    * the titles must both be non-empty and equal;
    * if both records carry an author, the authors must be equal;
    * if both records carry a year, the years must be equal.

    A missing author or year on either side is not treated as a conflict.

    Args:
        a: First record; its ``isbn``, ``title``, ``author`` and ``year``
            attributes are read.
        b: Second record; the same attributes are read.

    Returns:
        ``True`` if the records are judged to be the same book.
    """
    if _norm_isbns(a.isbn) & _norm_isbns(b.isbn):
        return True

    # A strong title match is mandatory for the metadata fallback.
    title_a, title_b = _norm_text(a.title), _norm_text(b.title)
    if not title_a or title_a != title_b:
        return False

    # When both sides name an author, a mismatch rules the pair out.
    # Previously a mismatch fell through to the title-only path, so the
    # author never influenced the result.
    author_a, author_b = _norm_text(a.author), _norm_text(b.author)
    if author_a and author_b and author_a != author_b:
        return False

    # str() keeps this working when a source delivers the year as a number.
    year_a = str(a.year or "").strip()
    year_b = str(b.year or "").strip()
    if year_a and year_b:
        return year_a == year_b
    return True
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
    """Drop URL-linked records that duplicate a signed SWB record.

    A record counts as a signed SWB record when its ``link`` equals
    ``"SWB"`` and its ``signature`` is non-blank. Every record whose
    ``link`` starts with ``http`` (case-insensitive) and which ``_same_book``
    matches against any signed SWB record is left out of the result. All
    other records are kept in their original order.

    Args:
        records: Records to filter; the list itself is not modified.

    Returns:
        A new list with the duplicate URL records removed.
    """
    signed_swb = []
    for entry in records:
        if entry.link != "SWB":
            continue
        if entry.signature and entry.signature.strip():
            signed_swb.append(entry)

    # Nothing to prefer: hand back an unfiltered copy.
    if not signed_swb:
        return list(records)

    kept = []
    for candidate in records:
        link = candidate.link
        is_url = bool(link) and link.lower().startswith("http")
        if is_url and any(_same_book(swb, candidate) for swb in signed_swb):
            continue
        kept.append(candidate)
    return kept
class NewEditionCheckerThread(QThread):
    """Worker thread that searches SWB and Lehmanns for other editions.

    For every entry the thread queries SWB by title, queries Lehmanns by
    title, merges both result lists, removes Lehmanns duplicates of signed
    SWB records via ``filter_prefer_swb`` and stores ``(book, candidates)``
    in ``self.results``.

    Signals:
        updateSignal(int, int): 1-based position of the entry being
            processed and the total number of entries.
        updateProgress(int, int): declared for consumers; not emitted by
            this class.
        total_entries_signal(int): number of entries, emitted once when
            ``run`` starts.
        resultsSignal(list): the complete ``results`` list, emitted once
            when ``run`` has processed all entries.
    """

    updateSignal = Signal(int, int)
    updateProgress = Signal(int, int)
    total_entries_signal = Signal(int)
    resultsSignal = Signal(list)

    def __init__(self, entries: list[BookData], parent=None):
        """Store the entries to check.

        Args:
            entries: Books for which other editions should be looked up.
            parent: Optional Qt parent object.
        """
        super().__init__(parent)
        self.entries: list[BookData] = entries
        # No signal is emitted here: nothing can be connected to a signal of
        # an object that is still being constructed, so the emission would
        # be lost. The total is announced at the start of run() instead.
        self.results: list[tuple[BookData, list[BookData]]] = []

    def run(self):
        """Look up candidates for every entry and publish the results."""
        total = len(self.entries)
        self.total_entries_signal.emit(total)

        # enumerate() gives the true position even when two entries compare
        # equal, and avoids a linear list.index() search per iteration.
        for position, book in enumerate(self.entries, start=1):
            self.updateSignal.emit(position, total)

            # Remove trailing punctuation from the title.
            title = book.title.rstrip(" .:,;!?")
            # Remove parenthesised text (the pattern is greedy and spans
            # from the first "(" to the last ")").
            title = re.sub(r"\s*\(.*\)", "", title)
            title = title.strip()

            # SWB is queried with the part of the title before the first
            # colon only; filtering by author is currently not applied.
            response: list[BookData] = SWB().getBooks(
                [
                    "pica.bib=20735",
                    f"pica.tit={title.split(':')[0].strip()}",
                ]
            )
            # The entry itself is not a candidate: drop the record that
            # shares its PPN, then mark the remaining ones as SWB hits.
            response = [entry for entry in response if entry.ppn != book.ppn]
            for swb_entry in response:
                swb_entry.link = "SWB"

            with LehmannsClient() as client:
                results = client.search_by_title(title, strict=True)
                # NOTE(review): an entry without Lehmanns hits is skipped
                # even when SWB returned records. This mirrors the existing
                # behaviour (and makes a later "response is empty" check
                # unnecessary) - confirm that SWB-only hits should really
                # be discarded.
                if not results:
                    continue
                response.extend(
                    BookData().from_LehmannsSearchResult(res) for res in results
                )

            # Where a signed SWB record and a Lehmanns record describe the
            # same book, keep only the SWB record.
            response = filter_prefer_swb(response)
            self.results.append((book, response))

        # Publish the collected results; the signal was declared but never
        # emitted before, so connected slots were never called.
        self.resultsSignal.emit(self.results)