feat: implement NewEditionCheckerThread and related utility functions for book data processing
This commit is contained in:
149
src/backend/thread_neweditions.py
Normal file
149
src/backend/thread_neweditions.py
Normal file
@@ -0,0 +1,149 @@
|
||||
import re
|
||||
from typing import List, Optional, Set, Union
|
||||
|
||||
from PySide6.QtCore import QThread
|
||||
from PySide6.QtCore import Signal as Signal
|
||||
|
||||
from src.logic import BookData
|
||||
from src.logic.lehmannsapi import LehmannsClient
|
||||
from src.logic.swb import SWB
|
||||
|
||||
|
||||
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
|
||||
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
|
||||
if value is None:
|
||||
return set()
|
||||
vals = value if isinstance(value, list) else [value]
|
||||
out: Set[str] = set()
|
||||
for v in vals:
|
||||
s = str(v)
|
||||
digits = re.sub(r"[^0-9Xx]", "", s)
|
||||
# keep 13-digit or 10-digit tokens
|
||||
m13 = re.findall(r"97[89]\d{10}", digits)
|
||||
if m13:
|
||||
out.update(m13)
|
||||
else:
|
||||
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
|
||||
out.update(x.upper() for x in m10)
|
||||
return out
|
||||
|
||||
|
||||
def _norm_text(s: Optional[str]) -> str:
|
||||
if not s:
|
||||
return ""
|
||||
# lowercase, collapse whitespace, drop some punctuation
|
||||
s = s.lower()
|
||||
s = re.sub(r"[\s\-\u2013\u2014]+", " ", s) # spaces/dashes
|
||||
s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
|
||||
return s.strip()
|
||||
|
||||
|
||||
def _same_book(a: BookData, b: BookData) -> bool:
    """Heuristically decide whether two records describe the same book.

    The records match when their normalized ISBN sets intersect.  Otherwise
    they match only if all of the following hold:

    * both normalized titles are non-empty and equal;
    * the normalized authors are equal, or at least one author is missing;
    * the years are equal, or at least one year is missing.

    Args:
        a: First record.
        b: Second record.

    Returns:
        ``True`` if the records are considered the same book.
    """
    # An empty set on either side makes the intersection empty.
    if _norm_isbns(a.isbn) & _norm_isbns(b.isbn):
        return True

    # A strong title match is mandatory for the fallback.
    title_a, title_b = _norm_text(a.title), _norm_text(b.title)
    if not title_a or title_a != title_b:
        return False

    # If both records carry an author, the authors must agree.  Previously a
    # mismatch fell through to the "author missing" path and could still
    # report a match, so the author comparison had no effect at all.
    author_a, author_b = _norm_text(a.author), _norm_text(b.author)
    if author_a and author_b and author_a != author_b:
        return False

    # If both records carry a year, the years must agree.  str() is a no-op
    # for string years and keeps a non-string year from raising on .strip().
    year_a, year_b = str(a.year or "").strip(), str(b.year or "").strip()
    if year_a and year_b:
        return year_a == year_b
    return True
|
||||
|
||||
|
||||
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
    """Drop URL-linked records that duplicate a signed SWB record.

    A record counts as a signed SWB record when its ``link`` equals ``"SWB"``
    and its ``signature`` is non-blank.  A record whose ``link`` starts with
    ``http`` (case-insensitive) is removed when ``_same_book`` matches it
    against any signed SWB record.  All other records are kept in their
    original order.

    Args:
        records: Records to filter; the list itself is not modified.

    Returns:
        A new list containing the surviving records.
    """
    signed_swb = [
        rec
        for rec in records
        if rec.link == "SWB" and rec.signature and rec.signature.strip()
    ]
    if not signed_swb:
        # Nothing to prefer: hand back an unfiltered copy.
        return list(records)

    def _is_shadowed(rec: BookData) -> bool:
        """Return True if ``rec`` is a URL entry matching a signed SWB record."""
        link = rec.link
        if not link or not link.lower().startswith("http"):
            return False
        return any(_same_book(swb, rec) for swb in signed_swb)

    return [rec for rec in records if not _is_shadowed(rec)]
|
||||
|
||||
|
||||
class NewEditionCheckerThread(QThread):
    """Worker thread that searches for other editions of a list of books.

    For every entry, SWB is queried by title (with the fixed
    ``pica.bib=20735`` filter) and Lehmanns is searched by title.  The merged
    hits are passed through ``filter_prefer_swb`` and stored in
    ``self.results`` as ``(book, candidates)`` tuples.
    """

    # Emitted before each entry is processed: (1-based position, total).
    updateSignal = Signal(int, int)
    # Declared but not emitted anywhere in this class.
    updateProgress = Signal(int, int)
    # Emitted once at the start of run() with the number of entries.
    total_entries_signal = Signal(int)
    # Declared but not emitted anywhere in this class; results are exposed
    # through ``self.results``.
    resultsSignal = Signal(list)

    def __init__(self, entries: list[BookData], parent=None):
        """Store the entries to check.

        Args:
            entries: Books for which other editions should be looked up.
            parent: Optional Qt parent object.
        """
        super().__init__(parent)
        self.entries: list[BookData] = entries
        self.results: list[tuple[BookData, list[BookData]]] = []

    def run(self):
        """Process all entries and fill ``self.results``."""
        total = len(self.entries)
        # Emitted here instead of in __init__: no slot can be connected while
        # the object is still being constructed, so that emission was lost.
        self.total_entries_signal.emit(total)

        # enumerate() instead of entries.index(book): index() is O(n) per
        # iteration and reports the first equal entry, i.e. the wrong position
        # when the list contains duplicates.
        for position, book in enumerate(self.entries, start=1):
            self.updateSignal.emit(position, total)

            # Remove trailing punctuation, then parenthesised text.  A missing
            # title is treated as empty instead of raising inside the thread.
            title = (book.title or "").rstrip(" .:,;!?")
            title = re.sub(r"\s*\(.*\)", "", title)
            title = title.strip()

            # Only the part before the first colon is used for the SWB query.
            response: list[BookData] = SWB().getBooks(
                [
                    "pica.bib=20735",
                    f"pica.tit={title.split(':')[0].strip()}",
                ]
            )

            # in the response, remove the entry with the same ppn
            response = [entry for entry in response if entry.ppn != book.ppn]
            for entry in response:
                entry.link = "SWB"

            with LehmannsClient() as client:
                results = client.search_by_title(title, strict=True)
                # client.enrich_pages(results)
                if not results:
                    # NOTE(review): this skips the book even when SWB returned
                    # hits, and makes the empty-response check below
                    # unreachable.  Kept as-is; confirm whether SWB-only hits
                    # should be reported.
                    continue
                for res in results:
                    response.append(BookData().from_LehmannsSearchResult(res))

            if not response:
                continue

            # Drop Lehmanns hits that duplicate an SWB record which already
            # has a signature.
            response = filter_prefer_swb(response)

            self.results.append((book, response))
|
||||
Reference in New Issue
Block a user