refactor: reorganize imports and enhance logging setup; improve book processing logic in NewEditionCheckerThread

This commit is contained in:
2025-09-03 10:33:15 +02:00
parent 0e3199e289
commit b344d806e2

View File

@@ -1,31 +1,29 @@
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from math import ceil
from queue import Empty, Queue
from typing import List, Optional, Set, Union
from PySide6.QtCore import QThread
from PySide6.QtCore import Signal as Signal
import loguru
from PySide6.QtCore import QThread, Signal
from src import LOG_DIR
from src.logic import BookData
from src.logic.lehmannsapi import LehmannsClient
from src.logic.swb import SWB
# Module-wide logger object; the rest of this file logs through `log`.
log = loguru.logger
# Drop the handlers registered so far so that only the sinks added below
# receive records (loguru's `remove()` without an id removes all handlers).
log.remove()
# Console sink: records at INFO level and above are written to stdout.
log.add(sys.stdout, level="INFO")
# File sink inside LOG_DIR, rotated at 1 MB with a 10-day retention.
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
if value is None:
return set()
vals = value if isinstance(value, list) else [value]
out: Set[str] = set()
for v in vals:
s = str(v)
digits = re.sub(r"[^0-9Xx]", "", s)
# keep 13-digit or 10-digit tokens
m13 = re.findall(r"97[89]\d{10}", digits)
if m13:
out.update(m13)
else:
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
out.update(x.upper() for x in m10)
return out
# Additional file sink inside LOG_DIR. The file name is the date on which
# this statement runs, formatted as YYYY-MM-DD; the sink is configured with
# daily rotation and a one-month retention.
log.add(
f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
rotation="1 day",
retention="1 month",
)
def _norm_text(s: Optional[str]) -> str:
@@ -65,6 +63,25 @@ def _same_book(a: BookData, b: BookData) -> bool:
return False
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
if value is None:
return set()
vals = value if isinstance(value, list) else [value]
out: Set[str] = set()
for v in vals:
s = str(v)
digits = re.sub(r"[^0-9Xx]", "", s)
# keep 13-digit or 10-digit tokens
m13 = re.findall(r"97[89]\d{10}", digits)
if m13:
out.update(m13)
else:
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
out.update(x.upper() for x in m10)
return out
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
"""
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
@@ -94,56 +111,153 @@ def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
class NewEditionCheckerThread(QThread):
updateSignal = Signal(int, int)
updateProgress = Signal(int, int)
updateSignal = Signal(int, int) # (processed, total)
updateProgress = Signal(int, int) # (processed, total)
total_entries_signal = Signal(int)
resultsSignal = Signal(list)
resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]]
def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
    """Create the checker thread.

    Args:
        entries: Books to check for newer editions. ``None`` (the default)
            is treated as "no entries yet" and stored as an empty list.
        parent: Optional parent object, forwarded to the base-class
            constructor.

    No signal is emitted from the constructor: nothing can be connected to
    this object's signals before it exists, and ``len(entries)`` would fail
    for ``entries=None``.
    """
    super().__init__(parent)
    # Books to process; a fresh list is used when none were supplied.
    self.entries: list["BookData"] = entries if entries is not None else []
    # Collected (book, candidate editions) pairs.
    self.results: list[tuple["BookData", list["BookData"]]] = []
def reset(self):
    """Forget the queued entries and all collected results.

    Both attributes are rebound to new empty lists rather than cleared in
    place, so list objects that were handed out earlier keep their contents.
    """
    self.entries = []
    self.results = []
# ---------- internal helpers ----------
@staticmethod
def _split_evenly(items: list, parts: int) -> list[list]:
"""Split items as evenly as possible into `parts` chunks (no empty tails)."""
if parts <= 1 or len(items) <= 1:
return [items]
n = len(items)
base = n // parts
extra = n % parts
chunks = []
i = 0
for k in range(parts):
size = base + (1 if k < extra else 0)
if size == 0:
continue
chunks.append(items[i : i + size])
i += size
return chunks
@staticmethod
def _clean_title(raw: str) -> str:
title = raw.rstrip(" .:,;!?")
title = re.sub(r"\s*\(.*\)", "", title)
return title.strip()
@classmethod
def _process_book(
    cls, book: "BookData"
) -> tuple["BookData", list["BookData"]] | None:
    """Collect candidate editions for ``book`` from SWB and Lehmanns.

    Steps:
      1. Clean the title with ``_clean_title``.
      2. Query SWB with the fixed ``pica.bib`` filter and the part of the
         title before the first ``:``; drop the hit with the same PPN as
         ``book`` and mark the remaining hits with ``link = "SWB"``.
      3. Query Lehmanns by the full cleaned title (``strict=True``) and
         append the converted hits.
      4. Run ``filter_prefer_swb`` over the combined list, then drop every
         candidate that shares a normalised ISBN with ``book``.

    Returns:
        ``(book, candidates)`` when at least one candidate remains,
        otherwise ``None``.

    Exceptions raised by the SWB or Lehmanns clients are not handled here.
    """
    title = cls._clean_title(book.title or "")

    # Query SWB. NOTE: only the pica.bib filter and the main title are sent;
    # no author filter (pica.per) is part of the query.
    response: list[BookData] = SWB().getBooks(
        [
            "pica.bib=20735",
            f"pica.tit={title.split(':')[0].strip()}",
        ]
    )

    # Remove same PPN
    response = [entry for entry in response if entry.ppn != book.ppn]
    for respo in response:
        respo.link = "SWB"

    # Query Lehmanns
    with LehmannsClient() as client:
        results = client.search_by_title(title, strict=True)
        if results:
            response.extend(
                BookData().from_LehmannsSearchResult(res) for res in results
            )

    if not response:
        return None

    response = filter_prefer_swb(response)

    # Remove entries matching the same ISBN as the current book. The book's
    # own ISBN set does not change per candidate, so it is computed once.
    own_isbns = _norm_isbns(book.isbn)
    response = [
        entry
        for entry in response
        if not (_norm_isbns(entry.isbn) & own_isbns)
    ]

    if not response:
        return None

    return (book, response)
@classmethod
def _worker(cls, items: list["BookData"], q: Queue) -> None:
    """Process one chunk of books and report through ``q``.

    Messages are ``(kind, payload)`` tuples:
      * ``("result", (book, candidates))`` for every book for which
        ``_process_book`` returned a non-``None`` value,
      * ``("progress", 1)`` after every book, with or without a result,
      * ``("done", None)`` exactly once at the end; it is sent from a
        ``finally`` clause, so it is also sent if the loop is aborted.

    Args:
        items: The books of this chunk.
        q: Queue on which the messages are put.
    """
    try:
        for book in items:
            try:
                result = cls._process_book(book)
            except Exception:
                # Best effort: one failing book must not abort the whole
                # chunk, but the failure is logged instead of being
                # silently discarded.
                log.exception(
                    "Failed to process book (ppn={}); skipping it.",
                    getattr(book, "ppn", None),
                )
                result = None
            if result is not None:
                q.put(("result", result))
            q.put(("progress", 1))
    finally:
        q.put(("done", None))
# ---------- thread entry point ----------
def run(self):
    """Thread entry point: check all entries and publish the results.

    Emits ``total_entries_signal`` with the number of entries first. With no
    entries, ``resultsSignal`` is emitted with an empty list and the method
    returns. Otherwise the entries are split into chunks, each chunk is
    handed to ``_worker`` on a thread pool, and the messages the workers put
    on a shared queue are consumed here: progress messages are forwarded via
    ``updateSignal`` and ``updateProgress``, result messages are appended to
    ``self.results``. After every worker has reported "done",
    ``resultsSignal`` is emitted with ``self.results``.
    """
    total = len(self.entries)
    self.total_entries_signal.emit(total)

    if total == 0:
        log.debug("No entries to process.")
        self.resultsSignal.emit([])
        return

    # Up to 4 workers; ~20 items per worker
    worker_count = min(4, max(1, ceil(total / 20)))
    batches = self._split_evenly(self.entries, worker_count)
    sizes = [len(batch) for batch in batches]

    inbox: Queue = Queue()
    processed = 0
    remaining = len(batches)  # workers that have not reported "done" yet

    with ThreadPoolExecutor(max_workers=len(batches)) as pool:
        futures = [pool.submit(self._worker, batch, inbox) for batch in batches]

        log.info(
            f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
        )
        for idx, sz in enumerate(sizes, 1):
            log.debug(f"Thread {idx}: {sz} entries")

        # Aggregate progress/results
        while remaining:
            try:
                kind, payload = inbox.get(timeout=0.1)
            except Empty:
                continue

            if kind == "done":
                remaining -= 1
            elif kind == "result":
                self.results.append(payload)
            elif kind == "progress":
                processed += int(payload)
                self.updateSignal.emit(processed, total)
                self.updateProgress.emit(processed, total)

    self.resultsSignal.emit(self.results)