refactor: reorganize imports and enhance logging setup; improve book processing logic in NewEditionCheckerThread
This commit is contained in:
@@ -1,31 +1,29 @@
|
|||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
from datetime import datetime
|
||||||
|
from math import ceil
|
||||||
|
from queue import Empty, Queue
|
||||||
from typing import List, Optional, Set, Union
|
from typing import List, Optional, Set, Union
|
||||||
|
|
||||||
from PySide6.QtCore import QThread
|
import loguru
|
||||||
from PySide6.QtCore import Signal as Signal
|
from PySide6.QtCore import QThread, Signal
|
||||||
|
|
||||||
|
from src import LOG_DIR
|
||||||
from src.logic import BookData
|
from src.logic import BookData
|
||||||
from src.logic.lehmannsapi import LehmannsClient
|
from src.logic.lehmannsapi import LehmannsClient
|
||||||
from src.logic.swb import SWB
|
from src.logic.swb import SWB
|
||||||
|
|
||||||
|
# Module-wide logger: every function in this file logs through the shared
# loguru logger object.
log = loguru.logger

# Called without arguments, remove() drops every handler registered on the
# logger so far, so only the sinks configured below receive records.
# NOTE(review): loguru's logger is shared process-wide, so this also affects
# handlers added by other modules before this import -- confirm that is intended.
log.remove()

# Console sink: INFO and above only (log.debug(...) calls go to the files).
log.add(sys.stdout, level="INFO")

# Size-rotated application log.
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")

# Date-stamped log file. The date in the file name is computed once, when this
# module is imported; later rollover is left to loguru's "1 day" rotation.
log.add(
    f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
    rotation="1 day",
    retention="1 month",
)
|
|
||||||
|
|
||||||
|
|
||||||
def _norm_text(s: Optional[str]) -> str:
|
def _norm_text(s: Optional[str]) -> str:
|
||||||
@@ -65,6 +63,25 @@ def _same_book(a: BookData, b: BookData) -> bool:
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
|
||||||
|
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
|
||||||
|
if value is None:
|
||||||
|
return set()
|
||||||
|
vals = value if isinstance(value, list) else [value]
|
||||||
|
out: Set[str] = set()
|
||||||
|
for v in vals:
|
||||||
|
s = str(v)
|
||||||
|
digits = re.sub(r"[^0-9Xx]", "", s)
|
||||||
|
# keep 13-digit or 10-digit tokens
|
||||||
|
m13 = re.findall(r"97[89]\d{10}", digits)
|
||||||
|
if m13:
|
||||||
|
out.update(m13)
|
||||||
|
else:
|
||||||
|
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
|
||||||
|
out.update(x.upper() for x in m10)
|
||||||
|
return out
|
||||||
|
|
||||||
|
|
||||||
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
|
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
|
||||||
"""
|
"""
|
||||||
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
|
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
|
||||||
@@ -94,56 +111,153 @@ def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
|
|||||||
|
|
||||||
|
|
||||||
class NewEditionCheckerThread(QThread):
    """Worker thread that searches SWB and Lehmanns for other records of each book.

    The entries are split into chunks which are processed by a small thread
    pool. Progress and the collected matches are reported through Qt signals:

    * ``total_entries_signal(total)`` -- emitted once when ``run`` starts.
    * ``updateSignal(processed, total)`` / ``updateProgress(processed, total)``
      -- both emitted after every processed entry.
    * ``resultsSignal(results)`` -- emitted once at the end with a list of
      ``(book, candidates)`` tuples; entries without candidates are omitted.
    """

    updateSignal = Signal(int, int)  # (processed, total)
    updateProgress = Signal(int, int)  # (processed, total)
    total_entries_signal = Signal(int)
    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]

    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
        """Store the entries to check.

        Args:
            entries: Books to look up; ``None`` is treated as an empty list.
            parent: Optional Qt parent object, forwarded to ``QThread``.
        """
        super().__init__(parent)
        self.entries: list["BookData"] = entries if entries is not None else []
        self.results: list[tuple["BookData", list["BookData"]]] = []

    def reset(self):
        """Clear both the pending entries and the collected results."""
        self.entries = []
        self.results = []

    # ---------- internal helpers ----------

    @staticmethod
    def _split_evenly(items: list, parts: int) -> list[list]:
        """Split items as evenly as possible into `parts` chunks (no empty tails).

        The first ``len(items) % parts`` chunks get one extra element. With
        ``parts <= 1`` or at most one item, the input list itself is returned
        as the only chunk.
        """
        if parts <= 1 or len(items) <= 1:
            return [items]
        n = len(items)
        base = n // parts
        extra = n % parts
        chunks = []
        i = 0
        for k in range(parts):
            size = base + (1 if k < extra else 0)
            if size == 0:
                # More parts than items: skip instead of emitting empty chunks.
                continue
            chunks.append(items[i : i + size])
            i += size
        return chunks

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Return *raw* without parenthesised text and trailing punctuation.

        Everything from the first ``(`` to the last ``)`` (plus the whitespace
        in front of it) is removed, then surrounding whitespace and trailing
        ``. : , ; ! ?`` characters are stripped.
        """
        title = re.sub(r"\s*\(.*\)", "", raw)
        # Strip punctuation AFTER dropping the parenthetical so that
        # "Title. (2nd ed.)" and "Title (2nd ed.)." both become "Title".
        return title.strip().rstrip(" .:,;!?").rstrip()

    @classmethod
    def _process_book(
        cls, book: "BookData"
    ) -> Optional[tuple["BookData", list["BookData"]]]:
        """Look up one book in SWB and Lehmanns and return its candidates.

        Args:
            book: The entry to search for; ``title``, ``ppn`` and ``isbn``
                are read.

        Returns:
            ``(book, candidates)`` or ``None`` when nothing remains after
            filtering.
        """
        title = cls._clean_title(book.title or "")

        # Query SWB with the part of the title before the first colon.
        # NOTE(review): "pica.bib=20735" presumably restricts the search to
        # one library's holdings -- confirm. An author filter (pica.per) was
        # disabled in the original code and is not applied here either.
        response: list[BookData] = SWB().getBooks(
            [
                "pica.bib=20735",
                f"pica.tit={title.split(':')[0].strip()}",
            ]
        )

        # Remove same PPN (the record of the book itself) and tag the rest
        # as coming from SWB.
        response = [entry for entry in response if entry.ppn != book.ppn]
        for respo in response:
            respo.link = "SWB"

        # Query Lehmanns with the full cleaned title.
        with LehmannsClient() as client:
            results = client.search_by_title(title, strict=True)
            if results:
                for res in results:
                    response.append(BookData().from_LehmannsSearchResult(res))

        if not response:
            return None

        response = filter_prefer_swb(response)

        # Remove entries matching the same ISBN as the current book. The
        # book's own ISBN set is computed once rather than per candidate.
        own_isbns = _norm_isbns(book.isbn)
        response = [
            entry
            for entry in response
            if own_isbns.isdisjoint(_norm_isbns(entry.isbn))
        ]

        if not response:
            return None

        return (book, response)

    @classmethod
    def _worker(cls, items: list["BookData"], q: Queue) -> None:
        """Worker for one chunk; pushes ('result', ...), ('progress', 1), and ('done', None).

        A failure while processing one book is logged and counted as
        "no result" so the remaining books of the chunk are still handled.
        ``('done', None)`` is always pushed, even if the loop itself fails,
        because ``run`` waits for one such message per chunk.
        """
        try:
            for book in items:
                try:
                    result = cls._process_book(book)
                except Exception:
                    # Best effort: keep going, but leave a trace in the log
                    # instead of discarding the error silently.
                    log.exception(
                        "Processing failed for entry with PPN {}",
                        getattr(book, "ppn", None),
                    )
                    result = None
                if result is not None:
                    q.put(("result", result))
                q.put(("progress", 1))
        finally:
            q.put(("done", None))

    # ---------- thread entry point ----------

    def run(self):
        """Process all entries in parallel and emit progress and results.

        Results are appended to ``self.results`` in the order the workers
        deliver them, which is not necessarily the order of ``self.entries``.
        """
        total = len(self.entries)
        self.total_entries_signal.emit(total)

        if total == 0:
            log.debug("No entries to process.")
            self.resultsSignal.emit([])
            return

        # Up to 4 workers; ~20 items per worker
        num_workers = min(4, max(1, ceil(total / 20)))
        chunks = self._split_evenly(self.entries, num_workers)
        sizes = [len(ch) for ch in chunks]

        q: Queue = Queue()
        processed = 0
        finished_workers = 0

        with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
            futures = [ex.submit(self._worker, ch, q) for ch in chunks]

            log.info(
                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
            )
            for idx, sz in enumerate(sizes, 1):
                log.debug(f"Thread {idx}: {sz} entries")

            # Aggregate progress/results. This loop runs inside the executor
            # context so signals are emitted while the workers are still
            # busy; it ends once every chunk has reported "done".
            while finished_workers < len(chunks):
                try:
                    kind, payload = q.get(timeout=0.1)
                except Empty:
                    continue

                if kind == "progress":
                    processed += int(payload)
                    self.updateSignal.emit(processed, total)
                    self.updateProgress.emit(processed, total)
                elif kind == "result":
                    self.results.append(payload)
                elif kind == "done":
                    finished_workers += 1

        self.resultsSignal.emit(self.results)
|
||||||
|
|||||||
Reference in New Issue
Block a user