merge main into dev #13
@@ -1,31 +1,29 @@
|
||||
import re
|
||||
import sys
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from datetime import datetime
|
||||
from math import ceil
|
||||
from queue import Empty, Queue
|
||||
from typing import List, Optional, Set, Union
|
||||
|
||||
from PySide6.QtCore import QThread
|
||||
from PySide6.QtCore import Signal as Signal
|
||||
import loguru
|
||||
from PySide6.QtCore import QThread, Signal
|
||||
|
||||
from src import LOG_DIR
|
||||
from src.logic import BookData
|
||||
from src.logic.lehmannsapi import LehmannsClient
|
||||
from src.logic.swb import SWB
|
||||
|
||||
# Application-wide logger configuration (loguru).
log = loguru.logger

# Replace loguru's default stderr sink with our own sinks, in order:
# console (INFO+) first, then a size-rotated file under the log directory.
log.remove()
for _sink, _opts in (
    (sys.stdout, {"level": "INFO"}),
    (f"{LOG_DIR}/application.log", {"rotation": "1 MB", "retention": "10 days"}),
):
    log.add(_sink, **_opts)
|
||||
|
||||
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
|
||||
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
|
||||
if value is None:
|
||||
return set()
|
||||
vals = value if isinstance(value, list) else [value]
|
||||
out: Set[str] = set()
|
||||
for v in vals:
|
||||
s = str(v)
|
||||
digits = re.sub(r"[^0-9Xx]", "", s)
|
||||
# keep 13-digit or 10-digit tokens
|
||||
m13 = re.findall(r"97[89]\d{10}", digits)
|
||||
if m13:
|
||||
out.update(m13)
|
||||
else:
|
||||
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
|
||||
out.update(x.upper() for x in m10)
|
||||
return out
|
||||
# Per-day log file named after today's date, rotated daily, kept one month.
# NOTE(review): this adds a dated sink on top of the application.log sink
# configured earlier in the module — confirm both are intended (looks like
# merge residue).
_daily_log_path = f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log"
log.add(_daily_log_path, rotation="1 day", retention="1 month")
|
||||
|
||||
|
||||
def _norm_text(s: Optional[str]) -> str:
|
||||
@@ -65,6 +63,25 @@ def _same_book(a: BookData, b: BookData) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
|
||||
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
|
||||
if value is None:
|
||||
return set()
|
||||
vals = value if isinstance(value, list) else [value]
|
||||
out: Set[str] = set()
|
||||
for v in vals:
|
||||
s = str(v)
|
||||
digits = re.sub(r"[^0-9Xx]", "", s)
|
||||
# keep 13-digit or 10-digit tokens
|
||||
m13 = re.findall(r"97[89]\d{10}", digits)
|
||||
if m13:
|
||||
out.update(m13)
|
||||
else:
|
||||
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
|
||||
out.update(x.upper() for x in m10)
|
||||
return out
|
||||
|
||||
|
||||
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
|
||||
"""
|
||||
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
|
||||
@@ -94,56 +111,153 @@ def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
|
||||
|
||||
|
||||
class NewEditionCheckerThread(QThread):
    """Background thread that checks each BookData entry for newer editions.

    For every entry, the SWB union catalogue and the Lehmanns shop are
    queried by title, hits are merged and filtered (SWB records with a
    signature win over HTTP(S) duplicates, entries sharing the book's own
    ISBN are dropped), and the aggregated matches are emitted via signals.

    Signals:
        updateSignal / updateProgress: (processed, total) progress counters.
        total_entries_signal: total number of entries, emitted once at start.
        resultsSignal: list[tuple[BookData, list[BookData]]] of all matches.
    """

    # NOTE(review): the merge left duplicated Signal declarations and two
    # __init__/run definitions in this class; this version keeps one of each.
    updateSignal = Signal(int, int)  # (processed, total)
    updateProgress = Signal(int, int)  # (processed, total)
    total_entries_signal = Signal(int)
    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]

    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
        """Store the entries to process; None becomes an empty work list."""
        super().__init__(parent)
        self.entries: list["BookData"] = entries if entries is not None else []
        self.results: list[tuple["BookData", list["BookData"]]] = []

    def reset(self) -> None:
        """Clear pending entries and any collected results."""
        self.entries = []
        self.results = []

    # ---------- internal helpers ----------

    @staticmethod
    def _split_evenly(items: list, parts: int) -> list[list]:
        """Split items as evenly as possible into `parts` chunks (no empty tails)."""
        if parts <= 1 or len(items) <= 1:
            return [items]
        n = len(items)
        base = n // parts
        extra = n % parts  # the first `extra` chunks get one item more
        chunks = []
        i = 0
        for k in range(parts):
            size = base + (1 if k < extra else 0)
            if size == 0:
                continue
            chunks.append(items[i : i + size])
            i += size
        return chunks

    @staticmethod
    def _clean_title(raw: str) -> str:
        """Strip trailing punctuation and any trailing parenthesised suffix."""
        title = raw.rstrip(" .:,;!?")
        title = re.sub(r"\s*\(.*\)", "", title)
        return title.strip()

    @classmethod
    def _process_book(
        cls, book: "BookData"
    ) -> tuple["BookData", list["BookData"]] | None:
        """Look up candidate newer editions of `book` in SWB and Lehmanns.

        Returns:
            (book, candidates) when at least one relevant record survives
            filtering, otherwise None.
        """
        # Retained for the commented-out pica.per query below.
        author = (
            book.author.split(";")[0].replace(" ", "")
            if (book.author and ";" in book.author)
            else (book.author or "").replace(" ", "")
        )
        title = cls._clean_title(book.title or "")

        # Query SWB; the title is truncated at the first colon (subtitle cut).
        response: list[BookData] = SWB().getBooks(
            [
                "pica.bib=20735",
                f"pica.tit={title.split(':')[0].strip()}",
                # f"pica.per={author}",
            ]
        )

        # Drop the record for the book itself and tag the rest as SWB hits.
        response = [entry for entry in response if entry.ppn != book.ppn]
        for respo in response:
            respo.link = "SWB"

        # Query the Lehmanns shop for the same title.
        with LehmannsClient() as client:
            results = client.search_by_title(title, strict=True)
        if results:
            for res in results:
                response.append(BookData().from_LehmannsSearchResult(res))

        # Prefer SWB records (with signature) over HTTP(S) duplicates.
        response = filter_prefer_swb(response)

        # Remove entries matching the same ISBN as the current book.
        response = [
            entry
            for entry in response
            if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
        ]

        if not response:
            return None

        return (book, response)

    @classmethod
    def _worker(cls, items: list["BookData"], q: Queue) -> None:
        """Worker for one chunk; pushes ('result', ...), ('progress', 1), and ('done', None)."""
        try:
            for book in items:
                try:
                    result = cls._process_book(book)
                except Exception:
                    # Best-effort: one failing lookup must not kill the chunk,
                    # but do record it instead of swallowing silently.
                    log.exception(
                        f"Edition lookup failed for PPN {getattr(book, 'ppn', '?')}"
                    )
                    result = None
                if result is not None:
                    q.put(("result", result))
                q.put(("progress", 1))
        finally:
            # Always announce completion so run() can terminate its loop.
            q.put(("done", None))

    # ---------- thread entry point ----------

    def run(self):
        """Fan the entries out to worker threads and aggregate their output."""
        total = len(self.entries)
        self.total_entries_signal.emit(total)

        if total == 0:
            log.debug("No entries to process.")
            self.resultsSignal.emit([])
            return

        # Up to 4 workers; ~20 items per worker
        num_workers = min(4, max(1, ceil(total / 20)))
        chunks = self._split_evenly(self.entries, num_workers)
        sizes = [len(ch) for ch in chunks]

        q: Queue = Queue()
        processed = 0
        finished_workers = 0

        with ThreadPoolExecutor(max_workers=len(chunks)) as ex:
            futures = [ex.submit(self._worker, ch, q) for ch in chunks]

            log.info(
                f"Launched {len(futures)} worker thread(s) for {total} entries: {sizes} entries per thread."
            )
            for idx, sz in enumerate(sizes, 1):
                log.debug(f"Thread {idx}: {sz} entries")

            # Aggregate progress/results until every worker reported 'done'.
            while finished_workers < len(chunks):
                try:
                    kind, payload = q.get(timeout=0.1)
                except Empty:
                    continue  # poll again; keeps the loop responsive

                if kind == "progress":
                    processed += int(payload)
                    self.updateSignal.emit(processed, total)
                    self.updateProgress.emit(processed, total)
                elif kind == "result":
                    self.results.append(payload)
                elif kind == "done":
                    finished_workers += 1

        self.resultsSignal.emit(self.results)
|
||||
|
||||
Reference in New Issue
Block a user