diff --git a/src/backend/thread_bookgrabber.py b/src/backend/thread_bookgrabber.py
index 02fef02..0594caa 100644
--- a/src/backend/thread_bookgrabber.py
+++ b/src/backend/thread_bookgrabber.py
@@ -1,11 +1,12 @@
-from PySide6.QtCore import QThread
-from PySide6.QtCore import Signal
-from src.backend import Database
-
-from src.logic.webrequest import BibTextTransformer, WebRequest
-import loguru
 import sys
+
+import loguru
+from PySide6.QtCore import QThread, Signal
+
 from src import LOG_DIR
+from src.backend import Database
+from src.logic.webrequest import BibTextTransformer, WebRequest
+
 log = loguru.logger
 log.remove()
 log.add(sys.stdout, level="INFO")
@@ -31,9 +32,11 @@ class BookGrabber(QThread):
         self.book_id = None
         self.use_any = False
         self.use_exact = False
-        self.app_id = None
-        self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
+        self.app_nr = None
+        self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
         self.request = WebRequest()
+        self.db = Database()
+

     def add_values(
         self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False
@@ -45,13 +48,13 @@
         self.use_any = any_book
         self.use_exact = exact
         log.info(f"Working on {len(self.data)} entries")
-        self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
+        self.app_nr = self.db.query_db("SELECT appnr FROM semesterapparat WHERE id = ?", (self.app_id,))[0][0]
+        self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
         log.debug("State: " + str(self.tstate))
-        self.request.set_apparat(self.app_id)
+        self.request.set_apparat(self.app_nr)
         # log.debug(self.tstate)

     def run(self):
-        self.db = Database()
         item = 0
         iterdata = self.data
         # log.debug(iterdata)
@@ -91,7 +94,7 @@
             state = 0
             for result in transformer.RDS_DATA:
                 # log.debug(result.RDS_LOCATION)
-                if str(self.app_id) in result.RDS_LOCATION:
+                if str(self.app_nr) in result.RDS_LOCATION:
                     state = 1
                     break

@@ -126,27 +129,27 @@ class BookGrabberTest(QThread):
         self.is_Running = True
         log.info("Starting worker thread")
         self.data = None
-        self.app_id = None
+        self.app_nr = None
         self.prof_id = None
         self.mode = None
         self.book_id = None
         self.use_any = False
         self.use_exact = False
-        self.app_id = appnr
-        self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
+        self.app_nr = appnr
+        self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
         self.results = []

     def add_values(
-        self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False
+        self, app_nr: int, prof_id: int, mode: str, data, any_book=False, exact=False
     ):
-        self.app_id = app_id
+        self.app_nr = app_nr
         self.prof_id = prof_id
         self.mode = mode
         self.data = data
         self.use_any = any_book
         self.use_exact = exact
         log.info(f"Working on {len(self.data)} entries")
-        self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
+        self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
         log.debug("State: " + str(self.tstate))
         # log.debug(self.tstate)

@@ -159,7 +162,7 @@
             signature = str(entry)
             log.info("Processing entry: " + signature)

-            webdata = WebRequest().set_apparat(self.app_id).get_ppn(entry)
+            webdata = WebRequest().set_apparat(self.app_nr).get_ppn(entry)
             if self.use_any:
                 webdata = webdata.use_any_book
             webdata = webdata.get_data()
@@ -186,7 +189,7 @@
             state = 0
             for result in transformer.RDS_DATA:
                 # log.debug(result.RDS_LOCATION)
-                if str(self.app_id) in result.RDS_LOCATION:
+                if str(self.app_nr) in result.RDS_LOCATION:
                     state = 1
                     break
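
The `add_values` hunk above resolves the public Semesterapparat number from the database id before handing it to `WebRequest.set_apparat`, and `run()` later matches that number against `RDS_LOCATION`, so the value is kept on the instance as `self.app_nr`. A minimal sketch of that lookup, assuming `Database.query_db(sql, params)` returns a list of row tuples (which the `[0][0]` indexing in the hunk suggests); the helper name and the empty-result guard are illustrative additions, not part of the patch:

    # Sketch: map a semesterapparat DB id to its public number before matching locations.
    # Assumes query_db() -> list[tuple]; returns None when the id is unknown.
    def resolve_app_nr(db, app_id: int):
        rows = db.query_db("SELECT appnr FROM semesterapparat WHERE id = ?", (app_id,))
        return rows[0][0] if rows else None
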
diff --git a/src/backend/thread_neweditions.py b/src/backend/thread_neweditions.py
index 245c091..8849d24 100644
--- a/src/backend/thread_neweditions.py
+++ b/src/backend/thread_neweditions.py
@@ -1,19 +1,26 @@
+import os
 import re
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from math import ceil
 from queue import Empty, Queue
-from typing import List, Optional, Set, Union
 from time import monotonic  # <-- NEW
+from typing import List, Optional

 import loguru
 from PySide6.QtCore import QThread, Signal

 from src import LOG_DIR
+
+# from src.logic.webrequest import BibTextTransformer, WebRequest
+from src.backend.catalogue import Catalogue
 from src.logic import BookData
-from src.logic.lehmannsapi import LehmannsClient
-from src.logic.swb import SWB
+from src.logic.SRU import SWB
+
+# use all available cores - 2, but at least 1
+THREAD_COUNT = max((os.cpu_count() or 1) - 2, 1)
+THREAD_MIN_ITEMS = 5

 log = loguru.logger
 log.remove()
@@ -23,89 +30,136 @@ log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
 log.add(
     f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
     rotation="1 day",
-    retention="1 month",
+    retention="7 days",
 )

+swb = SWB()
+dnb = SWB()
+cat = Catalogue()

-def _norm_text(s: Optional[str]) -> str:
-    if not s:
-        return ""
-    # lowercase, collapse whitespace, drop some punctuation
-    s = s.lower()
-    s = re.sub(r"[\s\-\u2013\u2014]+", " ", s)  # spaces/dashes
-    s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
-    return s.strip()
+RVK_ALLOWED = r"A-Z0-9.\-/"  # conservative RVK character set (used inside [...] classes)


-def _same_book(a: BookData, b: BookData) -> bool:
-    """Heuristic: same if ISBNs intersect; fallback to (title, author, year) normalized."""
-    isbns_a = _norm_isbns(a.isbn)
-    isbns_b = _norm_isbns(b.isbn)
-    if isbns_a and isbns_b and (isbns_a & isbns_b):
-        return True
-
-    ta, tb = _norm_text(a.title), _norm_text(b.title)
-    aa, ab = _norm_text(a.author), _norm_text(b.author)
-    ya, yb = (a.year or "").strip(), (b.year or "").strip()
-
-    # strong title match required; then author if available; then year if available
-    if ta and tb and ta == tb:
-        if aa and ab and aa == ab:
-            if ya and yb:
-                return ya == yb
-            return True
-        if ya and yb:
-            return ya == yb
-        return True
-
-    return False
-
-
-def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
-    """Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
-    if value is None:
-        return set()
-    vals = value if isinstance(value, list) else [value]
-    out: Set[str] = set()
-    for v in vals:
-        s = str(v)
-        digits = re.sub(r"[^0-9Xx]", "", s)
-        # keep 13-digit or 10-digit tokens
-        m13 = re.findall(r"97[89]\d{10}", digits)
-        if m13:
-            out.update(m13)
-        else:
-            m10 = re.findall(r"\d{9}[0-9Xx]", digits)
-            out.update(x.upper() for x in m10)
-    return out
-
-
-def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
+def find_newer_edition(
+    swb_result: BookData, dnb_result: List[BookData]
+) -> Optional[List[BookData]]:
     """
-    If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
-    Returns a NEW list (does not mutate the input).
+    New edition if:
+      - year > swb.year OR
+      - edition_number > swb.edition_number
+    BUT: discard any candidate with year < swb.year (if both years are known).
+
+    Same-work check:
+      - Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
+      - If both have signatures and RVKs differ -> skip.
+
+    Preferences (in order):
+      1) RVK matches SWB
+      2) Print over Online-Ressource
+      3) Has signature
+      4) Newer: (year desc, edition_number desc)
     """
-    swb_with_sig = [
-        r
-        for r in records
-        if (r.link == "SWB") and (r.signature and r.signature.strip())
-    ]
-    if not swb_with_sig:
-        return list(records)
-    to_remove: Set[int] = set()

+    def strip_copy_and_edition(s: str) -> str:
+        s = re.sub(r"\(\s*\d+\s*\)", "", s)  # remove '(N)'
+        s = re.sub(r"\s*\+\s*\d+\s*$", "", s)  # remove trailing '+N'
+        return s

-    # For each URL entry, see if it matches any SWB-with-signature entry
-    for idx, rec in enumerate(records):
-        if not rec.link or not rec.link.lower().startswith("http"):
-            continue
-        for swb in swb_with_sig:
-            if _same_book(swb, rec):
-                to_remove.add(idx)
-                break
+    def extract_rvk_root(sig: Optional[str]) -> str:
+        if not sig:
+            return ""
+        t = strip_copy_and_edition(sig.upper())
+        t = re.sub(r"\s+", " ", t).strip()
+        m = re.match(rf"^([A-Z]{{1,3}}\s*[{RVK_ALLOWED}]*)", t)
+        if not m:
+            cleaned = re.sub(rf"[^{RVK_ALLOWED} ]+", "", t).strip()
+            return cleaned.split(" ")[0] if cleaned else ""
+        return re.sub(r"\s+", " ", m.group(1)).strip()

-    # Build filtered list
-    return [rec for i, rec in enumerate(records) if i not in to_remove]
+    def has_sig(b: BookData) -> bool:
+        return bool(getattr(b, "signature", None))
+
+    def is_online(b: BookData) -> bool:
+        return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
+
+    def is_print(b: BookData) -> bool:
+        return not is_online(b)
+
+    def rvk_matches_swb(b: BookData) -> bool:
+        if not has_sig(b) or not has_sig(swb_result):
+            return False
+        return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
+
+    def strictly_newer(b: BookData) -> bool:
+        # Hard guard: if both years are known and candidate is older, discard
+        if (
+            b.year is not None
+            and swb_result.year is not None
+            and b.year < swb_result.year
+        ):
+            return False
+
+        newer_by_year = (
+            b.year is not None
+            and swb_result.year is not None
+            and b.year > swb_result.year
+        )
+        newer_by_edition = (
+            b.edition_number is not None
+            and swb_result.edition_number is not None
+            and b.edition_number > swb_result.edition_number
+        )
+        # Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
+        return newer_by_year or newer_by_edition
+
+    swb_has_sig = has_sig(swb_result)
+    swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
+
+    # 1) Filter: same work (by RVK if both have sigs) AND strictly newer
+    candidates: List[BookData] = []
+    for b in dnb_result:
+        if has_sig(b) and swb_has_sig:
+            if extract_rvk_root(b.signature) != swb_rvk:
+                continue  # different work
+        if strictly_newer(b):
+            candidates.append(b)
+
+    if not candidates:
+        return None
+
+    # 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
+    def pref_score(x: BookData) -> tuple[int, int, int]:
+        return (
+            1 if rvk_matches_swb(x) else 0,
+            1 if is_print(x) else 0,
+            1 if has_sig(x) else 0,
+        )
+
+    by_ppn: dict[Optional[str], BookData] = {}
+    for b in candidates:
+        key = getattr(b, "ppn", None)
+        prev = by_ppn.get(key)
+        if prev is None or pref_score(b) > pref_score(prev):
+            by_ppn[key] = b
+
+    deduped = list(by_ppn.values())
+    if not deduped:
+        return None
+
+    # 3) Final pick (single best)
+    def sort_key(b: BookData):
+        year = b.year if b.year is not None else -1
+        ed = b.edition_number if b.edition_number is not None else -1
+        return (
+            1 if rvk_matches_swb(b) else 0,
+            1 if is_print(b) else 0,
+            1 if has_sig(b) else 0,
+            year,
+            ed,
+        )
+
+    best = max(deduped, key=sort_key)
+    return [best] if best else None


 class NewEditionCheckerThread(QThread):
@@ -115,8 +169,8 @@
     resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]

     # NEW: metrics signals
-    rateSignal = Signal(float) # items per second ("it/s")
-    etaSignal = Signal(int) # seconds remaining (-1 when unknown)
+    rateSignal = Signal(float)  # items per second ("it/s")
+    etaSignal = Signal(int)  # seconds remaining (-1 when unknown)

     def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
         super().__init__(parent)
@@ -157,54 +211,64 @@ def _process_book(
         cls, book: "BookData"
     ) -> tuple["BookData", list["BookData"]] | None:
-        author = (
-            book.author.split(";")[0].replace(" ", "")
-            if (book.author and ";" in book.author)
-            else (book.author or "").replace(" ", "")
-        )
-        title = cls._clean_title(book.title or "")
-
-        # Query SWB
-        response: list[BookData] = SWB().getBooks(
-            [
-                "pica.bib=20735",
-                f"pica.tit={title.split(':')[0].strip()}",
-                # f"pica.per={author}",
-            ]
-        )
-
-        # Remove same PPN
-        response = [entry for entry in response if entry.ppn != book.ppn]
-        for respo in response:
-            respo.link = "SWB"
-
-        # Query Lehmanns
-        with LehmannsClient() as client:
-            results = client.search_by_title(title, strict=True)
-        if results:
-            for res in results:
-                response.append(BookData().from_LehmannsSearchResult(res))
-
-        if not response:
+        """Process one book; returns (original, [found editions]) or None on failure."""
+        if not book.title:
             return None
-
-        response = filter_prefer_swb(response)
-
-        # Remove entries matching the same ISBN as the current book
-        response = [
-            entry
-            for entry in response
-            if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
-        ]
-        response = [
-            entry
-            for entry in response
-            if book.publisher in entry.publisher
+        response: list["BookData"] = []
+        query = [
+            f"pica.tit={book.title}",
+            f"pica.vlg={book.publisher}",
         ]
-        if not response:
-            return None
+        swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
+        dnb_results = swb.getBooks(query)
+        new_editions = find_newer_edition(swb_result, dnb_results)
+        if new_editions is not None:
+            for new_edition in new_editions:
+                new_edition.library_location = cat.get_location(new_edition.ppn)
+                try:
+                    isbn = (
+                        str(new_edition.isbn[0])
+                        if isinstance(new_edition.isbn, list)
+                        else str(new_edition.isbn)
+                    )
+                    new_edition.link = (
+                        f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
+                    )
+                except (IndexError, TypeError):
+                    isbn = None
+                new_edition.in_library = cat.in_library(new_edition.ppn)
+            response = new_editions
+
+        # client = SWB()
+        # response: list["BookData"] = []
+        # # First, search by title only
+        # results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
+
+        # lehmanns = LehmannsClient()
+        # results = lehmanns.search_by_title(title)
+        # for result in results:
+        #     if "(eBook)" in result.title:
+        #         result.title = result.title.replace("(eBook)", "").strip()
+        #     swb_results = client.getBooks(
+        #         [
+        #             f"pica.tit={result.title}",
+        #             f"pica.vlg={result.publisher.split(',')[0]}",
+        #         ]
+        #     )
+        #     for swb in swb_results:
+        #         if swb.isbn == result.isbn:
+        #             result.ppn = swb.ppn
+        #             result.signature = swb.signature
+        #             response.append(result)
+        #         if (result.edition_number < swb.edition_number) and (
+        #             swb.year > result.year
+        #         ):
+        #             response.append(result)
+        if response == []:
+            return None
+        # Duplicates are already collapsed by PPN in find_newer_edition
         return (book, response)

     @classmethod
@@ -240,7 +304,7 @@
             return

-        # Up to 4 workers; ~20 items per worker
-        num_workers = min(4, max(1, ceil(total / 20)))
+        # Up to THREAD_COUNT workers; ~THREAD_MIN_ITEMS items per worker
+        num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
         chunks = self._split_evenly(self.entries, num_workers)
         sizes = [len(ch) for ch in chunks]
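
A toy illustration of the matching rules in `find_newer_edition` (a sketch, not part of the patch): `Rec` is an invented stand-in for `BookData` with only the fields the heuristic reads, the signatures and years are made up, and the root-extraction regex follows the main branch of the patched `extract_rvk_root`:

    import re
    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Rec:  # invented stand-in for BookData
        signature: Optional[str] = None
        year: Optional[int] = None
        edition_number: Optional[int] = None

    def rvk_root(sig: Optional[str]) -> str:
        if not sig:
            return ""
        t = re.sub(r"\(\s*\d+\s*\)", "", sig.upper())  # drop '(N)' copy marker
        t = re.sub(r"\s*\+\s*\d+\s*$", "", t)          # drop trailing '+N'
        t = re.sub(r"\s+", " ", t).strip()
        m = re.match(r"^([A-Z]{1,3}\s*[A-Z0-9.\-/]*)", t)
        return m.group(1).strip() if m else ""

    held = Rec(signature="ST 250 P99", year=2019, edition_number=3)
    newer = Rec(signature="ST 250 P99(2)+1", year=2022, edition_number=4)
    other = Rec(signature="QC 120 A11", year=2023, edition_number=1)

    assert rvk_root(held.signature) == rvk_root(newer.signature)   # same work -> candidate
    assert rvk_root(held.signature) != rvk_root(other.signature)   # different work -> skipped
    assert newer.year > held.year and newer.edition_number > held.edition_number  # strictly newer

For the worker sizing in the hunk just above: with, say, `os.cpu_count() == 8` and 37 entries, `THREAD_COUNT` is 6 and `ceil(37 / THREAD_MIN_ITEMS)` is 8, so `num_workers = min(6, max(1, 8)) = 6`.
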
diff --git a/src/backend/threads_autoadder.py b/src/backend/threads_autoadder.py
index b793ae6..e956ef9 100644
--- a/src/backend/threads_autoadder.py
+++ b/src/backend/threads_autoadder.py
@@ -1,13 +1,15 @@
+import sys
 import time

+import loguru
+
 # from icecream import ic
 from PySide6.QtCore import QThread
 from PySide6.QtCore import Signal as Signal

-from src.backend import Database
-import loguru
-import sys
 from src import LOG_DIR
+from src.backend import Database
+
 log = loguru.logger
 log.remove()
 log.add(sys.stdout, level="INFO")
@@ -29,8 +31,8 @@ class AutoAdder(QThread):
         self.app_id = app_id
         self.prof_id = prof_id

-        # print("Launched AutoAdder")
-        # print(self.data, self.app_id, self.prof_id)
+        # #print("Launched AutoAdder")
+        # #print(self.data, self.app_id, self.prof_id)

     def run(self):
         self.db = Database()
@@ -46,7 +48,7 @@
                 time.sleep(1)

             except Exception as e:
-                # print(e)
+                # #print(e)
                 log.exception(
                     f"The query failed with message {e} for signature {entry}"
                 )
diff --git a/src/backend/threads_availchecker.py b/src/backend/threads_availchecker.py
index c3ba3cc..9014ccd 100644
--- a/src/backend/threads_availchecker.py
+++ b/src/backend/threads_availchecker.py
@@ -1,24 +1,23 @@
+import sys
 import time

+# from src.transformers import RDS_AVAIL_DATA
+import loguru
+
 # from icecream import ic
 from PySide6.QtCore import QThread
 from PySide6.QtCore import Signal as Signal

-from src.backend.database import Database
 from src import LOG_DIR
+from src.backend.database import Database
 from src.logic.webrequest import BibTextTransformer, WebRequest

-# from src.transformers import RDS_AVAIL_DATA
-import loguru
-import sys
-
 log = loguru.logger
 log.remove()
 log.add(sys.stdout, level="INFO")
 log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
-


 class AvailChecker(QThread):
     updateSignal = Signal(str, int)
     updateProgress = Signal(int, int)
@@ -62,8 +61,8 @@ class AvailChecker(QThread):
             for item in rds.items:
                 sign = item.superlocation
                 loc = item.location
-                # # print(item.location)
-                if self.appnumber in sign or self.appnumber in loc:
+                # # #print(item.location)
+                if str(self.appnumber) in sign or str(self.appnumber) in loc:
                     state = 1
                     break
             for book in self.books:
@@ -71,7 +70,7 @@ class AvailChecker(QThread):
                     book_id = book["id"]
                     break
             log.info(f"State of {link}: " + str(state))
-            # print("Updating availability of " + str(book_id) + " to " + str(state))
+            # #print("Updating availability of " + str(book_id) + " to " + str(state))
             self.db.setAvailability(book_id, state)
             count += 1
             self.updateProgress.emit(count, len(self.links))
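
On the `str(self.appnumber)` cast in the availability check above: the `in` operator on a string requires a string left operand, so if `self.appnumber` ever arrives as an int the old check raised a TypeError instead of matching. A two-line illustration (the location string is invented):

    location = "Semesterapparat 42, Lesesaal"  # invented example value
    assert str(42) in location                 # plain `42 in location` raises TypeError
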