rework threads and also use app_ids where applicable

2025-10-07 14:11:14 +02:00
parent 8e9eff4f3a
commit e061c1f5a9
4 changed files with 225 additions and 157 deletions
--- a/src/backend/thread_bookgrabber.py
+++ b/src/backend/thread_bookgrabber.py
@@ -1,11 +1,12 @@
 from PySide6.QtCore import QThread
 from PySide6.QtCore import Signal
 from src.backend import Database
 from src.logic.webrequest import BibTextTransformer, WebRequest
 import loguru
 import sys
 import loguru
 from PySide6.QtCore import QThread, Signal
 from src import LOG_DIR
 from src.backend import Database
 from src.logic.webrequest import BibTextTransformer, WebRequest
 log = loguru.logger
 log.remove()
 log.add(sys.stdout, level="INFO")
@@ -31,9 +32,11 @@ class BookGrabber(QThread):
        self.book_id = None
        self.use_any = False
        self.use_exact = False
-        self.app_id = None
+        self.app_nr = None
        self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
        self.request = WebRequest()
        self.db = Database()
    def add_values(
        self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False
@@ -45,13 +48,13 @@ class BookGrabber(QThread):
        self.use_any = any_book
        self.use_exact = exact
        log.info(f"Working on {len(self.data)} entries")
-        self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
+        self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
        log.debug("State: " + str(self.tstate))
-        self.request.set_apparat(self.app_id)
+        app_nr = self.db.query_db("SELECT appnr FROM semesterapparat WHERE id = ?", (self.app_id,))[0][0]
        self.request.set_apparat(app_nr)
        # log.debug(self.tstate)
    def run(self):
        self.db = Database()
        item = 0
        iterdata = self.data
        # log.debug(iterdata)
@@ -91,7 +94,7 @@ class BookGrabber(QThread):
            state = 0
            for result in transformer.RDS_DATA:
                # log.debug(result.RDS_LOCATION)
-                if str(self.app_id) in result.RDS_LOCATION:
+                if str(self.app_nr) in result.RDS_LOCATION:
                    state = 1
                    break
@@ -126,27 +129,27 @@ class BookGrabberTest(QThread):
        self.is_Running = True
        log.info("Starting worker thread")
        self.data = None
-        self.app_id = None
+        self.app_nr = None
        self.prof_id = None
        self.mode = None
        self.book_id = None
        self.use_any = False
        self.use_exact = False
-        self.app_id = appnr
+        self.app_nr = appnr
-        self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
+        self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
        self.results = []
    def add_values(
-        self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False
+        self, app_nr: int, prof_id: int, mode: str, data, any_book=False, exact=False
    ):
-        self.app_id = app_id
+        self.app_nr = app_nr
        self.prof_id = prof_id
        self.mode = mode
        self.data = data
        self.use_any = any_book
        self.use_exact = exact
        log.info(f"Working on {len(self.data)} entries")
-        self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
+        self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
        log.debug("State: " + str(self.tstate))
        # log.debug(self.tstate)
@@ -159,7 +162,7 @@ class BookGrabberTest(QThread):
            signature = str(entry)
            log.info("Processing entry: " + signature)
-            webdata = WebRequest().set_apparat(self.app_id).get_ppn(entry)
+            webdata = WebRequest().set_apparat(self.app_nr).get_ppn(entry)
            if self.use_any:
                webdata = webdata.use_any_book
            webdata = webdata.get_data()
@@ -186,7 +189,7 @@ class BookGrabberTest(QThread):
            state = 0
            for result in transformer.RDS_DATA:
                # log.debug(result.RDS_LOCATION)
-                if str(self.app_id) in result.RDS_LOCATION:
+                if str(self.app_nr) in result.RDS_LOCATION:
                    state = 1
                    break
--- a/src/backend/thread_neweditions.py
+++ b/src/backend/thread_neweditions.py
@@ -1,19 +1,26 @@
 import os
 import re
 import sys
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime
 from math import ceil
 from queue import Empty, Queue
 from typing import List, Optional, Set, Union
 from time import monotonic  # <-- NEW
 from typing import List, Optional
 import loguru
 from PySide6.QtCore import QThread, Signal
 from src import LOG_DIR
 # from src.logic.webrequest import BibTextTransformer, WebRequest
 from src.backend.catalogue import Catalogue
 from src.logic import BookData
-from src.logic.lehmannsapi import LehmannsClient
+from src.logic.SRU import SWB
-from src.logic.swb import SWB
+
 # use all available cores - 2, but at least 1
 THREAD_COUNT = max(os.cpu_count() - 2, 1)
 THREAD_MIN_ITEMS = 5
 log = loguru.logger
 log.remove()
@@ -23,89 +30,136 @@ log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
 log.add(
    f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
    rotation="1 day",
-    retention="1 month",
+    retention="7 days",
 )
 swb = SWB()
 dnb = SWB()
 cat = Catalogue()
-def _norm_text(s: Optional[str]) -> str:
+RVK_ALLOWED = r"[A-Z0-9.\-\/]"  # conservative RVK character set
    if not s:
        return ""
    # lowercase, collapse whitespace, drop some punctuation
    s = s.lower()
    s = re.sub(r"[\s\-\u2013\u2014]+", " ", s)  # spaces/dashes
    s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
    return s.strip()
-def _same_book(a: BookData, b: BookData) -> bool:
+def find_newer_edition(
-    """Heuristic: same if ISBNs intersect; fallback to (title, author, year) normalized."""
+    swb_result: BookData, dnb_result: List[BookData]
-    isbns_a = _norm_isbns(a.isbn)
+) -> Optional[List[BookData]]:
    isbns_b = _norm_isbns(b.isbn)
    if isbns_a and isbns_b and (isbns_a & isbns_b):
        return True
    ta, tb = _norm_text(a.title), _norm_text(b.title)
    aa, ab = _norm_text(a.author), _norm_text(b.author)
    ya, yb = (a.year or "").strip(), (b.year or "").strip()
    # strong title match required; then author if available; then year if available
    if ta and tb and ta == tb:
        if aa and ab and aa == ab:
            if ya and yb:
                return ya == yb
            return True
        if ya and yb:
            return ya == yb
        return True
    return False
 def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
    """Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
    if value is None:
        return set()
    vals = value if isinstance(value, list) else [value]
    out: Set[str] = set()
    for v in vals:
        s = str(v)
        digits = re.sub(r"[^0-9Xx]", "", s)
        # keep 13-digit or 10-digit tokens
        m13 = re.findall(r"97[89]\d{10}", digits)
        if m13:
            out.update(m13)
        else:
            m10 = re.findall(r"\d{9}[0-9Xx]", digits)
            out.update(x.upper() for x in m10)
    return out
 def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
    """
-    If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
+    New edition if:
-    Returns a NEW list (does not mutate the input).
+      - year > swb.year OR
      - edition_number > swb.edition_number
    BUT: discard any candidate with year < swb.year (if both years are known).
    Same-work check:
      - Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
      - If both have signatures and RVKs differ -> skip.
    Preferences (in order):
      1) RVK matches SWB
      2) Print over Online-Ressource
      3) Has signature
      4) Newer: (year desc, edition_number desc)
    """
    swb_with_sig = [
        r
        for r in records
        if (r.link == "SWB") and (r.signature and r.signature.strip())
    ]
    if not swb_with_sig:
        return list(records)
-    to_remove: Set[int] = set()
+    def strip_copy_and_edition(s: str) -> str:
        s = re.sub(r"\(\s*\d+\s*\)", "", s)  # remove '(N)'
        s = re.sub(r"\s*\+\s*\d+\s*$", "", s)  # remove trailing '+N'
        return s
-    # For each URL entry, see if it matches any SWB-with-signature entry
+    def extract_rvk_root(sig: Optional[str]) -> str:
-    for idx, rec in enumerate(records):
+        if not sig:
-        if not rec.link or not rec.link.lower().startswith("http"):
+            return ""
-            continue
+        t = strip_copy_and_edition(sig.upper())
-        for swb in swb_with_sig:
+        t = re.sub(r"\s+", " ", t).strip()
-            if _same_book(swb, rec):
+        m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
-                to_remove.add(idx)
+        if not m:
-                break
+            cleaned = re.sub(rf"[^{RVK_ALLOWED} ]+", "", t).strip()
            return cleaned.split(" ")[0] if cleaned else ""
        return re.sub(r"\s+", " ", m.group(1)).strip()
-    # Build filtered list
+    def has_sig(b: BookData) -> bool:
-    return [rec for i, rec in enumerate(records) if i not in to_remove]
+        return bool(getattr(b, "signature", None))
    def is_online(b: BookData) -> bool:
        return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
    def is_print(b: BookData) -> bool:
        return not is_online(b)
    def rvk_matches_swb(b: BookData) -> bool:
        if not has_sig(b) or not has_sig(swb_result):
            return False
        return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
    def strictly_newer(b: BookData) -> bool:
        # Hard guard: if both years are known and candidate is older, discard
        if (
            b.year is not None
            and swb_result.year is not None
            and b.year < swb_result.year
        ):
            return False
        newer_by_year = (
            b.year is not None
            and swb_result.year is not None
            and b.year > swb_result.year
        )
        newer_by_edition = (
            b.edition_number is not None
            and swb_result.edition_number is not None
            and b.edition_number > swb_result.edition_number
        )
        # Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
        return newer_by_year or newer_by_edition
    swb_has_sig = has_sig(swb_result)
    swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
    # 1) Filter: same work (by RVK if both have sigs) AND strictly newer
    candidates: List[BookData] = []
    for b in dnb_result:
        if has_sig(b) and swb_has_sig:
            if extract_rvk_root(b.signature) != swb_rvk:
                continue  # different work
        if strictly_newer(b):
            candidates.append(b)
    if not candidates:
        return None
    # 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
    def pref_score(x: BookData) -> tuple[int, int, int]:
        return (
            1 if rvk_matches_swb(x) else 0,
            1 if is_print(x) else 0,
            1 if has_sig(x) else 0,
        )
    by_ppn: dict[Optional[str], BookData] = {}
    for b in candidates:
        key = getattr(b, "ppn", None)
        prev = by_ppn.get(key)
        if prev is None or pref_score(b) > pref_score(prev):
            by_ppn[key] = b
    deduped = list(by_ppn.values())
    if not deduped:
        return None
    # 3) Final pick (single best)
    def sort_key(b: BookData):
        year = b.year if b.year is not None else -1
        ed = b.edition_number if b.edition_number is not None else -1
        return (
            1 if rvk_matches_swb(b) else 0,
            1 if is_print(b) else 0,
            1 if has_sig(b) else 0,
            year,
            ed,
        )
    best = max(deduped, key=sort_key)
    return [best] if best else None
 class NewEditionCheckerThread(QThread):
@@ -115,8 +169,8 @@ class NewEditionCheckerThread(QThread):
    resultsSignal = Signal(list)  # list[tuple[BookData, list[BookData]]]
    # NEW: metrics signals
-    rateSignal = Signal(float)   # items per second ("it/s")
+    rateSignal = Signal(float)  # items per second ("it/s")
-    etaSignal = Signal(int)      # seconds remaining (-1 when unknown)
+    etaSignal = Signal(int)  # seconds remaining (-1 when unknown)
    def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
        super().__init__(parent)
@@ -157,54 +211,64 @@ class NewEditionCheckerThread(QThread):
    def _process_book(
        cls, book: "BookData"
    ) -> tuple["BookData", list["BookData"]] | None:
-        author = (
+        """Process one book; returns (original, [found editions]) or None on failure."""
-            book.author.split(";")[0].replace(" ", "")
+        if not book.title:
            if (book.author and ";" in book.author)
            else (book.author or "").replace(" ", "")
        )
        title = cls._clean_title(book.title or "")
        # Query SWB
        response: list[BookData] = SWB().getBooks(
            [
                "pica.bib=20735",
                f"pica.tit={title.split(':')[0].strip()}",
                # f"pica.per={author}",
            ]
        )
        # Remove same PPN
        response = [entry for entry in response if entry.ppn != book.ppn]
        for respo in response:
            respo.link = "SWB"
        # Query Lehmanns
        with LehmannsClient() as client:
            results = client.search_by_title(title, strict=True)
            if results:
                for res in results:
                    response.append(BookData().from_LehmannsSearchResult(res))
        if not response:
            return None
-
+        response: list["BookData"] = []
-        response = filter_prefer_swb(response)
+        query = [
-
+            f"pica.tit={book.title}",
-        # Remove entries matching the same ISBN as the current book
+            f"pica.vlg={book.publisher}",
        response = [
            entry
            for entry in response
            if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
        ]
        response = [
            entry
            for entry in response
            if book.publisher in entry.publisher
        ]
-        if not response:
+        swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
-            return None
+        dnb_results = swb.getBooks(query)
        new_editions = find_newer_edition(swb_result, dnb_results)
        if new_editions is not None:
            for new_edition in new_editions:
                new_edition.library_location = cat.get_location(new_edition.ppn)
                try:
                    isbn = (
                        str(new_edition.isbn[0])
                        if isinstance(new_edition.isbn, list)
                        else str(new_edition.isbn)
                    )
                    new_edition.link = (
                        f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
                    )
                except (IndexError, TypeError):
                    isbn = None
                new_edition.in_library = cat.in_library(new_edition.ppn)
            response = new_editions
        # client = SWB()
        # response: list["BookData"] = []
        # # First, search by title only
        # results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
        # lehmanns = LehmannsClient()
        # results = lehmanns.search_by_title(title)
        # for result in results:
        #     if "(eBook)" in result.title:
        #         result.title = result.title.replace("(eBook)", "").strip()
        #     swb_results = client.getBooks(
        #         [
        #             f"pica.tit={result.title}",
        #             f"pica.vlg={result.publisher.split(',')[0]}",
        #         ]
        #     )
        #     for swb in swb_results:
        #         if swb.isbn == result.isbn:
        #             result.ppn = swb.ppn
        #             result.signature = swb.signature
        #             response.append(result)
        #         if (result.edition_number < swb.edition_number) and (
        #             swb.year > result.year
        #         ):
        #             response.append(result)
        if response == []:
            return None
        # Duplicates were already collapsed by PPN inside find_newer_edition
        return (book, response)
    @classmethod
@@ -240,7 +304,7 @@ class NewEditionCheckerThread(QThread):
            return
        # Up to THREAD_COUNT workers; roughly THREAD_MIN_ITEMS items per worker until that cap is reached
-        num_workers = min(4, max(1, ceil(total / 20)))
+        num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
        chunks = self._split_evenly(self.entries, num_workers)
        sizes = [len(ch) for ch in chunks]
--- a/src/backend/threads_autoadder.py
+++ b/src/backend/threads_autoadder.py
@@ -1,13 +1,15 @@
 import sys
 import time
 import loguru
 # from icecream import ic
 from PySide6.QtCore import QThread
 from PySide6.QtCore import Signal as Signal
 from src.backend import Database
 import loguru
 import sys
 from src import LOG_DIR
 from src.backend import Database
 log = loguru.logger
 log.remove()
 log.add(sys.stdout, level="INFO")
@@ -29,8 +31,8 @@ class AutoAdder(QThread):
        self.app_id = app_id
        self.prof_id = prof_id
-        # print("Launched AutoAdder")
+        # #print("Launched AutoAdder")
-        # print(self.data, self.app_id, self.prof_id)
+        # #print(self.data, self.app_id, self.prof_id)
    def run(self):
        self.db = Database()
@@ -46,7 +48,7 @@ class AutoAdder(QThread):
                time.sleep(1)
            except Exception as e:
-                # print(e)
+                # #print(e)
                log.exception(
                    f"The query failed with message {e} for signature {entry}"
                )
--- a/src/backend/threads_availchecker.py
+++ b/src/backend/threads_availchecker.py
@@ -1,24 +1,23 @@
 import sys
 import time
 # from src.transformers import RDS_AVAIL_DATA
 import loguru
 # from icecream import ic
 from PySide6.QtCore import QThread
 from PySide6.QtCore import Signal as Signal
 from src.backend.database import Database
 from src import LOG_DIR
 from src.backend.database import Database
 from src.logic.webrequest import BibTextTransformer, WebRequest
 # from src.transformers import RDS_AVAIL_DATA
 import loguru
 import sys
 log = loguru.logger
 log.remove()
 log.add(sys.stdout, level="INFO")
 log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
 class AvailChecker(QThread):
    updateSignal = Signal(str, int)
    updateProgress = Signal(int, int)
@@ -62,8 +61,8 @@ class AvailChecker(QThread):
            for item in rds.items:
                sign = item.superlocation
                loc = item.location
-                # # print(item.location)
+                # # #print(item.location)
-                if self.appnumber in sign or self.appnumber in loc:
+                if str(self.appnumber) in sign or str(self.appnumber) in loc:
                    state = 1
                    break
            for book in self.books:
@@ -71,7 +70,7 @@ class AvailChecker(QThread):
                    book_id = book["id"]
                    break
            log.info(f"State of {link}: " + str(state))
-            # print("Updating availability of " + str(book_id) + " to " + str(state))
+            # #print("Updating availability of " + str(book_id) + " to " + str(state))
            self.db.setAvailability(book_id, state)
            count += 1
            self.updateProgress.emit(count, len(self.links))