rework threads and also use app_ids where applicable

This commit is contained in:
2025-10-07 14:11:14 +02:00
parent 8e9eff4f3a
commit e061c1f5a9
4 changed files with 225 additions and 157 deletions

View File

@@ -1,11 +1,12 @@
from PySide6.QtCore import QThread
from PySide6.QtCore import Signal
from src.backend import Database
from src.logic.webrequest import BibTextTransformer, WebRequest
import loguru
import sys import sys
import loguru
from PySide6.QtCore import QThread, Signal
from src import LOG_DIR from src import LOG_DIR
from src.backend import Database
from src.logic.webrequest import BibTextTransformer, WebRequest
log = loguru.logger log = loguru.logger
log.remove() log.remove()
log.add(sys.stdout, level="INFO") log.add(sys.stdout, level="INFO")
@@ -31,9 +32,11 @@ class BookGrabber(QThread):
self.book_id = None self.book_id = None
self.use_any = False self.use_any = False
self.use_exact = False self.use_exact = False
self.app_id = None self.app_nr = None
self.tstate = (self.app_id, self.prof_id, self.mode, self.data) self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
self.request = WebRequest() self.request = WebRequest()
self.db = Database()
def add_values( def add_values(
self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False
@@ -45,13 +48,13 @@ class BookGrabber(QThread):
self.use_any = any_book self.use_any = any_book
self.use_exact = exact self.use_exact = exact
log.info(f"Working on {len(self.data)} entries") log.info(f"Working on {len(self.data)} entries")
self.tstate = (self.app_id, self.prof_id, self.mode, self.data) self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
log.debug("State: " + str(self.tstate)) log.debug("State: " + str(self.tstate))
self.request.set_apparat(self.app_id) app_nr = self.db.query_db("SELECT appnr FROM semesterapparat WHERE id = ?", (self.app_id,))[0][0]
self.request.set_apparat(app_nr)
# log.debug(self.tstate) # log.debug(self.tstate)
def run(self): def run(self):
self.db = Database()
item = 0 item = 0
iterdata = self.data iterdata = self.data
# log.debug(iterdata) # log.debug(iterdata)
@@ -91,7 +94,7 @@ class BookGrabber(QThread):
state = 0 state = 0
for result in transformer.RDS_DATA: for result in transformer.RDS_DATA:
# log.debug(result.RDS_LOCATION) # log.debug(result.RDS_LOCATION)
if str(self.app_id) in result.RDS_LOCATION: if str(self.app_nr) in result.RDS_LOCATION:
state = 1 state = 1
break break
@@ -126,27 +129,27 @@ class BookGrabberTest(QThread):
self.is_Running = True self.is_Running = True
log.info("Starting worker thread") log.info("Starting worker thread")
self.data = None self.data = None
self.app_id = None self.app_nr = None
self.prof_id = None self.prof_id = None
self.mode = None self.mode = None
self.book_id = None self.book_id = None
self.use_any = False self.use_any = False
self.use_exact = False self.use_exact = False
self.app_id = appnr self.app_nr = appnr
self.tstate = (self.app_id, self.prof_id, self.mode, self.data) self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
self.results = [] self.results = []
def add_values( def add_values(
self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False self, app_nr: int, prof_id: int, mode: str, data, any_book=False, exact=False
): ):
self.app_id = app_id self.app_nr = app_nr
self.prof_id = prof_id self.prof_id = prof_id
self.mode = mode self.mode = mode
self.data = data self.data = data
self.use_any = any_book self.use_any = any_book
self.use_exact = exact self.use_exact = exact
log.info(f"Working on {len(self.data)} entries") log.info(f"Working on {len(self.data)} entries")
self.tstate = (self.app_id, self.prof_id, self.mode, self.data) self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
log.debug("State: " + str(self.tstate)) log.debug("State: " + str(self.tstate))
# log.debug(self.tstate) # log.debug(self.tstate)
@@ -159,7 +162,7 @@ class BookGrabberTest(QThread):
signature = str(entry) signature = str(entry)
log.info("Processing entry: " + signature) log.info("Processing entry: " + signature)
webdata = WebRequest().set_apparat(self.app_id).get_ppn(entry) webdata = WebRequest().set_apparat(self.app_nr).get_ppn(entry)
if self.use_any: if self.use_any:
webdata = webdata.use_any_book webdata = webdata.use_any_book
webdata = webdata.get_data() webdata = webdata.get_data()
@@ -186,7 +189,7 @@ class BookGrabberTest(QThread):
state = 0 state = 0
for result in transformer.RDS_DATA: for result in transformer.RDS_DATA:
# log.debug(result.RDS_LOCATION) # log.debug(result.RDS_LOCATION)
if str(self.app_id) in result.RDS_LOCATION: if str(self.app_nr) in result.RDS_LOCATION:
state = 1 state = 1
break break

View File

@@ -1,19 +1,26 @@
import os
import re import re
import sys import sys
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from datetime import datetime from datetime import datetime
from math import ceil from math import ceil
from queue import Empty, Queue from queue import Empty, Queue
from typing import List, Optional, Set, Union
from time import monotonic # <-- NEW from time import monotonic # <-- NEW
from typing import List, Optional
import loguru import loguru
from PySide6.QtCore import QThread, Signal from PySide6.QtCore import QThread, Signal
from src import LOG_DIR from src import LOG_DIR
# from src.logic.webrequest import BibTextTransformer, WebRequest
from src.backend.catalogue import Catalogue
from src.logic import BookData from src.logic import BookData
from src.logic.lehmannsapi import LehmannsClient from src.logic.SRU import SWB
from src.logic.swb import SWB
# Use all available cores minus 2, but at least 1.
# os.cpu_count() returns None when the core count cannot be determined, so
# fall back to a single core instead of failing with a TypeError at import.
THREAD_COUNT = max((os.cpu_count() or 1) - 2, 1)
# Divisor used when sizing the worker pool: roughly this many entries per
# worker before another worker is added.
THREAD_MIN_ITEMS = 5
log = loguru.logger log = loguru.logger
log.remove() log.remove()
@@ -23,89 +30,136 @@ log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
log.add( log.add(
f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log", f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
rotation="1 day", rotation="1 day",
retention="1 month", retention="7 days",
) )
# Module-level clients, created once at import time and reused by the code below.
# NOTE(review): ThreadPoolExecutor is imported in this module — if these objects
# are used from several worker threads, confirm SWB and Catalogue are safe to share.
swb = SWB()
# NOTE(review): `dnb` is a second SWB instance (not a distinct DNB client) and is
# not referenced in the visible code — confirm it is still needed.
dnb = SWB()
cat = Catalogue()
# Characters permitted in an RVK notation (conservative set). Kept without the
# surrounding brackets so the same set can be embedded in a negated class.
_RVK_CHARS = r"A-Z0-9.\-\/"
RVK_ALLOWED = rf"[{_RVK_CHARS}]"  # conservative RVK character set


def find_newer_edition(
    swb_result: "BookData", dnb_result: List["BookData"]
) -> Optional[List["BookData"]]:
    """
    Pick the single best newer edition of ``swb_result`` from ``dnb_result``.

    New edition if:
      - year > swb.year OR
      - edition_number > swb.edition_number
    BUT: discard any candidate with year < swb.year (if both years are known).

    Same-work check:
      - Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
      - If both have signatures and RVKs differ -> skip.

    Preferences (in order):
      1) RVK matches SWB
      2) Print over Online-Ressource
      3) Has signature
      4) Newer: (year desc, edition_number desc)

    Years and edition numbers are coerced to ``int`` before comparing, so
    values such as ``2021``, ``"2021"`` or ``"[2021]"`` compare consistently
    and a missing value never gets compared against a string.

    Args:
        swb_result: The record of the edition currently held.
        dnb_result: Candidate records to examine.

    Returns:
        A one-element list containing the best candidate, or ``None`` when no
        candidate is a strictly newer edition of the same work.
    """

    def as_int(value) -> Optional[int]:
        # Accept ints as-is; for anything else use the first run of digits.
        if value is None:
            return None
        if isinstance(value, int):
            return value
        found = re.search(r"\d+", str(value))
        return int(found.group()) if found else None

    def strip_copy_and_edition(s: str) -> str:
        s = re.sub(r"\(\s*\d+\s*\)", "", s)  # remove '(N)'
        s = re.sub(r"\s*\+\s*\d+\s*$", "", s)  # remove trailing '+N'
        return s

    def extract_rvk_root(sig: Optional[str]) -> str:
        if not sig:
            return ""
        t = strip_copy_and_edition(sig.upper())
        t = re.sub(r"\s+", " ", t).strip()
        m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
        if not m:
            # Fallback: drop every character outside the RVK set (spaces are
            # kept as separators) and use the first remaining token.
            # The bare character list is used here on purpose: embedding the
            # bracketed RVK_ALLOWED inside another [...] yields a broken set.
            cleaned = re.sub(rf"[^{_RVK_CHARS} ]+", "", t).strip()
            return cleaned.split(" ")[0] if cleaned else ""
        return re.sub(r"\s+", " ", m.group(1)).strip()

    def has_sig(b: "BookData") -> bool:
        return bool(getattr(b, "signature", None))

    def is_online(b: "BookData") -> bool:
        return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"

    def is_print(b: "BookData") -> bool:
        return not is_online(b)

    swb_has_sig = has_sig(swb_result)
    swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
    swb_year = as_int(getattr(swb_result, "year", None))
    swb_edition = as_int(getattr(swb_result, "edition_number", None))

    def rvk_matches_swb(b: "BookData") -> bool:
        if not has_sig(b) or not swb_has_sig:
            return False
        return extract_rvk_root(b.signature) == swb_rvk

    def strictly_newer(b: "BookData") -> bool:
        year = as_int(getattr(b, "year", None))
        edition = as_int(getattr(b, "edition_number", None))
        years_known = year is not None and swb_year is not None
        # Hard guard: if both years are known and candidate is older, discard.
        if years_known and year < swb_year:
            return False
        newer_by_year = years_known and year > swb_year
        newer_by_edition = (
            edition is not None
            and swb_edition is not None
            and edition > swb_edition
        )
        # Thanks to the guard above, newer_by_edition can't pick something
        # with a smaller year.
        return newer_by_year or newer_by_edition

    # 1) Filter: same work (by RVK if both have sigs) AND strictly newer.
    candidates: List["BookData"] = []
    for b in dnb_result:
        if has_sig(b) and swb_has_sig and extract_rvk_root(b.signature) != swb_rvk:
            continue  # different work
        if strictly_newer(b):
            candidates.append(b)

    if not candidates:
        return None

    # 2) Final pick (single best). Only one record is returned, so ranking all
    #    candidates directly is sufficient; grouping by PPN first would merge
    #    every record without a PPN into one bucket and could discard the
    #    newest of them before the ranking is applied.
    def sort_key(b: "BookData") -> tuple:
        year = as_int(getattr(b, "year", None))
        edition = as_int(getattr(b, "edition_number", None))
        return (
            1 if rvk_matches_swb(b) else 0,
            1 if is_print(b) else 0,
            1 if has_sig(b) else 0,
            year if year is not None else -1,
            edition if edition is not None else -1,
        )

    return [max(candidates, key=sort_key)]
class NewEditionCheckerThread(QThread): class NewEditionCheckerThread(QThread):
@@ -115,8 +169,8 @@ class NewEditionCheckerThread(QThread):
resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]] resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]]
# NEW: metrics signals # NEW: metrics signals
rateSignal = Signal(float) # items per second ("it/s") rateSignal = Signal(float) # items per second ("it/s")
etaSignal = Signal(int) # seconds remaining (-1 when unknown) etaSignal = Signal(int) # seconds remaining (-1 when unknown)
def __init__(self, entries: Optional[list["BookData"]] = None, parent=None): def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
super().__init__(parent) super().__init__(parent)
@@ -157,54 +211,64 @@ class NewEditionCheckerThread(QThread):
def _process_book(
    cls, book: "BookData"
) -> tuple["BookData", list["BookData"]] | None:
    """Process one book; returns (original, [found editions]) or None.

    Looks up the held record by PPN, searches for records with the same title
    and publisher, and lets ``find_newer_edition`` choose a newer edition.
    Each returned edition is enriched with its library location, an
    ``in_library`` flag and, when an ISBN is available, a Lehmanns search link.

    Args:
        book: The currently held book; needs ``title``, ``publisher`` and ``ppn``.

    Returns:
        ``(book, editions)`` when a newer edition was found. ``None`` when the
        book has no title, the catalogue has no record for its PPN, or no
        newer edition exists.
    """
    if not book.title:
        return None

    query = [
        f"pica.tit={book.title}",
        f"pica.vlg={book.publisher}",
    ]

    # The PPN lookup may come back empty; indexing [0] blindly would raise
    # IndexError and abort processing instead of just skipping this book.
    held_records = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])
    if not held_records:
        log.debug(f"No SWB record found for PPN {book.ppn}; skipping")
        return None
    swb_result = held_records[0]

    dnb_results = swb.getBooks(query)
    new_editions = find_newer_edition(swb_result, dnb_results)
    if not new_editions:
        return None

    for new_edition in new_editions:
        new_edition.library_location = cat.get_location(new_edition.ppn)

        # isbn may be a single value or a list; use the first entry of a list.
        isbn = new_edition.isbn
        if isinstance(isbn, list):
            isbn = isbn[0] if isbn else None
        # Only build the shop link when an ISBN exists, so the URL never ends
        # in a literal "q=None".
        if isbn:
            new_edition.link = (
                f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
            )

        new_edition.in_library = cat.in_library(new_edition.ppn)

    return (book, new_editions)
@classmethod @classmethod
@@ -240,7 +304,7 @@ class NewEditionCheckerThread(QThread):
return return
# Up to 4 workers; ~20 items per worker # Up to THREAD_COUNT workers; ~THREAD_MIN_ITEMS items per worker
num_workers = min(4, max(1, ceil(total / 20))) num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
chunks = self._split_evenly(self.entries, num_workers) chunks = self._split_evenly(self.entries, num_workers)
sizes = [len(ch) for ch in chunks] sizes = [len(ch) for ch in chunks]

View File

@@ -1,13 +1,15 @@
import sys
import time import time
import loguru
# from icecream import ic # from icecream import ic
from PySide6.QtCore import QThread from PySide6.QtCore import QThread
from PySide6.QtCore import Signal as Signal from PySide6.QtCore import Signal as Signal
from src.backend import Database
import loguru
import sys
from src import LOG_DIR from src import LOG_DIR
from src.backend import Database
log = loguru.logger log = loguru.logger
log.remove() log.remove()
log.add(sys.stdout, level="INFO") log.add(sys.stdout, level="INFO")
@@ -29,8 +31,8 @@ class AutoAdder(QThread):
self.app_id = app_id self.app_id = app_id
self.prof_id = prof_id self.prof_id = prof_id
# print("Launched AutoAdder") # #print("Launched AutoAdder")
# print(self.data, self.app_id, self.prof_id) # #print(self.data, self.app_id, self.prof_id)
def run(self): def run(self):
self.db = Database() self.db = Database()
@@ -46,7 +48,7 @@ class AutoAdder(QThread):
time.sleep(1) time.sleep(1)
except Exception as e: except Exception as e:
# print(e) # #print(e)
log.exception( log.exception(
f"The query failed with message {e} for signature {entry}" f"The query failed with message {e} for signature {entry}"
) )

View File

@@ -1,24 +1,23 @@
import sys
import time import time
# from src.transformers import RDS_AVAIL_DATA
import loguru
# from icecream import ic # from icecream import ic
from PySide6.QtCore import QThread from PySide6.QtCore import QThread
from PySide6.QtCore import Signal as Signal from PySide6.QtCore import Signal as Signal
from src.backend.database import Database
from src import LOG_DIR from src import LOG_DIR
from src.backend.database import Database
from src.logic.webrequest import BibTextTransformer, WebRequest from src.logic.webrequest import BibTextTransformer, WebRequest
# from src.transformers import RDS_AVAIL_DATA
import loguru
import sys
log = loguru.logger log = loguru.logger
log.remove() log.remove()
log.add(sys.stdout, level="INFO") log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days") log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
class AvailChecker(QThread): class AvailChecker(QThread):
updateSignal = Signal(str, int) updateSignal = Signal(str, int)
updateProgress = Signal(int, int) updateProgress = Signal(int, int)
@@ -62,8 +61,8 @@ class AvailChecker(QThread):
for item in rds.items: for item in rds.items:
sign = item.superlocation sign = item.superlocation
loc = item.location loc = item.location
# # print(item.location) # # #print(item.location)
if self.appnumber in sign or self.appnumber in loc: if str(self.appnumber) in sign or str(self.appnumber) in loc:
state = 1 state = 1
break break
for book in self.books: for book in self.books:
@@ -71,7 +70,7 @@ class AvailChecker(QThread):
book_id = book["id"] book_id = book["id"]
break break
log.info(f"State of {link}: " + str(state)) log.info(f"State of {link}: " + str(state))
# print("Updating availability of " + str(book_id) + " to " + str(state)) # #print("Updating availability of " + str(book_id) + " to " + str(state))
self.db.setAvailability(book_id, state) self.db.setAvailability(book_id, state)
count += 1 count += 1
self.updateProgress.emit(count, len(self.links)) self.updateProgress.emit(count, len(self.links))