dev #21

Merged
WorldTeacher merged 46 commits from dev into main 2025-11-24 12:59:41 +00:00
4 changed files with 225 additions and 157 deletions
Showing only changes of commit e061c1f5a9 - Show all commits

View File

@@ -1,11 +1,12 @@
from PySide6.QtCore import QThread
from PySide6.QtCore import Signal
from src.backend import Database
from src.logic.webrequest import BibTextTransformer, WebRequest
import loguru
import sys
import loguru
from PySide6.QtCore import QThread, Signal
from src import LOG_DIR
from src.backend import Database
from src.logic.webrequest import BibTextTransformer, WebRequest
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
@@ -31,9 +32,11 @@ class BookGrabber(QThread):
self.book_id = None
self.use_any = False
self.use_exact = False
self.app_id = None
self.app_nr = None
self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
self.request = WebRequest()
self.db = Database()
def add_values(
self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False
@@ -45,13 +48,13 @@ class BookGrabber(QThread):
self.use_any = any_book
self.use_exact = exact
log.info(f"Working on {len(self.data)} entries")
self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
log.debug("State: " + str(self.tstate))
self.request.set_apparat(self.app_id)
app_nr = self.db.query_db("SELECT appnr FROM semesterapparat WHERE id = ?", (self.app_id,))[0][0]
self.request.set_apparat(app_nr)
# log.debug(self.tstate)
def run(self):
self.db = Database()
item = 0
iterdata = self.data
# log.debug(iterdata)
@@ -91,7 +94,7 @@ class BookGrabber(QThread):
state = 0
for result in transformer.RDS_DATA:
# log.debug(result.RDS_LOCATION)
if str(self.app_id) in result.RDS_LOCATION:
if str(self.app_nr) in result.RDS_LOCATION:
state = 1
break
@@ -126,27 +129,27 @@ class BookGrabberTest(QThread):
self.is_Running = True
log.info("Starting worker thread")
self.data = None
self.app_id = None
self.app_nr = None
self.prof_id = None
self.mode = None
self.book_id = None
self.use_any = False
self.use_exact = False
self.app_id = appnr
self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
self.app_nr = appnr
self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
self.results = []
def add_values(
self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False
self, app_nr: int, prof_id: int, mode: str, data, any_book=False, exact=False
):
self.app_id = app_id
self.app_nr = app_nr
self.prof_id = prof_id
self.mode = mode
self.data = data
self.use_any = any_book
self.use_exact = exact
log.info(f"Working on {len(self.data)} entries")
self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
log.debug("State: " + str(self.tstate))
# log.debug(self.tstate)
@@ -159,7 +162,7 @@ class BookGrabberTest(QThread):
signature = str(entry)
log.info("Processing entry: " + signature)
webdata = WebRequest().set_apparat(self.app_id).get_ppn(entry)
webdata = WebRequest().set_apparat(self.app_nr).get_ppn(entry)
if self.use_any:
webdata = webdata.use_any_book
webdata = webdata.get_data()
@@ -186,7 +189,7 @@ class BookGrabberTest(QThread):
state = 0
for result in transformer.RDS_DATA:
# log.debug(result.RDS_LOCATION)
if str(self.app_id) in result.RDS_LOCATION:
if str(self.app_nr) in result.RDS_LOCATION:
state = 1
break

View File

@@ -1,19 +1,26 @@
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
from math import ceil
from queue import Empty, Queue
from typing import List, Optional, Set, Union
from time import monotonic # <-- NEW
from typing import List, Optional
import loguru
from PySide6.QtCore import QThread, Signal
from src import LOG_DIR
# from src.logic.webrequest import BibTextTransformer, WebRequest
from src.backend.catalogue import Catalogue
from src.logic import BookData
from src.logic.lehmannsapi import LehmannsClient
from src.logic.swb import SWB
from src.logic.SRU import SWB
# use all available cores - 2, but at least 1
THREAD_COUNT = max(os.cpu_count() - 2, 1)
THREAD_MIN_ITEMS = 5
log = loguru.logger
log.remove()
@@ -23,89 +30,136 @@ log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
log.add(
f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
rotation="1 day",
retention="1 month",
retention="7 days",
)
swb = SWB()
dnb = SWB()
cat = Catalogue()
def _norm_text(s: Optional[str]) -> str:
if not s:
return ""
# lowercase, collapse whitespace, drop some punctuation
s = s.lower()
s = re.sub(r"[\s\-\u2013\u2014]+", " ", s) # spaces/dashes
s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s)
return s.strip()
RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative RVK character set
def _same_book(a: BookData, b: BookData) -> bool:
    """Heuristic: same book if ISBNs intersect; fallback to normalized title/year.

    Rules, in order:
      1. If the two records share any normalized ISBN, they are the same book.
      2. Otherwise both normalized titles must be non-empty and equal.
      3. With matching titles, if both years are known they must be equal;
         if either year is missing, assume the same book.

    NOTE(review): the previous version also compared normalized authors, but
    the author-match branch and the author-mismatch fall-through performed the
    identical year check, so the author comparison had no effect on the
    result. The dead branch was removed; observable behavior is unchanged.
    """
    # 1) ISBN intersection — a non-empty intersection implies both sides
    # had at least one ISBN, so no separate truthiness checks are needed.
    if _norm_isbns(a.isbn) & _norm_isbns(b.isbn):
        return True

    # 2) Title must match after normalization (and be non-empty).
    ta, tb = _norm_text(a.title), _norm_text(b.title)
    if not (ta and ta == tb):
        return False

    # 3) Year check only when both years are known; missing year -> trust title.
    # NOTE(review): years are treated as strings here — confirm BookData.year type.
    ya, yb = (a.year or "").strip(), (b.year or "").strip()
    if ya and yb:
        return ya == yb
    return True
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]:
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present)."""
if value is None:
return set()
vals = value if isinstance(value, list) else [value]
out: Set[str] = set()
for v in vals:
s = str(v)
digits = re.sub(r"[^0-9Xx]", "", s)
# keep 13-digit or 10-digit tokens
m13 = re.findall(r"97[89]\d{10}", digits)
if m13:
out.update(m13)
else:
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
out.update(x.upper() for x in m10)
return out
def filter_prefer_swb(records: List[BookData]) -> List[BookData]:
def find_newer_edition(
swb_result: BookData, dnb_result: List[BookData]
) -> Optional[List[BookData]]:
"""
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
Returns a NEW list (does not mutate the input).
New edition if:
- year > swb.year OR
- edition_number > swb.edition_number
BUT: discard any candidate with year < swb.year (if both years are known).
Same-work check:
- Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
- If both have signatures and RVKs differ -> skip.
Preferences (in order):
1) RVK matches SWB
2) Print over Online-Ressource
3) Has signature
4) Newer: (year desc, edition_number desc)
"""
swb_with_sig = [
r
for r in records
if (r.link == "SWB") and (r.signature and r.signature.strip())
]
if not swb_with_sig:
return list(records)
to_remove: Set[int] = set()
def strip_copy_and_edition(s: str) -> str:
s = re.sub(r"\(\s*\d+\s*\)", "", s) # remove '(N)'
s = re.sub(r"\s*\+\s*\d+\s*$", "", s) # remove trailing '+N'
return s
# For each URL entry, see if it matches any SWB-with-signature entry
for idx, rec in enumerate(records):
if not rec.link or not rec.link.lower().startswith("http"):
continue
for swb in swb_with_sig:
if _same_book(swb, rec):
to_remove.add(idx)
break
def extract_rvk_root(sig: Optional[str]) -> str:
if not sig:
return ""
t = strip_copy_and_edition(sig.upper())
t = re.sub(r"\s+", " ", t).strip()
m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
if not m:
cleaned = re.sub(rf"[^{RVK_ALLOWED} ]+", "", t).strip()
return cleaned.split(" ")[0] if cleaned else ""
return re.sub(r"\s+", " ", m.group(1)).strip()
# Build filtered list
return [rec for i, rec in enumerate(records) if i not in to_remove]
def has_sig(b: BookData) -> bool:
return bool(getattr(b, "signature", None))
def is_online(b: BookData) -> bool:
return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
def is_print(b: BookData) -> bool:
return not is_online(b)
def rvk_matches_swb(b: BookData) -> bool:
if not has_sig(b) or not has_sig(swb_result):
return False
return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
def strictly_newer(b: BookData) -> bool:
# Hard guard: if both years are known and candidate is older, discard
if (
b.year is not None
and swb_result.year is not None
and b.year < swb_result.year
):
return False
newer_by_year = (
b.year is not None
and swb_result.year is not None
and b.year > swb_result.year
)
newer_by_edition = (
b.edition_number is not None
and swb_result.edition_number is not None
and b.edition_number > swb_result.edition_number
)
# Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
return newer_by_year or newer_by_edition
swb_has_sig = has_sig(swb_result)
swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
# 1) Filter: same work (by RVK if both have sigs) AND strictly newer
candidates: List[BookData] = []
for b in dnb_result:
if has_sig(b) and swb_has_sig:
if extract_rvk_root(b.signature) != swb_rvk:
continue # different work
if strictly_newer(b):
candidates.append(b)
if not candidates:
return None
# 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
def pref_score(x: BookData) -> tuple[int, int, int]:
return (
1 if rvk_matches_swb(x) else 0,
1 if is_print(x) else 0,
1 if has_sig(x) else 0,
)
by_ppn: dict[Optional[str], BookData] = {}
for b in candidates:
key = getattr(b, "ppn", None)
prev = by_ppn.get(key)
if prev is None or pref_score(b) > pref_score(prev):
by_ppn[key] = b
deduped = list(by_ppn.values())
if not deduped:
return None
# 3) Final pick (single best)
def sort_key(b: BookData):
year = b.year if b.year is not None else -1
ed = b.edition_number if b.edition_number is not None else -1
return (
1 if rvk_matches_swb(b) else 0,
1 if is_print(b) else 0,
1 if has_sig(b) else 0,
year,
ed,
)
best = max(deduped, key=sort_key)
return [best] if best else None
class NewEditionCheckerThread(QThread):
@@ -115,8 +169,8 @@ class NewEditionCheckerThread(QThread):
resultsSignal = Signal(list) # list[tuple[BookData, list[BookData]]]
# NEW: metrics signals
rateSignal = Signal(float) # items per second ("it/s")
etaSignal = Signal(int) # seconds remaining (-1 when unknown)
rateSignal = Signal(float) # items per second ("it/s")
etaSignal = Signal(int) # seconds remaining (-1 when unknown)
def __init__(self, entries: Optional[list["BookData"]] = None, parent=None):
super().__init__(parent)
@@ -157,54 +211,64 @@ class NewEditionCheckerThread(QThread):
def _process_book(
cls, book: "BookData"
) -> tuple["BookData", list["BookData"]] | None:
author = (
book.author.split(";")[0].replace(" ", "")
if (book.author and ";" in book.author)
else (book.author or "").replace(" ", "")
)
title = cls._clean_title(book.title or "")
# Query SWB
response: list[BookData] = SWB().getBooks(
[
"pica.bib=20735",
f"pica.tit={title.split(':')[0].strip()}",
# f"pica.per={author}",
]
)
# Remove same PPN
response = [entry for entry in response if entry.ppn != book.ppn]
for respo in response:
respo.link = "SWB"
# Query Lehmanns
with LehmannsClient() as client:
results = client.search_by_title(title, strict=True)
if results:
for res in results:
response.append(BookData().from_LehmannsSearchResult(res))
if not response:
"""Process one book; returns (original, [found editions]) or None on failure."""
if not book.title:
return None
response = filter_prefer_swb(response)
# Remove entries matching the same ISBN as the current book
response = [
entry
for entry in response
if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
]
response = [
entry
for entry in response
if book.publisher in entry.publisher
response: list["BookData"] = []
query = [
f"pica.tit={book.title}",
f"pica.vlg={book.publisher}",
]
if not response:
return None
swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
dnb_results = swb.getBooks(query)
new_editions = find_newer_edition(swb_result, dnb_results)
if new_editions is not None:
for new_edition in new_editions:
new_edition.library_location = cat.get_location(new_edition.ppn)
try:
isbn = (
str(new_edition.isbn[0])
if isinstance(new_edition.isbn, list)
else str(new_edition.isbn)
)
new_edition.link = (
f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
)
except (IndexError, TypeError):
isbn = None
new_edition.in_library = cat.in_library(new_edition.ppn)
response = new_editions
# client = SWB()
# response: list["BookData"] = []
# # First, search by title only
# results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
# lehmanns = LehmannsClient()
# results = lehmanns.search_by_title(title)
# for result in results:
# if "(eBook)" in result.title:
# result.title = result.title.replace("(eBook)", "").strip()
# swb_results = client.getBooks(
# [
# f"pica.tit={result.title}",
# f"pica.vlg={result.publisher.split(',')[0]}",
# ]
# )
# for swb in swb_results:
# if swb.isbn == result.isbn:
# result.ppn = swb.ppn
# result.signature = swb.signature
# response.append(result)
# if (result.edition_number < swb.edition_number) and (
# swb.year > result.year
# ):
# response.append(result)
if response == []:
return None
# Remove duplicates based on ppn
return (book, response)
@classmethod
@@ -240,7 +304,7 @@ class NewEditionCheckerThread(QThread):
return
# Up to 4 workers; ~20 items per worker
num_workers = min(4, max(1, ceil(total / 20)))
num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
chunks = self._split_evenly(self.entries, num_workers)
sizes = [len(ch) for ch in chunks]

View File

@@ -1,13 +1,15 @@
import sys
import time
import loguru
# from icecream import ic
from PySide6.QtCore import QThread
from PySide6.QtCore import Signal as Signal
from src.backend import Database
import loguru
import sys
from src import LOG_DIR
from src.backend import Database
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
@@ -29,8 +31,8 @@ class AutoAdder(QThread):
self.app_id = app_id
self.prof_id = prof_id
# print("Launched AutoAdder")
# print(self.data, self.app_id, self.prof_id)
# #print("Launched AutoAdder")
# #print(self.data, self.app_id, self.prof_id)
def run(self):
self.db = Database()
@@ -46,7 +48,7 @@ class AutoAdder(QThread):
time.sleep(1)
except Exception as e:
# print(e)
# #print(e)
log.exception(
f"The query failed with message {e} for signature {entry}"
)

View File

@@ -1,24 +1,23 @@
import sys
import time
# from src.transformers import RDS_AVAIL_DATA
import loguru
# from icecream import ic
from PySide6.QtCore import QThread
from PySide6.QtCore import Signal as Signal
from src.backend.database import Database
from src import LOG_DIR
from src.backend.database import Database
from src.logic.webrequest import BibTextTransformer, WebRequest
# from src.transformers import RDS_AVAIL_DATA
import loguru
import sys
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
class AvailChecker(QThread):
updateSignal = Signal(str, int)
updateProgress = Signal(int, int)
@@ -62,8 +61,8 @@ class AvailChecker(QThread):
for item in rds.items:
sign = item.superlocation
loc = item.location
# # print(item.location)
if self.appnumber in sign or self.appnumber in loc:
# # #print(item.location)
if str(self.appnumber) in sign or str(self.appnumber) in loc:
state = 1
break
for book in self.books:
@@ -71,7 +70,7 @@ class AvailChecker(QThread):
book_id = book["id"]
break
log.info(f"State of {link}: " + str(state))
# print("Updating availability of " + str(book_id) + " to " + str(state))
# #print("Updating availability of " + str(book_id) + " to " + str(state))
self.db.setAvailability(book_id, state)
count += 1
self.updateProgress.emit(count, len(self.links))