dev #21

Merged
WorldTeacher merged 46 commits from dev into main 2025-11-24 12:59:41 +00:00
4 changed files with 225 additions and 157 deletions
Showing only changes of commit e061c1f5a9 - Show all commits

View File

@@ -1,11 +1,12 @@
from PySide6.QtCore import QThread
from PySide6.QtCore import Signal
from src.backend import Database
from src.logic.webrequest import BibTextTransformer, WebRequest
import loguru
import sys import sys
import loguru
from PySide6.QtCore import QThread, Signal
from src import LOG_DIR from src import LOG_DIR
from src.backend import Database
from src.logic.webrequest import BibTextTransformer, WebRequest
log = loguru.logger log = loguru.logger
log.remove() log.remove()
log.add(sys.stdout, level="INFO") log.add(sys.stdout, level="INFO")
@@ -31,9 +32,11 @@ class BookGrabber(QThread):
self.book_id = None self.book_id = None
self.use_any = False self.use_any = False
self.use_exact = False self.use_exact = False
self.app_id = None self.app_nr = None
self.tstate = (self.app_id, self.prof_id, self.mode, self.data) self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
self.request = WebRequest() self.request = WebRequest()
self.db = Database()
def add_values( def add_values(
self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False
@@ -45,13 +48,13 @@ class BookGrabber(QThread):
self.use_any = any_book self.use_any = any_book
self.use_exact = exact self.use_exact = exact
log.info(f"Working on {len(self.data)} entries") log.info(f"Working on {len(self.data)} entries")
self.tstate = (self.app_id, self.prof_id, self.mode, self.data) self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
log.debug("State: " + str(self.tstate)) log.debug("State: " + str(self.tstate))
self.request.set_apparat(self.app_id) app_nr = self.db.query_db("SELECT appnr FROM semesterapparat WHERE id = ?", (self.app_id,))[0][0]
self.request.set_apparat(app_nr)
# log.debug(self.tstate) # log.debug(self.tstate)
def run(self): def run(self):
self.db = Database()
item = 0 item = 0
iterdata = self.data iterdata = self.data
# log.debug(iterdata) # log.debug(iterdata)
@@ -91,7 +94,7 @@ class BookGrabber(QThread):
state = 0 state = 0
for result in transformer.RDS_DATA: for result in transformer.RDS_DATA:
# log.debug(result.RDS_LOCATION) # log.debug(result.RDS_LOCATION)
if str(self.app_id) in result.RDS_LOCATION: if str(self.app_nr) in result.RDS_LOCATION:
state = 1 state = 1
break break
@@ -126,27 +129,27 @@ class BookGrabberTest(QThread):
self.is_Running = True self.is_Running = True
log.info("Starting worker thread") log.info("Starting worker thread")
self.data = None self.data = None
self.app_id = None self.app_nr = None
self.prof_id = None self.prof_id = None
self.mode = None self.mode = None
self.book_id = None self.book_id = None
self.use_any = False self.use_any = False
self.use_exact = False self.use_exact = False
self.app_id = appnr self.app_nr = appnr
self.tstate = (self.app_id, self.prof_id, self.mode, self.data) self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
self.results = [] self.results = []
def add_values( def add_values(
self, app_id: int, prof_id: int, mode: str, data, any_book=False, exact=False self, app_nr: int, prof_id: int, mode: str, data, any_book=False, exact=False
): ):
self.app_id = app_id self.app_nr = app_nr
self.prof_id = prof_id self.prof_id = prof_id
self.mode = mode self.mode = mode
self.data = data self.data = data
self.use_any = any_book self.use_any = any_book
self.use_exact = exact self.use_exact = exact
log.info(f"Working on {len(self.data)} entries") log.info(f"Working on {len(self.data)} entries")
self.tstate = (self.app_id, self.prof_id, self.mode, self.data) self.tstate = (self.app_nr, self.prof_id, self.mode, self.data)
log.debug("State: " + str(self.tstate)) log.debug("State: " + str(self.tstate))
# log.debug(self.tstate) # log.debug(self.tstate)
@@ -159,7 +162,7 @@ class BookGrabberTest(QThread):
signature = str(entry) signature = str(entry)
log.info("Processing entry: " + signature) log.info("Processing entry: " + signature)
webdata = WebRequest().set_apparat(self.app_id).get_ppn(entry) webdata = WebRequest().set_apparat(self.app_nr).get_ppn(entry)
if self.use_any: if self.use_any:
webdata = webdata.use_any_book webdata = webdata.use_any_book
webdata = webdata.get_data() webdata = webdata.get_data()
@@ -186,7 +189,7 @@ class BookGrabberTest(QThread):
state = 0 state = 0
for result in transformer.RDS_DATA: for result in transformer.RDS_DATA:
# log.debug(result.RDS_LOCATION) # log.debug(result.RDS_LOCATION)
if str(self.app_id) in result.RDS_LOCATION: if str(self.app_nr) in result.RDS_LOCATION:
state = 1 state = 1
break break

View File

@@ -1,19 +1,26 @@
import os
import re import re
import sys import sys
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
from datetime import datetime from datetime import datetime
from math import ceil from math import ceil
from queue import Empty, Queue from queue import Empty, Queue
from typing import List, Optional, Set, Union
from time import monotonic # <-- NEW from time import monotonic # <-- NEW
from typing import List, Optional
import loguru import loguru
from PySide6.QtCore import QThread, Signal from PySide6.QtCore import QThread, Signal
from src import LOG_DIR from src import LOG_DIR
# from src.logic.webrequest import BibTextTransformer, WebRequest
from src.backend.catalogue import Catalogue
from src.logic import BookData from src.logic import BookData
from src.logic.lehmannsapi import LehmannsClient from src.logic.SRU import SWB
from src.logic.swb import SWB
# use all available cores - 2, but at least 1
THREAD_COUNT = max(os.cpu_count() - 2, 1)
THREAD_MIN_ITEMS = 5
log = loguru.logger log = loguru.logger
log.remove() log.remove()
@@ -23,89 +30,136 @@ log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
log.add( log.add(
f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log", f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
rotation="1 day", rotation="1 day",
retention="1 month", retention="7 days",
) )
swb = SWB()
dnb = SWB()
cat = Catalogue()
def _norm_text(s: Optional[str]) -> str: RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative RVK character set
if not s:
def find_newer_edition(
swb_result: BookData, dnb_result: List[BookData]
) -> Optional[List[BookData]]:
"""
New edition if:
- year > swb.year OR
- edition_number > swb.edition_number
BUT: discard any candidate with year < swb.year (if both years are known).
Same-work check:
- Compare RVK roots of signatures (after stripping trailing '+N' and '(N)').
- If both have signatures and RVKs differ -> skip.
Preferences (in order):
1) RVK matches SWB
2) Print over Online-Ressource
3) Has signature
4) Newer: (year desc, edition_number desc)
"""
def strip_copy_and_edition(s: str) -> str:
s = re.sub(r"\(\s*\d+\s*\)", "", s) # remove '(N)'
s = re.sub(r"\s*\+\s*\d+\s*$", "", s) # remove trailing '+N'
return s
def extract_rvk_root(sig: Optional[str]) -> str:
if not sig:
return "" return ""
# lowercase, collapse whitespace, drop some punctuation t = strip_copy_and_edition(sig.upper())
s = s.lower() t = re.sub(r"\s+", " ", t).strip()
s = re.sub(r"[\s\-\u2013\u2014]+", " ", s) # spaces/dashes m = re.match(rf"^([A-Z]{{1,3}}\s*{RVK_ALLOWED}*)", t)
s = re.sub(r"[\"'`:.,;!?()\[\]{}]", "", s) if not m:
return s.strip() cleaned = re.sub(rf"[^{RVK_ALLOWED} ]+", "", t).strip()
return cleaned.split(" ")[0] if cleaned else ""
return re.sub(r"\s+", " ", m.group(1)).strip()
def has_sig(b: BookData) -> bool:
return bool(getattr(b, "signature", None))
def _same_book(a: BookData, b: BookData) -> bool: def is_online(b: BookData) -> bool:
"""Heuristic: same if ISBNs intersect; fallback to (title, author, year) normalized.""" return (getattr(b, "media_type", None) or "").strip() == "Online-Ressource"
isbns_a = _norm_isbns(a.isbn)
isbns_b = _norm_isbns(b.isbn)
if isbns_a and isbns_b and (isbns_a & isbns_b):
return True
ta, tb = _norm_text(a.title), _norm_text(b.title) def is_print(b: BookData) -> bool:
aa, ab = _norm_text(a.author), _norm_text(b.author) return not is_online(b)
ya, yb = (a.year or "").strip(), (b.year or "").strip()
# strong title match required; then author if available; then year if available def rvk_matches_swb(b: BookData) -> bool:
if ta and tb and ta == tb: if not has_sig(b) or not has_sig(swb_result):
if aa and ab and aa == ab: return False
if ya and yb: return extract_rvk_root(b.signature) == extract_rvk_root(swb_result.signature)
return ya == yb
return True
if ya and yb:
return ya == yb
return True
def strictly_newer(b: BookData) -> bool:
# Hard guard: if both years are known and candidate is older, discard
if (
b.year is not None
and swb_result.year is not None
and b.year < swb_result.year
):
return False return False
newer_by_year = (
b.year is not None
and swb_result.year is not None
and b.year > swb_result.year
)
newer_by_edition = (
b.edition_number is not None
and swb_result.edition_number is not None
and b.edition_number > swb_result.edition_number
)
# Thanks to the guard above, newer_by_edition can't pick something with a smaller year.
return newer_by_year or newer_by_edition
def _norm_isbns(value: Union[str, List[str], None]) -> Set[str]: swb_has_sig = has_sig(swb_result)
"""Return a set of 10/13-digit ISBNs (digits only, keep X for ISBN-10 if present).""" swb_rvk = extract_rvk_root(getattr(swb_result, "signature", None))
if value is None:
return set()
vals = value if isinstance(value, list) else [value]
out: Set[str] = set()
for v in vals:
s = str(v)
digits = re.sub(r"[^0-9Xx]", "", s)
# keep 13-digit or 10-digit tokens
m13 = re.findall(r"97[89]\d{10}", digits)
if m13:
out.update(m13)
else:
m10 = re.findall(r"\d{9}[0-9Xx]", digits)
out.update(x.upper() for x in m10)
return out
# 1) Filter: same work (by RVK if both have sigs) AND strictly newer
candidates: List[BookData] = []
for b in dnb_result:
if has_sig(b) and swb_has_sig:
if extract_rvk_root(b.signature) != swb_rvk:
continue # different work
if strictly_newer(b):
candidates.append(b)
def filter_prefer_swb(records: List[BookData]) -> List[BookData]: if not candidates:
""" return None
If an SWB entry with a non-empty signature exists for a book, drop the HTTP(S) duplicate(s).
Returns a NEW list (does not mutate the input).
"""
swb_with_sig = [
r
for r in records
if (r.link == "SWB") and (r.signature and r.signature.strip())
]
if not swb_with_sig:
return list(records)
to_remove: Set[int] = set() # 2) Dedupe by PPN → prefer (rvk-match, is-print, has-signature)
def pref_score(x: BookData) -> tuple[int, int, int]:
return (
1 if rvk_matches_swb(x) else 0,
1 if is_print(x) else 0,
1 if has_sig(x) else 0,
)
# For each URL entry, see if it matches any SWB-with-signature entry by_ppn: dict[Optional[str], BookData] = {}
for idx, rec in enumerate(records): for b in candidates:
if not rec.link or not rec.link.lower().startswith("http"): key = getattr(b, "ppn", None)
continue prev = by_ppn.get(key)
for swb in swb_with_sig: if prev is None or pref_score(b) > pref_score(prev):
if _same_book(swb, rec): by_ppn[key] = b
to_remove.add(idx)
break
# Build filtered list deduped = list(by_ppn.values())
return [rec for i, rec in enumerate(records) if i not in to_remove] if not deduped:
return None
# 3) Final pick (single best)
def sort_key(b: BookData):
year = b.year if b.year is not None else -1
ed = b.edition_number if b.edition_number is not None else -1
return (
1 if rvk_matches_swb(b) else 0,
1 if is_print(b) else 0,
1 if has_sig(b) else 0,
year,
ed,
)
best = max(deduped, key=sort_key)
return [best] if best else None
class NewEditionCheckerThread(QThread): class NewEditionCheckerThread(QThread):
@@ -157,54 +211,64 @@ class NewEditionCheckerThread(QThread):
def _process_book( def _process_book(
cls, book: "BookData" cls, book: "BookData"
) -> tuple["BookData", list["BookData"]] | None: ) -> tuple["BookData", list["BookData"]] | None:
author = ( """Process one book; returns (original, [found editions]) or None on failure."""
book.author.split(";")[0].replace(" ", "") if not book.title:
if (book.author and ";" in book.author)
else (book.author or "").replace(" ", "")
)
title = cls._clean_title(book.title or "")
# Query SWB
response: list[BookData] = SWB().getBooks(
[
"pica.bib=20735",
f"pica.tit={title.split(':')[0].strip()}",
# f"pica.per={author}",
]
)
# Remove same PPN
response = [entry for entry in response if entry.ppn != book.ppn]
for respo in response:
respo.link = "SWB"
# Query Lehmanns
with LehmannsClient() as client:
results = client.search_by_title(title, strict=True)
if results:
for res in results:
response.append(BookData().from_LehmannsSearchResult(res))
if not response:
return None return None
response: list["BookData"] = []
response = filter_prefer_swb(response) query = [
f"pica.tit={book.title}",
# Remove entries matching the same ISBN as the current book f"pica.vlg={book.publisher}",
response = [
entry
for entry in response
if not (_norm_isbns(entry.isbn) & _norm_isbns(book.isbn))
]
response = [
entry
for entry in response
if book.publisher in entry.publisher
] ]
if not response: swb_result = swb.getBooks(["pica.bib=20735", f"pica.ppn={book.ppn}"])[0]
dnb_results = swb.getBooks(query)
new_editions = find_newer_edition(swb_result, dnb_results)
if new_editions is not None:
for new_edition in new_editions:
new_edition.library_location = cat.get_location(new_edition.ppn)
try:
isbn = (
str(new_edition.isbn[0])
if isinstance(new_edition.isbn, list)
else str(new_edition.isbn)
)
new_edition.link = (
f"https://www.lehmanns.de/search/quick?mediatype_id=2&q={isbn}"
)
except (IndexError, TypeError):
isbn = None
new_edition.in_library = cat.in_library(new_edition.ppn)
response = new_editions
# client = SWB()
# response: list["BookData"] = []
# # First, search by title only
# results = client.getBooks([f"pica.title={title}", f"pica.vlg={book.publisher}"])
# lehmanns = LehmannsClient()
# results = lehmanns.search_by_title(title)
# for result in results:
# if "(eBook)" in result.title:
# result.title = result.title.replace("(eBook)", "").strip()
# swb_results = client.getBooks(
# [
# f"pica.tit={result.title}",
# f"pica.vlg={result.publisher.split(',')[0]}",
# ]
# )
# for swb in swb_results:
# if swb.isbn == result.isbn:
# result.ppn = swb.ppn
# result.signature = swb.signature
# response.append(result)
# if (result.edition_number < swb.edition_number) and (
# swb.year > result.year
# ):
# response.append(result)
if response == []:
return None return None
# Remove duplicates based on ppn
return (book, response) return (book, response)
@classmethod @classmethod
@@ -240,7 +304,7 @@ class NewEditionCheckerThread(QThread):
return return
# Up to 4 workers; ~20 items per worker # Up to 4 workers; ~20 items per worker
num_workers = min(4, max(1, ceil(total / 20))) num_workers = min(THREAD_COUNT, max(1, ceil(total / THREAD_MIN_ITEMS)))
chunks = self._split_evenly(self.entries, num_workers) chunks = self._split_evenly(self.entries, num_workers)
sizes = [len(ch) for ch in chunks] sizes = [len(ch) for ch in chunks]

View File

@@ -1,13 +1,15 @@
import sys
import time import time
import loguru
# from icecream import ic # from icecream import ic
from PySide6.QtCore import QThread from PySide6.QtCore import QThread
from PySide6.QtCore import Signal as Signal from PySide6.QtCore import Signal as Signal
from src.backend import Database
import loguru
import sys
from src import LOG_DIR from src import LOG_DIR
from src.backend import Database
log = loguru.logger log = loguru.logger
log.remove() log.remove()
log.add(sys.stdout, level="INFO") log.add(sys.stdout, level="INFO")
@@ -29,8 +31,8 @@ class AutoAdder(QThread):
self.app_id = app_id self.app_id = app_id
self.prof_id = prof_id self.prof_id = prof_id
# print("Launched AutoAdder") # #print("Launched AutoAdder")
# print(self.data, self.app_id, self.prof_id) # #print(self.data, self.app_id, self.prof_id)
def run(self): def run(self):
self.db = Database() self.db = Database()
@@ -46,7 +48,7 @@ class AutoAdder(QThread):
time.sleep(1) time.sleep(1)
except Exception as e: except Exception as e:
# print(e) # #print(e)
log.exception( log.exception(
f"The query failed with message {e} for signature {entry}" f"The query failed with message {e} for signature {entry}"
) )

View File

@@ -1,24 +1,23 @@
import sys
import time import time
# from src.transformers import RDS_AVAIL_DATA
import loguru
# from icecream import ic # from icecream import ic
from PySide6.QtCore import QThread from PySide6.QtCore import QThread
from PySide6.QtCore import Signal as Signal from PySide6.QtCore import Signal as Signal
from src.backend.database import Database
from src import LOG_DIR from src import LOG_DIR
from src.backend.database import Database
from src.logic.webrequest import BibTextTransformer, WebRequest from src.logic.webrequest import BibTextTransformer, WebRequest
# from src.transformers import RDS_AVAIL_DATA
import loguru
import sys
log = loguru.logger log = loguru.logger
log.remove() log.remove()
log.add(sys.stdout, level="INFO") log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days") log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
class AvailChecker(QThread): class AvailChecker(QThread):
updateSignal = Signal(str, int) updateSignal = Signal(str, int)
updateProgress = Signal(int, int) updateProgress = Signal(int, int)
@@ -62,8 +61,8 @@ class AvailChecker(QThread):
for item in rds.items: for item in rds.items:
sign = item.superlocation sign = item.superlocation
loc = item.location loc = item.location
# # print(item.location) # # #print(item.location)
if self.appnumber in sign or self.appnumber in loc: if str(self.appnumber) in sign or str(self.appnumber) in loc:
state = 1 state = 1
break break
for book in self.books: for book in self.books:
@@ -71,7 +70,7 @@ class AvailChecker(QThread):
book_id = book["id"] book_id = book["id"]
break break
log.info(f"State of {link}: " + str(state)) log.info(f"State of {link}: " + str(state))
# print("Updating availability of " + str(book_id) + " to " + str(state)) # #print("Updating availability of " + str(book_id) + " to " + str(state))
self.db.setAvailability(book_id, state) self.db.setAvailability(book_id, state)
count += 1 count += 1
self.updateProgress.emit(count, len(self.links)) self.updateProgress.emit(count, len(self.links))