minor and major reworks: rename swb to SRU, add a test for pdf parsing

major: rework mail to send messages as plain text instead of HTML, preventing HTML markup from bleeding into the message text
2025-10-07 14:15:10 +02:00
parent 0df7fd9fe6
commit 06965db26a
25 changed files with 1174 additions and 303 deletions
--- a/src/backend/init.py
+++ b/src/backend/init.py
@@ -1,6 +1,5 @@
 __all__ = [
    "AdminCommands",
-    "Semester",
    "AutoAdder",
    "AvailChecker",
    "BookGrabber",
@@ -9,16 +8,15 @@ __all__ = [
    "NewEditionCheckerThread",
    "recreateElsaFile",
    "recreateFile",
-    "Catalogue"
+    "Catalogue",
 ]

 from .admin_console import AdminCommands
+from .catalogue import Catalogue
 from .create_file import recreateElsaFile, recreateFile
 from .database import Database
 from .documentation_thread import DocumentationThread
-from .semester import Semester
 from .thread_bookgrabber import BookGrabber
 from .thread_neweditions import NewEditionCheckerThread
 from .threads_autoadder import AutoAdder
 from .threads_availchecker import AvailChecker
-from .catalogue import Catalogue
--- a/src/backend/catalogue.py
+++ b/src/backend/catalogue.py
@@ -2,6 +2,7 @@ import sys
 from datetime import datetime

 import loguru
+import regex
 import requests
 from bs4 import BeautifulSoup

@@ -24,7 +25,7 @@ log.add(


 class Catalogue:
-    def __init__(self, timeout=5):
+    def __init__(self, timeout=15):
        self.timeout = timeout
        reachable = self.check_connection()
        if not reachable:
@@ -61,8 +62,8 @@ class Catalogue:

        links = self.get_book_links(searchterm)
        print(links)
-        for link in links:
-            result = self.search(link)
+        for elink in links:
+            result = self.search(elink)
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")
@@ -74,8 +75,45 @@ class Catalogue:
            ppn_el = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
            )
-            ppn = ppn_el.get_text(strip=True) if ppn_el else None
+            # in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
+            ppn = (
+                ppn_el.find_next_sibling(
+                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                ).get_text(strip=True)
+                if ppn_el
+                else None
+            )

+            # get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION
+            edition_el = soup.find(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
+            )
+            edition = (
+                edition_el.find_next_sibling(
+                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                ).get_text(strip=True)
+                if edition_el
+                else None
+            )
+
+            authors = soup.find_all(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
+            )
+            author = None
+            if authors:
+                # get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
+                author_names = []
+                for author in authors:
+                    panel = author.find_next_sibling(
+                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                    )
+                    if panel:
+                        links = panel.find_all("a")
+                        for link in links:
+                            author_names.append(link.text.strip())
+                author = (
+                    ";".join(author_names) if len(author_names) > 1 else author_names[0]
+                )
            signature = None

            panel = soup.select_one("div.panel-body")
@@ -121,4 +159,147 @@ class Catalogue:
                            title=title,
                            ppn=ppn,
                            signature=signature,
+                            library_location=loc.split("-")[-1],
+                            link=elink,
+                            author=author,
+                            edition=edition,
                        )
+                    else:
+                        return Book(
+                            title=title,
+                            ppn=ppn,
+                            signature=signature,
+                            library_location=loc.split("\n\n")[-1],
+                            link=elink,
+                            author=author,
+                            edition=edition,
+                        )
+
+    def get(self, ppn: str) -> Book | None:
+        """Stub: load the OPAC record page for ``ppn``; field extraction (title, people, edition, ...) is not implemented, so None is returned."""
+        record_url = f"https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{ppn}"
+        BeautifulSoup(self.search(record_url), "html.parser")  # parsed but not used yet
+        return None
+
+    def get_ppn(self, searchterm: str) -> str | None:
+        """Return the PPN (last URL segment) of the first search hit that looks like one.
+
+        Falls back to the last unvalidated candidate, or None if nothing was found.
+        """
+        ppn = None
+        for link in self.get_book_links(searchterm):
+            ppn = link.split("/")[-1]  # read from the URL; no page fetch is needed
+            if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
+                return ppn
+        return ppn
+
+    def get_semesterapparat_number(self, searchterm: str) -> int | str:
+        """Return where the first search hit with location data is shelved.
+
+        An int is the number from a "Semesterapparat-<n>" location; a str is the
+        last blank-line separated part of the first location block; 0 means no
+        hit listed any location.
+        """
+        for link in self.get_book_links(searchterm):
+            soup = BeautifulSoup(self.search(link), "html.parser")
+            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
+            # Scan all locations before falling back, so a later Semesterapparat entry is not missed.
+            for location_el in locations:
+                match = regex.search(r"Semesterapparat-(\d+)", location_el.text)
+                if match:
+                    return int(match.group(1))
+            if locations:
+                return locations[0].text.strip().split("\n\n")[-1].strip()
+        return 0
+
+    def get_author(self, link: str) -> str:
+        """Return the author names of the record found via ``kid:<link>``, joined by "; ".
+
+        Names are the anchor texts in the panel following each RDS_PERSON heading; the
+        last result page that has such headings wins, and None is returned if none does.
+        """
+        panel_class = "col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+        joined = None
+        for page_url in self.get_book_links(f"kid:{link}"):
+            soup = BeautifulSoup(self.search(page_url), "html.parser")
+            headings = soup.find_all(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
+            )
+            if not headings:
+                continue
+            panels = [h.find_next_sibling("div", class_=panel_class) for h in headings]
+            names = [
+                anchor.text.strip()
+                for panel in panels
+                if panel
+                for anchor in panel.find_all("a")
+            ]
+            joined = "; ".join(names)
+        return joined
+
+    def get_signature(self, isbn: str):
+        """Return the shelf signature of the first hit for ``isbn`` that lists copies.
+
+        Each result page is split into copy entries (signature, status, location;
+        separated by "space" divs inside ``div.panel-body``). The entry located in
+        a Semesterapparat is preferred, otherwise the first entry is used.
+
+        Args:
+            isbn: Search term handed to ``get_book_links``.
+
+        Returns:
+            The chosen entry's signature (None if it has none), or None when no
+            hit lists any copy.
+        """
+        field_by_class = {
+            "RDS_SIGNATURE": "signature",
+            "RDS_STATUS": "status",
+            "RDS_LOCATION": "location",
+        }
+        for link in self.get_book_links(f"{isbn}"):
+            soup = BeautifulSoup(self.search(link), "html.parser")
+            panel = soup.select_one("div.panel-body")
+            if not panel:
+                continue
+            # Collect the RDS_* blocks in order, using the 'space' divs as separators
+            groups = []
+            cur = {}
+            for node in panel.select(
+                "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+            ):
+                classes = node.get("class", [])
+                if "space" in classes:
+                    if cur:
+                        groups.append(cur)
+                        cur = {}
+                    continue
+                val = (node.select_one(".rds-dl-panel") or node).get_text(" ", strip=True)
+                for css_class, field in field_by_class.items():
+                    if css_class in classes:
+                        cur[field] = val
+                        break
+            if cur:  # the last entry is not followed by a separator
+                groups.append(cur)
+            if groups:
+                # Prefer the Semesterapparat copy; otherwise fall back to the first entry.
+                chosen = next(
+                    (g for g in groups if "semesterapparat" in g.get("location", "").lower()),
+                    groups[0],
+                )
+                return chosen.get("signature")
+        print("No signature found")
+        return None
+
+    def in_library(self, ppn: str) -> bool:
+        """Report whether a ``kid:<ppn>`` catalogue search yields any link (False for None)."""
+        if ppn is None:
+            return False
+        return len(self.get_book_links(f"kid:{ppn}")) > 0
+
+    def get_location(self, ppn: str) -> str | None:
+        """Return ``library_location`` of the book ``get_book`` finds for ``ppn``, else None."""
+        if ppn is not None:
+            book = self.get_book(f"{ppn}")
+            if book is not None:
+                return book.library_location
+        return None
--- a/src/backend/database.py
+++ b/src/backend/database.py
@@ -30,10 +30,9 @@ from src.backend.db import (
 from src.errors import AppPresentError, NoResultError
 from src.logic import ELSA, Apparat, ApparatData, BookData, Prof
 from src.logic.constants import SEMAP_MEDIA_ACCOUNTS
+from src.logic.semester import Semester
 from src.utils.blob import create_blob

-from .semester import Semester
-
 log = loguru.logger
 log.remove()
 log.add(sys.stdout, level="INFO")
@@ -1873,7 +1872,7 @@ class Database:
        Returns:
            list[tuple]: A list of tuples containing the new editions data
        """
-        query = "SELECT * FROM neweditions WHERE for_apparat=?"
+        query = "SELECT * FROM neweditions WHERE for_apparat=? AND ordered=0"
        results = self.query_db(query, (apparat_id,))
        res = []
        for result in results:
@@ -1887,9 +1886,25 @@ class Database:
        query = "UPDATE neweditions SET ordered=1 WHERE id=?"
        self.query_db(query, (newBook_id,))

+    def getBooksWithNewEditions(self, app_id) -> List[tuple[BookData, BookData]]:
+        """Return (old edition, new edition) BookData pairs recorded for apparat ``app_id``.
+
+        Joins ``media`` with ``neweditions`` on ``old_edition_id`` and builds each
+        side from its stored bookdata string via ``BookData().from_string``.
+        """
+        query = "SELECT m.bookdata, new_bookdata FROM media m JOIN neweditions n ON m.id = n.old_edition_id WHERE n.for_apparat = ?"
+        rows = self.query_db(query, (app_id,))
+        return [
+            (BookData().from_string(row[0]), BookData().from_string(row[1]))
+            for row in rows
+        ]
+
    def getNewEditionId(self, newBook: BookData):
-        query = "SELECT id FROM neweditions WHERE new_bookdata=?"
-        params = (newBook.to_dict,)
+        query = "SELECT id FROM neweditions WHERE new_bookdata LIKE ?"
+        args = (
+            newBook.isbn[0] if newBook.isbn and len(newBook.isbn) > 0 else newBook.ppn
+        )
+        params = (f"%{args}%",)
        data = self.query_db(query, params, one=True)
        if data:
            return data[0]
@@ -1897,6 +1912,14 @@ class Database:
            return None

    def insertNewEdition(self, newBook: BookData, oldBookId: int, for_apparat: int):
+        # check if new edition already in table, check based on newBook.ppn
+        check_query = "SELECT id FROM neweditions WHERE new_bookdata LIKE ?"
+        check_params = (f"%{newBook.ppn}%",)
+        data = self.query_db(check_query, check_params, one=True)
+        if data:
+            log.info("New edition already in table, skipping insert")
+            return
+
        query = "INSERT INTO neweditions (new_bookdata, old_edition_id, for_apparat) VALUES (?,?,?)"
        params = (newBook.to_dict, oldBookId, for_apparat)