Rework catalogue wrapper to split a record's entries at the "space" separator divs

2025-09-08 10:34:34 +02:00
parent 373257864f
commit 1f34442397
1 changed files with 62 additions and 39 deletions
--- a/src/backend/catalogue.py
+++ b/src/backend/catalogue.py
@@ -1,12 +1,13 @@
+import sys
+from datetime import datetime
+
+import loguru
 import requests
 from bs4 import BeautifulSoup

+from src import LOG_DIR
 from src.logic import BookData as Book

-from datetime import datetime
-import sys
-import loguru
-from src import LOG_DIR
 URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
 BASE = "https://rds.ibs-bw.de"

@@ -20,6 +21,8 @@ log.add(
    rotation="1 day",
    retention="1 month",
 )
+
+
 class Catalogue:
    def __init__(self, timeout=5):
        self.timeout = timeout
@@ -57,45 +60,65 @@ class Catalogue:
        log.info(f"Searching for term: {searchterm}")

        links = self.get_book_links(searchterm)
+        print(links)
        for link in links:
            result = self.search(link)
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")
-            location = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
-            for loc in location:
-                if f"1. OG Semesterapparat" in loc.text:
-                    title = (
-                        soup.find("div", class_="headline text")
-                        .text.replace("\n", "")
-                        .strip()
-                    )
-                    ppn = soup.find(
+
+            # Title and PPN of the record page; each is None when its element is missing
+            title_el = soup.find("div", class_="headline text")
+            title = title_el.get_text(strip=True) if title_el else None
+
+            ppn_el = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
            )
-                    signature = soup.find(
-                        "div", class_="col-xs-12 rds-dl RDS_SIGNATURE"
+            ppn = ppn_el.get_text(strip=True) if ppn_el else None
+
+            signature = None
+
+            panel = soup.select_one("div.panel-body")
+            if panel:
+                # Collect the RDS_* blocks in order, using the 'space' divs as separators
+                groups = []
+                cur = {}
+                for node in panel.select(
+                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+                ):
+                    classes = node.get("class", [])
+                    # Separator between entries
+                    if "space" in classes:
+                        if cur:
+                            groups.append(cur)
+                            cur = {}
+                        continue
+
+                    # Read the value from the corresponding panel cell
+                    val_el = node.select_one(".rds-dl-panel")
+                    val = (
+                        val_el.get_text(" ", strip=True)
+                        if val_el
+                        else node.get_text(" ", strip=True)
                    )
-                    if signature:
-                        signature = (
-                            signature.find_next("div")
-                            .find_next("div")
-                            .text.replace("\n", "")
-                            .strip()
-                        )
-                    # use ppn to find the next div and extract the text
-                    if ppn:
-                        ppn = ppn.find_next("div").text.replace("\n", "").strip()
-                    else:
-                        ppn = None
-                    isbn = soup.find(
-                        "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN"
-                    )
-                    if isbn:
-                        isbn = isbn.find_next("div").find_next("div").text
-                    else:
-                        isbn = None
+
+                    if "RDS_SIGNATURE" in classes:
+                        cur["signature"] = val
+                    elif "RDS_STATUS" in classes:
+                        cur["status"] = val
+                    elif "RDS_LOCATION" in classes:
+                        cur["location"] = val
+
+                if cur:  # append the last group if not followed by a space
+                    groups.append(cur)
+
+                # Find the signature for the entry whose location mentions "Semesterapparat"
+                for g in groups:
+                    loc = g.get("location", "").lower()
+                    if "semesterapparat" in loc:
+                        signature = g.get("signature")
                        return Book(
-                        title=title, ppn=ppn, signature=signature, isbn=isbn, link=link
+                            title=title,
+                            ppn=ppn,
+                            signature=signature,
                        )
-        return False