rework catalogue wrapper to split entries based on space div

2025-09-08 10:34:34 +02:00
parent 373257864f
commit 1f34442397


@@ -1,12 +1,13 @@
+import sys
+from datetime import datetime
+import loguru
 import requests
 from bs4 import BeautifulSoup
+from src import LOG_DIR
 from src.logic import BookData as Book
-from datetime import datetime
-import sys
-import loguru
-from src import LOG_DIR
 URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
 BASE = "https://rds.ibs-bw.de"
@@ -20,6 +21,8 @@ log.add(
     rotation="1 day",
     retention="1 month",
 )
+
+
 class Catalogue:
     def __init__(self, timeout=5):
         self.timeout = timeout
@@ -57,45 +60,65 @@ class Catalogue:
         log.info(f"Searching for term: {searchterm}")
         links = self.get_book_links(searchterm)
-        print(links)
         for link in links:
             result = self.search(link)
             # in result search for class col-xs-12 rds-dl RDS_LOCATION
             # if found, return text of href
             soup = BeautifulSoup(result, "html.parser")
-            location = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
-            for loc in location:
-                if f"1. OG Semesterapparat" in loc.text:
-                    title = (
-                        soup.find("div", class_="headline text")
-                        .text.replace("\n", "")
-                        .strip()
-                    )
-                    ppn = soup.find(
-                        "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
-                    )
-                    signature = soup.find(
-                        "div", class_="col-xs-12 rds-dl RDS_SIGNATURE"
-                    )
-                    if signature:
-                        signature = (
-                            signature.find_next("div")
-                            .find_next("div")
-                            .text.replace("\n", "")
-                            .strip()
-                        )
-                    # use ppn to find the next div and extract the text
-                    if ppn:
-                        ppn = ppn.find_next("div").text.replace("\n", "").strip()
-                    else:
-                        ppn = None
-                    isbn = soup.find(
-                        "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN"
-                    )
-                    if isbn:
-                        isbn = isbn.find_next("div").find_next("div").text
-                    else:
-                        isbn = None
-                    return Book(
-                        title=title, ppn=ppn, signature=signature, isbn=isbn, link=link
-                    )
+
+            # Optional (unchanged): title and ppn if you need them
+            title_el = soup.find("div", class_="headline text")
+            title = title_el.get_text(strip=True) if title_el else None
+
+            ppn_el = soup.find(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
+            )
+            ppn = ppn_el.get_text(strip=True) if ppn_el else None
+
+            signature = None
+            panel = soup.select_one("div.panel-body")
+            if panel:
+                # Collect the RDS_* blocks in order, using the 'space' divs as separators
+                groups = []
+                cur = {}
+                for node in panel.select(
+                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+                ):
+                    classes = node.get("class", [])
+                    # Separator between entries
+                    if "space" in classes:
+                        if cur:
+                            groups.append(cur)
+                            cur = {}
+                        continue
+                    # Read the value from the corresponding panel cell
+                    val_el = node.select_one(".rds-dl-panel")
+                    val = (
+                        val_el.get_text(" ", strip=True)
+                        if val_el
+                        else node.get_text(" ", strip=True)
+                    )
+
+                    if "RDS_SIGNATURE" in classes:
+                        cur["signature"] = val
+                    elif "RDS_STATUS" in classes:
+                        cur["status"] = val
+                    elif "RDS_LOCATION" in classes:
+                        cur["location"] = val
+
+                if cur:  # append the last group if not followed by a space
+                    groups.append(cur)
+
+                # Find the signature for the entry whose location mentions "Semesterapparat"
+                for g in groups:
+                    loc = g.get("location", "").lower()
+                    if "semesterapparat" in loc:
+                        signature = g.get("signature")
+                        return Book(
+                            title=title,
+                            ppn=ppn,
+                            signature=signature,
+                        )
         return False
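
The core of the rework is the grouping pass: sibling RDS_* blocks are accumulated into one record until a div.col-xs-12.space separator closes it, and the final record is flushed after the loop because it has no trailing separator. Below is a minimal, self-contained sketch of that technique; the HTML snippet and the group_entries helper are illustrative assumptions for demonstration, not the live OPAC markup and not part of this commit (RDS_STATUS is omitted for brevity).

from bs4 import BeautifulSoup

# Illustrative markup only: two holdings separated by a 'space' div.
HTML = """
<div class="panel-body">
  <div class="col-xs-12 rds-dl RDS_SIGNATURE"><div class="rds-dl-panel">ABC 123</div></div>
  <div class="col-xs-12 rds-dl RDS_LOCATION"><div class="rds-dl-panel">Lesesaal</div></div>
  <div class="col-xs-12 space"></div>
  <div class="col-xs-12 rds-dl RDS_SIGNATURE"><div class="rds-dl-panel">XYZ 987</div></div>
  <div class="col-xs-12 rds-dl RDS_LOCATION"><div class="rds-dl-panel">1. OG Semesterapparat</div></div>
</div>
"""


def group_entries(html):
    # Split the RDS_* blocks into one dict per holding, using 'space' divs as separators.
    panel = BeautifulSoup(html, "html.parser").select_one("div.panel-body")
    groups, cur = [], {}
    for node in panel.select(
        "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
    ):
        classes = node.get("class", [])
        if "space" in classes:  # separator: close the current record
            if cur:
                groups.append(cur)
                cur = {}
            continue
        # Value lives in the nested panel cell; fall back to the node's own text
        val_el = node.select_one(".rds-dl-panel")
        val = val_el.get_text(" ", strip=True) if val_el else node.get_text(" ", strip=True)
        key = "signature" if "RDS_SIGNATURE" in classes else "location"
        cur[key] = val
    if cur:  # the last record has no trailing separator
        groups.append(cur)
    return groups


print(group_entries(HTML))
# [{'signature': 'ABC 123', 'location': 'Lesesaal'},
#  {'signature': 'XYZ 987', 'location': '1. OG Semesterapparat'}]

With the holdings split into per-entry dicts, the wrapper can match "Semesterapparat" against each entry's own location instead of against the flat list of all RDS_LOCATION divs, which is what made the old find_all/find_next chain pick up signatures from the wrong holding.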