From 1f34442397b856c510d613170d0a9850ff2f4700 Mon Sep 17 00:00:00 2001
From: WorldTeacher
Date: Mon, 8 Sep 2025 10:34:34 +0200
Subject: [PATCH] rework catalogue wrapper to split entries based on space div

---
 src/backend/catalogue.py | 101 ++++++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 39 deletions(-)

diff --git a/src/backend/catalogue.py b/src/backend/catalogue.py
index 4f72ec1..439972d 100644
--- a/src/backend/catalogue.py
+++ b/src/backend/catalogue.py
@@ -1,12 +1,13 @@
+import sys
+from datetime import datetime
+
+import loguru
 import requests
 from bs4 import BeautifulSoup
 
+from src import LOG_DIR
 from src.logic import BookData as Book
 
-from datetime import datetime
-import sys
-import loguru
-from src import LOG_DIR
 
 URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
 BASE = "https://rds.ibs-bw.de"
@@ -20,6 +21,8 @@ log.add(
     rotation="1 day",
     retention="1 month",
 )
+
+
 class Catalogue:
     def __init__(self, timeout=5):
         self.timeout = timeout
@@ -57,45 +60,65 @@ class Catalogue:
 
         log.info(f"Searching for term: {searchterm}")
         links = self.get_book_links(searchterm)
+        print(links)
         for link in links:
             result = self.search(link)
             # in result search for class col-xs-12 rds-dl RDS_LOCATION
             # if found, return text of href
             soup = BeautifulSoup(result, "html.parser")
-            location = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
-            for loc in location:
-                if f"1. OG Semesterapparat" in loc.text:
-                    title = (
-                        soup.find("div", class_="headline text")
-                        .text.replace("\n", "")
-                        .strip()
+
+            # Optional (unchanged): title and ppn if you need them
+            title_el = soup.find("div", class_="headline text")
+            title = title_el.get_text(strip=True) if title_el else None
+
+            ppn_el = soup.find(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
+            )
+            ppn = ppn_el.get_text(strip=True) if ppn_el else None
+
+            signature = None
+
+            panel = soup.select_one("div.panel-body")
+            if panel:
+                # Collect the RDS_* blocks in order, using the 'space' divs as separators
+                groups = []
+                cur = {}
+                for node in panel.select(
+                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+                ):
+                    classes = node.get("class", [])
+                    # Separator between entries
+                    if "space" in classes:
+                        if cur:
+                            groups.append(cur)
+                            cur = {}
+                        continue
+
+                    # Read the value from the corresponding panel cell
+                    val_el = node.select_one(".rds-dl-panel")
+                    val = (
+                        val_el.get_text(" ", strip=True)
+                        if val_el
+                        else node.get_text(" ", strip=True)
                     )
-                    ppn = soup.find(
-                        "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
-                    )
-                    signature = soup.find(
-                        "div", class_="col-xs-12 rds-dl RDS_SIGNATURE"
-                    )
-                    if signature:
-                        signature = (
-                            signature.find_next("div")
-                            .find_next("div")
-                            .text.replace("\n", "")
-                            .strip()
+
+                    if "RDS_SIGNATURE" in classes:
+                        cur["signature"] = val
+                    elif "RDS_STATUS" in classes:
+                        cur["status"] = val
+                    elif "RDS_LOCATION" in classes:
+                        cur["location"] = val
+
+                if cur:  # append the last group if not followed by a space
+                    groups.append(cur)
+
+                # Find the signature for the entry whose location mentions "Semesterapparat"
+                for g in groups:
+                    loc = g.get("location", "").lower()
+                    if "semesterapparat" in loc:
+                        signature = g.get("signature")
+                        return Book(
+                            title=title,
+                            ppn=ppn,
+                            signature=signature,
                         )
-                    # use ppn to find the next div and extract the text
-                    if ppn:
-                        ppn = ppn.find_next("div").text.replace("\n", "").strip()
-                    else:
-                        ppn = None
-                    isbn = soup.find(
-                        "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN"
-                    )
-                    if isbn:
-                        isbn = isbn.find_next("div").find_next("div").text
-                    else:
-                        isbn = None
-                    return Book(
-                        title=title, ppn=ppn, signature=signature, isbn=isbn, link=link
-                    )
-        return False
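
Note on the approach (illustrative only): the rework walks the detail page's panel-body, treats each
"col-xs-12 space" div as a separator between per-copy entries, collects the RDS_SIGNATURE / RDS_STATUS /
RDS_LOCATION values of each entry into one dict, and then takes the signature of the entry whose location
mentions "Semesterapparat". A minimal standalone sketch of that grouping follows. The markup is made up:
only the class names are borrowed from the selectors in find_book above, the text values are placeholders,
and the real RDS detail page contains more structure.

    from bs4 import BeautifulSoup

    # Assumed, simplified markup; values are placeholders, not real catalogue data.
    HTML = """
    <div class="panel-body">
      <div class="col-xs-12 rds-dl RDS_SIGNATURE"><div class="rds-dl-panel">FR 1234</div></div>
      <div class="col-xs-12 rds-dl RDS_STATUS"><div class="rds-dl-panel">ausleihbar</div></div>
      <div class="col-xs-12 rds-dl RDS_LOCATION"><div class="rds-dl-panel">Freihandbestand</div></div>
      <div class="col-xs-12 space"></div>
      <div class="col-xs-12 rds-dl RDS_SIGNATURE"><div class="rds-dl-panel">SEM 42/7</div></div>
      <div class="col-xs-12 rds-dl RDS_STATUS"><div class="rds-dl-panel">Praesenzbestand</div></div>
      <div class="col-xs-12 rds-dl RDS_LOCATION"><div class="rds-dl-panel">1. OG Semesterapparat</div></div>
    </div>
    """

    panel = BeautifulSoup(HTML, "html.parser").select_one("div.panel-body")
    groups, cur = [], {}
    for node in panel.select(
        "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
    ):
        classes = node.get("class", [])
        if "space" in classes:  # separator: close the current copy's group
            if cur:
                groups.append(cur)
                cur = {}
            continue
        val_el = node.select_one(".rds-dl-panel")
        val = val_el.get_text(" ", strip=True) if val_el else node.get_text(" ", strip=True)
        if "RDS_SIGNATURE" in classes:
            cur["signature"] = val
        elif "RDS_STATUS" in classes:
            cur["status"] = val
        elif "RDS_LOCATION" in classes:
            cur["location"] = val
    if cur:  # last group has no trailing space div
        groups.append(cur)

    print(groups)
    # [{'signature': 'FR 1234', 'status': 'ausleihbar', 'location': 'Freihandbestand'},
    #  {'signature': 'SEM 42/7', 'status': 'Praesenzbestand', 'location': '1. OG Semesterapparat'}]
    print([g["signature"] for g in groups if "semesterapparat" in g.get("location", "").lower()])
    # ['SEM 42/7']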