rework catalogue wrapper to split entries based on space div

2025-09-08 10:34:34 +02:00
parent 373257864f
commit 1f34442397


@@ -1,12 +1,13 @@
+import sys
+from datetime import datetime
+import loguru
 import requests
 from bs4 import BeautifulSoup
+from src import LOG_DIR
 from src.logic import BookData as Book
-from datetime import datetime
-import sys
-import loguru
-from src import LOG_DIR
 URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
 BASE = "https://rds.ibs-bw.de"
@@ -20,6 +21,8 @@ log.add(
     rotation="1 day",
     retention="1 month",
 )
+
+
 class Catalogue:
     def __init__(self, timeout=5):
         self.timeout = timeout
@@ -57,45 +60,65 @@ class Catalogue:
         log.info(f"Searching for term: {searchterm}")
         links = self.get_book_links(searchterm)
-        print(links)
         for link in links:
             result = self.search(link)
             # in result search for class col-xs-12 rds-dl RDS_LOCATION
             # if found, return text of href
             soup = BeautifulSoup(result, "html.parser")
-            location = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
-            for loc in location:
-                if f"1. OG Semesterapparat" in loc.text:
-                    title = (
-                        soup.find("div", class_="headline text")
-                        .text.replace("\n", "")
-                        .strip()
-                    )
-                    ppn = soup.find(
-                        "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
-                    )
-                    signature = soup.find(
-                        "div", class_="col-xs-12 rds-dl RDS_SIGNATURE"
-                    )
-                    if signature:
-                        signature = (
-                            signature.find_next("div")
-                            .find_next("div")
-                            .text.replace("\n", "")
-                            .strip()
-                        )
-                    # use ppn to find the next div and extract the text
-                    if ppn:
-                        ppn = ppn.find_next("div").text.replace("\n", "").strip()
-                    else:
-                        ppn = None
-                    isbn = soup.find(
-                        "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN"
-                    )
-                    if isbn:
-                        isbn = isbn.find_next("div").find_next("div").text
-                    else:
-                        isbn = None
-                    return Book(
-                        title=title, ppn=ppn, signature=signature, isbn=isbn, link=link
-                    )
+
+            # Optional (unchanged): title and ppn if you need them
+            title_el = soup.find("div", class_="headline text")
+            title = title_el.get_text(strip=True) if title_el else None
+
+            ppn_el = soup.find(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
+            )
+            ppn = ppn_el.get_text(strip=True) if ppn_el else None
+
+            signature = None
+            panel = soup.select_one("div.panel-body")
+            if panel:
+                # Collect the RDS_* blocks in order, using the 'space' divs as separators
+                groups = []
+                cur = {}
+                for node in panel.select(
+                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+                ):
+                    classes = node.get("class", [])
+                    # Separator between entries
+                    if "space" in classes:
+                        if cur:
+                            groups.append(cur)
+                            cur = {}
+                        continue
+                    # Read the value from the corresponding panel cell
+                    val_el = node.select_one(".rds-dl-panel")
+                    val = (
+                        val_el.get_text(" ", strip=True)
+                        if val_el
+                        else node.get_text(" ", strip=True)
+                    )
+
+                    if "RDS_SIGNATURE" in classes:
+                        cur["signature"] = val
+                    elif "RDS_STATUS" in classes:
+                        cur["status"] = val
+                    elif "RDS_LOCATION" in classes:
+                        cur["location"] = val
+
+                if cur:  # append the last group if not followed by a space
+                    groups.append(cur)
+
+                # Find the signature for the entry whose location mentions "Semesterapparat"
+                for g in groups:
+                    loc = g.get("location", "").lower()
+                    if "semesterapparat" in loc:
+                        signature = g.get("signature")
+                        return Book(
+                            title=title,
+                            ppn=ppn,
+                            signature=signature,
+                        )
         return False
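
The core of the rework is the grouping pass: sibling RDS_* blocks are accumulated into one record until a div.col-xs-12.space separator closes it, and the final record is flushed after the loop because it has no trailing separator. Below is a minimal, self-contained sketch of that technique; the HTML snippet and the group_entries helper are illustrative assumptions for demonstration, not the live OPAC markup and not part of this commit (RDS_STATUS is omitted for brevity).

from bs4 import BeautifulSoup

# Illustrative markup only: two holdings separated by a 'space' div.
HTML = """
<div class="panel-body">
  <div class="col-xs-12 rds-dl RDS_SIGNATURE"><div class="rds-dl-panel">ABC 123</div></div>
  <div class="col-xs-12 rds-dl RDS_LOCATION"><div class="rds-dl-panel">Lesesaal</div></div>
  <div class="col-xs-12 space"></div>
  <div class="col-xs-12 rds-dl RDS_SIGNATURE"><div class="rds-dl-panel">XYZ 987</div></div>
  <div class="col-xs-12 rds-dl RDS_LOCATION"><div class="rds-dl-panel">1. OG Semesterapparat</div></div>
</div>
"""


def group_entries(html):
    # Split the RDS_* blocks into one dict per holding, using 'space' divs as separators.
    panel = BeautifulSoup(html, "html.parser").select_one("div.panel-body")
    groups, cur = [], {}
    for node in panel.select(
        "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
    ):
        classes = node.get("class", [])
        if "space" in classes:  # separator: close the current record
            if cur:
                groups.append(cur)
                cur = {}
            continue
        # Value lives in the nested panel cell; fall back to the node's own text
        val_el = node.select_one(".rds-dl-panel")
        val = val_el.get_text(" ", strip=True) if val_el else node.get_text(" ", strip=True)
        key = "signature" if "RDS_SIGNATURE" in classes else "location"
        cur[key] = val
    if cur:  # the last record has no trailing separator
        groups.append(cur)
    return groups


print(group_entries(HTML))
# [{'signature': 'ABC 123', 'location': 'Lesesaal'},
#  {'signature': 'XYZ 987', 'location': '1. OG Semesterapparat'}]

With the holdings split into per-entry dicts, the wrapper can match "Semesterapparat" against each entry's own location instead of against the flat list of all RDS_LOCATION divs, which is what made the old find_all/find_next chain pick up signatures from the wrong holding.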