import regex
import requests
from bs4 import BeautifulSoup

from src.logic import BookData as Book
from src.shared.logging import log

# Pre-built RDS advanced-search URL; the first "lookfor" slot is filled with the
# search term via str.format().
URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
BASE = "https://rds.ibs-bw.de"


class Catalogue:
    def __init__(self, timeout: int = 15):
        self.timeout = timeout
        reachable = self.check_connection()
        if not reachable:
            log.error("No internet connection available.")
            raise ConnectionError("No internet connection available.")

    def check_connection(self) -> bool:
        # Probe an external host to verify that the machine is online.
        try:
            response = requests.get("https://www.google.com", timeout=self.timeout)
            return response.status_code == 200
        except requests.exceptions.RequestException as e:
            log.error(f"Could not connect to google.com: {e}")
            return False

    def search_book(self, searchterm: str) -> str:
        # Run the OPAC search and return the raw result-list HTML.
        response = requests.get(URL.format(searchterm), timeout=self.timeout)
        return response.text

    def search(self, link: str) -> str:
        # Fetch an arbitrary catalogue page and return its HTML.
        response = requests.get(link, timeout=self.timeout)
        return response.text

    def get_book_links(self, searchterm: str) -> list[str]:
        # Collect the absolute URLs of all detail pages on the result list.
        response = self.search_book(searchterm)
        soup = BeautifulSoup(response, "html.parser")
        links = soup.find_all("a", class_="title getFull")
        return [BASE + link["href"] for link in links]

    def get_book(self, searchterm: str) -> Book | None:
        log.info(f"Searching for term: {searchterm}")
        links = self.get_book_links(searchterm)
        for elink in links:
            result = self.search(elink)
            soup = BeautifulSoup(result, "html.parser")

            # Title and PPN of the record.
            title_el = soup.find("div", class_="headline text")
            title = title_el.get_text(strip=True) if title_el else None
            ppn_el = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
            )
            # The value lives in the sibling "rds-dl-panel" cell next to the head cell.
            ppn = (
                ppn_el.find_next_sibling(
                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                ).get_text(strip=True)
                if ppn_el
                else None
            )

            # Edition, read from the RDS_EDITION head cell's panel sibling.
            edition_el = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
            )
            edition = (
                edition_el.find_next_sibling(
                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                ).get_text(strip=True)
                if edition_el
                else None
            )

            # Authors: collect the names of the <a> links in each RDS_PERSON panel cell.
            authors = soup.find_all(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
            )
            author = None
            if authors:
                author_names = []
                for author_head in authors:
                    author_panel = author_head.find_next_sibling(
                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                    )
                    if author_panel:
                        for name_link in author_panel.find_all("a"):
                            author_names.append(name_link.text.strip())
                author = ";".join(author_names) if author_names else None
            # Holdings: signature / status / location triples live in the panel body.
            panel = soup.select_one("div.panel-body")
            if panel:
                # Collect the RDS_* blocks in order, using the 'space' divs as separators.
                groups = []
                cur = {}
                for node in panel.select(
                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
                ):
                    classes = node.get("class", [])
                    # A 'space' div separates one holding from the next.
                    if "space" in classes:
                        if cur:
                            groups.append(cur)
                            cur = {}
                        continue
                    # Read the value from the corresponding panel cell.
                    val_el = node.select_one(".rds-dl-panel")
                    val = (
                        val_el.get_text(" ", strip=True)
                        if val_el
                        else node.get_text(" ", strip=True)
                    )
                    if "RDS_SIGNATURE" in classes:
                        cur["signature"] = val
                    elif "RDS_STATUS" in classes:
                        cur["status"] = val
                    elif "RDS_LOCATION" in classes:
                        cur["location"] = val
                if cur:  # append the last group if it is not followed by a space div
                    groups.append(cur)

                # Prefer the holding whose location mentions "Semesterapparat";
                # otherwise fall back to the first holding.
                for g in groups:
                    loc = g.get("location", "").lower()
                    if "semesterapparat" in loc:
                        return Book(
                            title=title,
                            ppn=ppn,
                            signature=g.get("signature"),
                            library_location=loc.split("-")[-1],
                            link=elink,
                            author=author,
                            edition=edition,
                        )
                if groups:
                    first = groups[0]
                    loc = first.get("location", "").lower()
                    return Book(
                        title=title,
                        ppn=ppn,
                        signature=first.get("signature"),
                        library_location=loc.split("\n\n")[-1],
                        link=elink,
                        author=author,
                        edition=edition,
                    )
        return None

    def get(self, ppn: str) -> Book | None:
        # Based on the PPN, fetch the full record page (title, people, edition,
        # year, language, pages, ISBN, ...). Field extraction is not implemented yet.
        link = f"https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{ppn}"
        result = self.search(link)
        soup = BeautifulSoup(result, "html.parser")  # TODO: parse the record fields
        return None

    def get_ppn(self, searchterm: str) -> str | None:
        # The PPN is the last path segment of each detail-page URL, so the pages
        # themselves do not need to be fetched.
        links = self.get_book_links(searchterm)
        ppn = None
        for link in links:
            ppn = link.split("/")[-1]
            if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
                return ppn
        # Fall back to the last candidate even if it did not match the pattern.
        return ppn

    def get_semesterapparat_number(self, searchterm: str) -> int | str:
        links = self.get_book_links(searchterm)
        for link in links:
            result = self.search(link)
            soup = BeautifulSoup(result, "html.parser")
            # Inspect the RDS_LOCATION blocks of the detail page.
            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            for location_el in locations:
                if "Semesterapparat-" in location_el.text:
                    match = regex.search(r"Semesterapparat-(\d+)", location_el.text)
                    if match:
                        return int(match.group(1))
                # Handbibliothek and any other location: return the raw location
                # text of the first entry instead of a number.
                return location_el.text.strip().split("\n\n")[-1].strip()
        return 0

    def get_author(self, link: str) -> str | None:
        links = self.get_book_links(f"kid:{link}")
        author = None
        for detail_link in links:
            result = self.search(detail_link)
            soup = BeautifulSoup(result, "html.parser")
            # Collect all authors and return them as a single string separated by ";".
            authors = soup.find_all(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
            )
            if authors:
                # The names are the <a> links inside each RDS_PERSON panel cell.
                author_names = []
                for author_head in authors:
                    author_panel = author_head.find_next_sibling(
                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                    )
                    if author_panel:
                        for name_link in author_panel.find_all("a"):
                            author_names.append(name_link.text.strip())
                author = "; ".join(author_names)
        return author

    def get_signature(self, isbn: str) -> str | None:
        links = self.get_book_links(f"{isbn}")
        for link in links:
            result = self.search(link)
            soup = BeautifulSoup(result, "html.parser")
            panel = soup.select_one("div.panel-body")
            if panel:
                # Collect the RDS_* blocks in order, using the 'space' divs as separators.
                groups = []
                cur = {}
                for node in panel.select(
                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
                ):
                    classes = node.get("class", [])
                    # A 'space' div separates one holding from the next.
                    if "space" in classes:
                        if cur:
                            groups.append(cur)
                            cur = {}
                        continue
                    # Read the value from the corresponding panel cell.
                    val_el = node.select_one(".rds-dl-panel")
                    val = (
                        val_el.get_text(" ", strip=True)
                        if val_el
                        else node.get_text(" ", strip=True)
                    )
                    if "RDS_SIGNATURE" in classes:
                        cur["signature"] = val
                    elif "RDS_STATUS" in classes:
                        cur["status"] = val
                    elif "RDS_LOCATION" in classes:
                        cur["location"] = val
                if cur:  # append the last group if it is not followed by a space div
                    groups.append(cur)

                # Prefer the signature of the "Semesterapparat" holding; otherwise
                # fall back to the first holding.
                for g in groups:
                    loc = g.get("location", "").lower()
                    if "semesterapparat" in loc:
                        return g.get("signature")
                if groups:
                    return groups[0].get("signature")
        log.info("No signature found")
        return None

    def in_library(self, ppn: str | None) -> bool:
        if ppn is None:
            return False
        links = self.get_book_links(f"kid:{ppn}")
        return len(links) > 0

    def get_location(self, ppn: str | None) -> str | None:
        if ppn is None:
            return None
        book = self.get_book(f"{ppn}")
        if book is None:
            return None
        return book.library_location
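

# Usage sketch (not part of the class): a minimal way to exercise the catalogue
# from the command line. The search term below and the printed BookData fields
# (title, signature, library_location) are illustrative assumptions based on the
# keyword arguments passed to Book() in get_book(); adjust them to the actual
# src.logic.BookData definition.
if __name__ == "__main__":
    catalogue = Catalogue(timeout=10)
    term = "9783161484100"  # hypothetical ISBN, used only as an example search term
    book = catalogue.get_book(term)
    if book is None:
        log.info(f"No record found for '{term}'")
    else:
        log.info(
            f"Found '{book.title}' with signature {book.signature} "
            f"at {book.library_location}"
        )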