diff --git a/src/bibapi/catalogue.py b/src/bibapi/catalogue.py
index e69de29..a933a81 100644
--- a/src/bibapi/catalogue.py
+++ b/src/bibapi/catalogue.py
@@ -0,0 +1,286 @@
+from typing import List
+from urllib.parse import quote_plus
+
+import regex
+import requests
+from bs4 import BeautifulSoup
+
+# NOTE: assumed import path -- Book is the record model (title, ppn, signature,
+# library_location, link, author, edition) constructed below; adjust if it lives elsewhere.
+from .book import Book
+
+URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
+BASE = "https://rds.ibs-bw.de"
+
+
+class Catalogue:
+    def __init__(self, timeout: int = 15):
+        self.timeout = timeout
+        if not self.check_connection():
+            raise ConnectionError("No internet connection available.")
+
+    def check_connection(self) -> bool:
+        try:
+            response = requests.get("https://www.google.com", timeout=self.timeout)
+            return response.status_code == 200
+        except requests.exceptions.RequestException as e:
+            print(f"Could not connect to google.com: {e}")
+            return False
+
+    def search_book(self, searchterm: str) -> str:
+        # URL-encode the search term so spaces and special characters survive the query string
+        response = requests.get(
+            URL.format(quote_plus(searchterm)), timeout=self.timeout
+        )
+        return response.text
+
+    def search(self, link: str) -> str:
+        response = requests.get(link, timeout=self.timeout)
+        return response.text
+
+    def get_book_links(self, searchterm: str) -> List[str]:
+        response = self.search_book(searchterm)
+        soup = BeautifulSoup(response, "html.parser")
+        links = soup.find_all("a", class_="title getFull")
+        res: List[str] = []
+        for link in links:
+            res.append(BASE + link["href"])  # type: ignore
+        return res
+
+    def get_book(self, searchterm: str) -> Book | None:
+        links = self.get_book_links(searchterm)
+        for elink in links:
+            result = self.search(elink)
+            soup = BeautifulSoup(result, "html.parser")
+
+            # Title of the record
+            title_el = soup.find("div", class_="headline text")
+            title = title_el.get_text(strip=True) if title_el else None
+
+            # PPN: the value sits in the sibling "rds-dl-panel" cell of the RDS_PPN head
+            ppn_el = soup.find(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
+            )
+            ppn = (
+                ppn_el.find_next_sibling(
+                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                ).get_text(strip=True)
+                if ppn_el
+                else None
+            )
+
+            # Edition, read the same way from the RDS_EDITION head
+            edition_el = soup.find(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
+            )
+            edition = (
+                edition_el.find_next_sibling(
+                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                ).get_text(strip=True)
+                if edition_el
+                else None
+            )
+
+            # Authors: collect the link texts in each RDS_PERSON panel cell
+            author_heads = soup.find_all(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
+            )
+            author = None
+            if author_heads:
+                author_names = []
+                for head in author_heads:
+                    author_panel = head.find_next_sibling(
+                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                    )
+                    if author_panel:
+                        for a in author_panel.find_all("a"):
+                            author_names.append(a.text.strip())
+                author = "; ".join(author_names) if author_names else None
+
+            panel = soup.select_one("div.panel-body")
+            if panel:
+                # Collect the RDS_* blocks in order, using the 'space' divs as separators
+                groups = []
+                cur = {}
+                for node in panel.select(
+                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+                ):
+                    classes = node.get("class", [])
+                    # Separator between holdings
+                    if "space" in classes:
+                        if cur:
+                            groups.append(cur)
+                        cur = {}
+                        continue
+
+                    # Read the value from the corresponding panel cell
+                    val_el = node.select_one(".rds-dl-panel")
+                    val = (
+                        val_el.get_text(" ", strip=True)
+                        if val_el
+                        else node.get_text(" ", strip=True)
+                    )
+
+                    if "RDS_SIGNATURE" in classes:
+                        cur["signature"] = val
+                    elif "RDS_STATUS" in classes:
+                        cur["status"] = val
+                    elif "RDS_LOCATION" in classes:
+                        cur["location"] = val
+
+                if cur:  # append the last group if not followed by a space
+                    groups.append(cur)
+
+                # Prefer the holding whose location mentions "Semesterapparat";
+                # otherwise fall back to the first holding found
+                fallback = None
+                for g in groups:
+                    loc = g.get("location", "").lower()
+                    if "semesterapparat" in loc:
+                        return Book(
+                            title=title,
+                            ppn=ppn,
+                            signature=g.get("signature"),
+                            library_location=loc.split("-")[-1],
+                            link=elink,
+                            author=author,
+                            edition=edition,
+                        )
+                    if fallback is None:
+                        fallback = Book(
+                            title=title,
+                            ppn=ppn,
+                            signature=g.get("signature"),
+                            library_location=loc,
+                            link=elink,
+                            author=author,
+                            edition=edition,
+                        )
+                if fallback is not None:
+                    return fallback
+        return None
+
+    def get(self, ppn: str) -> Book | None:
+        # TODO: based on the PPN, parse title, people, edition, year, language,
+        # pages, isbn, ... from the record page. Not implemented yet.
+        link = f"https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{ppn}"
+        result = self.search(link)
+        soup = BeautifulSoup(result, "html.parser")  # parsed page; fields not extracted yet
+        return None
+
+    def get_ppn(self, searchterm: str) -> str | None:
+        links = self.get_book_links(searchterm)
+        for link in links:
+            # The PPN is the last path segment of the detail link
+            ppn = link.split("/")[-1]
+            if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
+                return ppn
+        return None
+
+    def get_semesterapparat_number(self, searchterm: str) -> int | str:
+        links = self.get_book_links(searchterm)
+        for link in links:
+            result = self.search(link)
+            soup = BeautifulSoup(result, "html.parser")
+
+            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
+            # Prefer an explicit Semesterapparat number if any location carries one
+            for location_el in locations:
+                match = regex.search(r"Semesterapparat-(\d+)", location_el.text)
+                if match:
+                    return int(match.group(1))
+            # Otherwise return the first location's text (e.g. "Handbibliothek-...")
+            if locations:
+                return locations[0].text.strip().split("\n\n")[-1].strip()
+        return 0
+
+    def get_author(self, link: str) -> str | None:
+        links = self.get_book_links(f"kid:{link}")
+        for detail_link in links:
+            result = self.search(detail_link)
+            soup = BeautifulSoup(result, "html.parser")
+            # Collect all authors and return them as a single string separated by ";"
+            author_heads = soup.find_all(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
+            )
+            if author_heads:
+                author_names = []
+                for head in author_heads:
+                    author_panel = head.find_next_sibling(
+                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                    )
+                    if author_panel:
+                        for a in author_panel.find_all("a"):
+                            author_names.append(a.text.strip())
+                if author_names:
+                    return "; ".join(author_names)
+        return None
+
+    def get_signature(self, isbn: str) -> str | None:
+        links = self.get_book_links(isbn)
+        for link in links:
+            result = self.search(link)
+            soup = BeautifulSoup(result, "html.parser")
+            panel = soup.select_one("div.panel-body")
+            if panel:
+                # Collect the RDS_* blocks in order, using the 'space' divs as separators
+                groups = []
+                cur = {}
+                for node in panel.select(
+                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+                ):
+                    classes = node.get("class", [])
+                    # Separator between holdings
+                    if "space" in classes:
+                        if cur:
+                            groups.append(cur)
+                        cur = {}
+                        continue
+
+                    # Read the value from the corresponding panel cell
+                    val_el = node.select_one(".rds-dl-panel")
+                    val = (
+                        val_el.get_text(" ", strip=True)
+                        if val_el
+                        else node.get_text(" ", strip=True)
+                    )
+
+                    if "RDS_SIGNATURE" in classes:
+                        cur["signature"] = val
+                    elif "RDS_STATUS" in classes:
+                        cur["status"] = val
+                    elif "RDS_LOCATION" in classes:
+                        cur["location"] = val
+
+                if cur:  # append the last group if not followed by a space
+                    groups.append(cur)
+
+                # Prefer the signature of the "Semesterapparat" holding,
+                # otherwise fall back to the first holding's signature
+                fallback = None
+                for g in groups:
+                    loc = g.get("location", "").lower()
+                    if "semesterapparat" in loc:
+                        return g.get("signature")
+                    if fallback is None:
+                        fallback = g.get("signature")
+                if fallback is not None:
+                    return fallback
+        print("No signature found")
+        return None
+
+    def in_library(self, ppn: str | None) -> bool:
+        if ppn is None:
+            return False
+        links = self.get_book_links(f"kid:{ppn}")
+        return len(links) > 0
+
+    def get_location(self, ppn: str | None) -> str | None:
+        if ppn is None:
+            return None
+        book = self.get_book(ppn)
+        if book is None:
+            return None
+        return book.library_location
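
For review purposes, a minimal usage sketch of the class added above, assuming the package is importable as bibapi.catalogue and that Book exposes the fields it is constructed with; the search term is a placeholder:

    from bibapi.catalogue import Catalogue

    cat = Catalogue(timeout=10)  # raises ConnectionError when offline
    book = cat.get_book("example search term")  # placeholder query
    if book is not None:
        print(book.title, book.signature, book.library_location)
    print(cat.get_semesterapparat_number("example search term"))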