# SemesterapparatsManager/src/backend/catalogue.py

from urllib.parse import quote_plus

import regex
import requests
from bs4 import BeautifulSoup

from src.logic import BookData as Book
from src.shared.logging import log

# Search URL template for the catalogue. The single "{}" placeholder is the
# value of the "allfields" search field and is filled in with str.format by
# Catalogue.search_book; every other search field (au, ti, ct, isn, ...) is
# left empty. "%5B%5D" is the percent-encoded form of "[]".
URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
# Scheme and host that are prepended to the hrefs of search result links.
BASE = "https://rds.ibs-bw.de"


class Catalogue:
    def __init__(self, timeout=15):
        self.timeout = timeout
        reachable = self.check_connection()
        if not reachable:
            log.error("No internet connection available.")
            raise ConnectionError("No internet connection available.")

    def check_connection(self):
        try:
            response = requests.get("https://www.google.com", timeout=self.timeout)
            if response.status_code == 200:
                return True
        except requests.exceptions.RequestException as e:
            log.error(f"Could not connect to google.com: {e}")

    def search_book(self, searchterm: str):
        response = requests.get(URL.format(searchterm), timeout=self.timeout)
        return response.text

    def search(self, link: str):
        response = requests.get(link, timeout=self.timeout)
        return response.text

    def get_book_links(self, searchterm: str):
        response = self.search_book(searchterm)
        soup = BeautifulSoup(response, "html.parser")
        links = soup.find_all("a", class_="title getFull")
        res = []
        for link in links:
            res.append(BASE + link["href"])
        return res

    def get_book(self, searchterm: str):
        log.info(f"Searching for term: {searchterm}")

        links = self.get_book_links(searchterm)
        print(links)
        for elink in links:
            result = self.search(elink)
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")

            # Optional (unchanged): title and ppn if you need them
            title_el = soup.find("div", class_="headline text")
            title = title_el.get_text(strip=True) if title_el else None

            ppn_el = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
            )
            # in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
            ppn = (
                ppn_el.find_next_sibling(
                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                ).get_text(strip=True)
                if ppn_el
                else None
            )

            # get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION
            edition_el = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
            )
            edition = (
                edition_el.find_next_sibling(
                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                ).get_text(strip=True)
                if edition_el
                else None
            )

            authors = soup.find_all(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
            )
            author = None
            if authors:
                # get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
                author_names = []
                for author in authors:
                    panel = author.find_next_sibling(
                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                    )
                    if panel:
                        links = panel.find_all("a")
                        for link in links:
                            author_names.append(link.text.strip())
                author = (
                    ";".join(author_names) if len(author_names) > 1 else author_names[0]
                )
            signature = None

            panel = soup.select_one("div.panel-body")
            if panel:
                # Collect the RDS_* blocks in order, using the 'space' divs as separators
                groups = []
                cur = {}
                for node in panel.select(
                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
                ):
                    classes = node.get("class", [])
                    # Separator between entries
                    if "space" in classes:
                        if cur:
                            groups.append(cur)
                            cur = {}
                        continue

                    # Read the value from the corresponding panel cell
                    val_el = node.select_one(".rds-dl-panel")
                    val = (
                        val_el.get_text(" ", strip=True)
                        if val_el
                        else node.get_text(" ", strip=True)
                    )

                    if "RDS_SIGNATURE" in classes:
                        cur["signature"] = val
                    elif "RDS_STATUS" in classes:
                        cur["status"] = val
                    elif "RDS_LOCATION" in classes:
                        cur["location"] = val

                if cur:  # append the last group if not followed by a space
                    groups.append(cur)

                # Find the signature for the entry whose location mentions "Semesterapparat"
                for g in groups:
                    loc = g.get("location", "").lower()
                    if "semesterapparat" in loc:
                        signature = g.get("signature")
                        return Book(
                            title=title,
                            ppn=ppn,
                            signature=signature,
                            library_location=loc.split("-")[-1],
                            link=elink,
                            author=author,
                            edition=edition,
                        )
                    else:
                        return Book(
                            title=title,
                            ppn=ppn,
                            signature=signature,
                            library_location=loc.split("\n\n")[-1],
                            link=elink,
                            author=author,
                            edition=edition,
                        )

    def get(self, ppn: str) -> Book | None:
        # based on PPN, get title, people, edition, year, language, pages, isbn,
        link = f"https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{ppn}"
        result = self.search(link)
        soup = BeautifulSoup(result, "html.parser")

    def get_ppn(self, searchterm: str) -> str | None:
        links = self.get_book_links(searchterm)
        ppn = None
        for link in links:
            result = self.search(link)
            soup = BeautifulSoup(result, "html.parser")
            print(link)
            ppn = link.split("/")[-1]
            if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
                return ppn
        return ppn

    def get_semesterapparat_number(self, searchterm: str) -> int:
        links = self.get_book_links(searchterm)
        for link in links:
            result = self.search(link)
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")

            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            for location_el in locations:
                if "Semesterapparat-" in location_el.text:
                    match = regex.search(r"Semesterapparat-(\d+)", location_el.text)
                    if match:
                        return int(match.group(1))
                if "Handbibliothek-" in location_el.text:
                    return location_el.text.strip().split("\n\n")[-1].strip()
                return location_el.text.strip().split("\n\n")[-1].strip()
        return 0

    def get_author(self, link: str) -> str:
        links = self.get_book_links(f"kid:{link}")
        author = None
        for link in links:
            # print(link)
            result = self.search(link)
            soup = BeautifulSoup(result, "html.parser")
            # get all authors, return them as a string seperated by ;
            authors = soup.find_all(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
            )
            if authors:
                # get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
                author_names = []
                for author in authors:
                    panel = author.find_next_sibling(
                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                    )
                    if panel:
                        links = panel.find_all("a")
                        for link in links:
                            author_names.append(link.text.strip())
                author = "; ".join(author_names)
        return author

    def get_signature(self, isbn: str):
        links = self.get_book_links(f"{isbn}")
        signature = None
        for link in links:
            result = self.search(link)
            soup = BeautifulSoup(result, "html.parser")
            panel = soup.select_one("div.panel-body")
            if panel:
                # Collect the RDS_* blocks in order, using the 'space' divs as separators
                groups = []
                cur = {}
                for node in panel.select(
                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
                ):
                    classes = node.get("class", [])
                    # Separator between entries
                    if "space" in classes:
                        if cur:
                            groups.append(cur)
                            cur = {}
                        continue

                    # Read the value from the corresponding panel cell
                    val_el = node.select_one(".rds-dl-panel")
                    val = (
                        val_el.get_text(" ", strip=True)
                        if val_el
                        else node.get_text(" ", strip=True)
                    )

                    if "RDS_SIGNATURE" in classes:
                        cur["signature"] = val
                    elif "RDS_STATUS" in classes:
                        cur["status"] = val
                    elif "RDS_LOCATION" in classes:
                        cur["location"] = val

                if cur:  # append the last group if not followed by a space
                    groups.append(cur)

                # Find the signature for the entry whose location mentions "Semesterapparat"
                for g in groups:
                    print(g)
                    loc = g.get("location", "").lower()
                    if "semesterapparat" in loc:
                        signature = g.get("signature")
                        return signature
                    else:
                        signature = g.get("signature")
                        return signature
        print("No signature found")
        return signature

    def in_library(self, ppn: str) -> bool:
        if ppn is None:
            return False
        links = self.get_book_links(f"kid:{ppn}")
        return len(links) > 0

    def get_location(self, ppn: str) -> str | None:
        if ppn is None:
            return None
        link = self.get_book(f"{ppn}")
        if link is None:
            return None
        return link.library_location