import sys
from typing import Any, Optional, Union

import loguru
import requests
from bs4 import BeautifulSoup

# import sleep_and_retry decorator to retry requests
from ratelimit import limits, sleep_and_retry

from src import LOG_DIR
from src.logic.dataclass import BookData
from src.transformers import ARRAYData, BibTeXData, COinSData, RDSData, RISData
from src.transformers.transformers import RDS_AVAIL_DATA, RDS_GENERIC_DATA

log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
# logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")

API_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{}/"
PPN_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
BASE = "https://rds.ibs-bw.de"
# TITLE = "RDS_TITLE"
SIGNATURE = "RDS_SIGNATURE"
EDITION = "RDS_EDITION"
ISBN = "RDS_ISBN"
AUTHOR = "RDS_PERSON"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(HTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    ),
    "Accept-Language": "en-US, en;q=0.5",
}
RATE_LIMIT = 20
RATE_PERIOD = 30


class WebRequest:
    def __init__(self) -> None:
        """Request data from the web, and format it depending on the mode."""
        self.apparat = None
        self.use_any = False  # use any book that matches the search term
        self.signature = None
        self.ppn = None
        self.data = None
        self.timeout = 5
        log.info("Initialized WebRequest")

    @property
    def use_any_book(self):
        """use any book that matches the search term"""
        self.use_any = True
        log.info("Using any book")
        return self

    def set_apparat(self, apparat: int):
        self.apparat = apparat
        if int(self.apparat) < 10:
            self.apparat = f"0{self.apparat}"
        log.info(f"Set apparat to {self.apparat}")
        return self

    def get_ppn(self, signature: str):
        self.signature = signature
        if "+" in signature:
            signature = signature.replace("+", "%2B")
        if "doi.org" in signature:
            signature = signature.split("/")[-1]
        self.ppn = signature
        return self

    @sleep_and_retry
    @limits(calls=RATE_LIMIT, period=RATE_PERIOD)
    def search_book(self, searchterm: str) -> str:
        response = requests.get(PPN_URL.format(searchterm), timeout=self.timeout)
        return response.text

    def get_book_links(self, searchterm: str) -> list[str]:
        response: str = self.search_book(searchterm)  # type:ignore
        soup = BeautifulSoup(response, "html.parser")
        links = soup.find_all("a", class_="title getFull")
        res: list[str] = []
        for link in links:
            res.append(BASE + link["href"])
        return res

    @sleep_and_retry
    @limits(calls=RATE_LIMIT, period=RATE_PERIOD)
    def search(self, link: str):
        try:
            response = requests.get(link, timeout=self.timeout)
            return response.text
        except requests.exceptions.RequestException as e:
            log.error(f"Request failed: {e}")
            return None

    def get_data(self) -> Union[list[str], None]:
        links = self.get_book_links(self.ppn)
        log.debug(f"Links: {links}")
        return_data: list[str] = []
        for link in links:
            result: str = self.search(link)  # type:ignore
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")
            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            if locations:
                for location in locations:
                    if "1. OG Semesterapparat" in location.text:
                        log.success("Found Semesterapparat, adding entry")
                        pre_tag = soup.find_all("pre")
                        return_data = []
                        if pre_tag:
                            for tag in pre_tag:
                                data = tag.text.strip()
                                return_data.append(data)
                            return return_data
                        else:
                            log.error("No <pre> tag found")
                            return return_data
                    else:
                        item_location = location.find(
                            "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                        ).text.strip()
                        log.debug(f"Item location: {item_location}")
                        if self.use_any:
                            pre_tag = soup.find_all("pre")
                            if pre_tag:
                                for tag in pre_tag:
                                    data = tag.text.strip()
                                    return_data.append(data)
                                return return_data
                            else:
                                log.error("No 
 tag found")
                                raise ValueError("No 
 tag found")
                        elif f"Semesterapparat-{self.apparat}" in item_location:
                            pre_tag = soup.find_all("pre")
                            return_data = []
                            if pre_tag:
                                for tag in pre_tag:
                                    data = tag.text.strip()
                                    return_data.append(data)
                                return return_data
                            else:
                                log.error("No 
 tag found")
                                return return_data
                        else:
                            log.error(
                                f"Signature {self.signature} not found in {item_location}"
                            )
                            # return_data = []

        return return_data

    def get_data_elsa(self):
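        """Return the <pre> export blocks of the first record that has a location entry, without filtering by apparat."""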
        links = self.get_book_links(self.ppn)
        for link in links:
            result = self.search(link)
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")
            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            if locations:
                for _ in locations:
                    pre_tag = soup.find_all("pre")
                    return_data = []
                    if pre_tag:
                        for tag in pre_tag:
                            data = tag.text.strip()
                            return_data.append(data)
                        return return_data
                    else:
                        log.error("No 
 tag found")
                        return return_data
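
# Usage sketch (placeholder apparat number): the WebRequest methods return self, so a
# lookup can be chained; "CU 8500 K64" is the signature used in the __main__ example.
#
#     raw = WebRequest().set_apparat(3).get_ppn("CU 8500 K64").get_data()
#
# The use_any_book property accepts the first matching record regardless of apparat:
#
#     raw = WebRequest().use_any_book.get_ppn("CU 8500 K64").get_data()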


class BibTextTransformer:
    """Transforms data from the web into a BibText format.
        Valid Modes are ARRAY, COinS, BibTeX, RIS, RDS
    Raises:
        ValueError: Raised if mode is not in valid_modes
    """

    valid_modes = ["ARRAY", "COinS", "BibTeX", "RIS", "RDS"]

    def __init__(self, mode: str = "ARRAY") -> None:
        self.mode = mode
        self.field = None
        self.signature = None
        if mode not in self.valid_modes:
            log.error(f"Mode {mode} not valid")
            raise ValueError(f"Mode {mode} not valid")
        self.data = None
        # self.bookdata = BookData(**self.data)

    def use_signature(self, signature: str):
        """use the exact signature to search for the book"""
        self.signature = signature
        return self

    def get_data(self, data: Optional[list[str]] = None) -> "BibTextTransformer":
        RIS_IDENT = "TY  -"
        ARRAY_IDENT = "[kid]"
        COinS_IDENT = "ctx_ver"
        BIBTEX_IDENT = "@book"
        RDS_IDENT = "RDS ---------------------------------- "

        if data is None:
            self.data = None
            return self

        if self.mode == "RIS":
            for line in data:
                if RIS_IDENT in line:
                    self.data = line
        elif self.mode == "ARRAY":
            for line in data:
                if ARRAY_IDENT in line:
                    self.data = line
        elif self.mode == "COinS":
            for line in data:
                if COinS_IDENT in line:
                    self.data = line
        elif self.mode == "BibTeX":
            for line in data:
                if BIBTEX_IDENT in line:
                    self.data = line
        elif self.mode == "RDS":
            for line in data:
                if RDS_IDENT in line:
                    self.data = line
        return self

    def return_data(
        self, option: Any = None
    ) -> Union[
        BookData,
        RDS_GENERIC_DATA,
        RDS_AVAIL_DATA,
        dict[str, Union[RDS_AVAIL_DATA, RDS_GENERIC_DATA]],
        None,
    ]:
        """Return Data to caller.

        Args:
            option (str, optional): Option for RDS, as there are two filetypes.
                Use rds_availability or rds_data; anything else gives a dict of
                both responses. Defaults to None.

        Returns:
            BookData: a dataclass containing data about the book
        """
        if self.data is None:
            return None
        match self.mode:
            case "ARRAY":
                return ARRAYData(self.signature).transform(self.data)
            case "COinS":
                return COinSData().transform(self.data)
            case "BibTeX":
                return BibTeXData().transform(self.data)
            case "RIS":
                return RISData().transform(self.data)
            case "RDS":
                return RDSData().transform(self.data).return_data(option)
            case _:
                return None

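# Usage sketch (placeholder apparat number and signature): feed the raw <pre> blocks
# collected by WebRequest.get_data() into a transformer. In RDS mode, return_data()
# accepts the option described in its docstring ("rds_availability", "rds_data",
# or anything else for a dict containing both).
#
#     raw = WebRequest().set_apparat(3).get_ppn("CU 8500 K64").get_data()
#     book = BibTextTransformer("ARRAY").use_signature("CU 8500 K64").get_data(raw).return_data()
#     avail = BibTextTransformer("RDS").get_data(raw).return_data("rds_availability")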


def cover(isbn):
    test_url = f"https://www.buchhandel.de/cover/{isbn}/{isbn}-cover-m.jpg"
    # log.debug(test_url)
    data = requests.get(test_url, stream=True)
    return data.content
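
# Example (hypothetical ISBN): cover() returns the raw JPEG bytes, which can be
# written straight to a file.
#
#     with open("cover.jpg", "wb") as fh:
#         fh.write(cover("9783406747106"))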


def get_content(soup, css_class):
    return soup.find("div", class_=css_class).text.strip()


if __name__ == "__main__":
    # log.debug("main")
    link = "CU 8500 K64"
    data = WebRequest(71).get_ppn(link).get_data()
    bib = BibTextTransformer("ARRAY").get_data().return_data()
    log.debug(bib)