from typing import Any, Optional, Union

import requests
from bs4 import BeautifulSoup
from ratelimit import limits, sleep_and_retry

from src.logic.dataclass import BookData
from src.shared.logging import log
from src.transformers import ARRAYData, BibTeXData, COinSData, RDSData, RISData
from src.transformers.transformers import RDS_AVAIL_DATA, RDS_GENERIC_DATA

# Detail page for a single record, keyed by its PPN / record id.
API_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{}/"
# Pre-built advanced-search URL; only the `si` (signature) field is filled in
# via the single `{}` placeholder.
PPN_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
BASE = "https://rds.ibs-bw.de"

# CSS class names used on the RDS detail pages.
SIGNATURE = "RDS_SIGNATURE"
EDITION = "RDS_EDITION"
ISBN = "RDS_ISBN"
AUTHOR = "RDS_PERSON"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
(HTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Accept-Language": "en-US, en;q=0.5",
}

# At most RATE_LIMIT requests per RATE_PERIOD seconds (enforced by ratelimit).
RATE_LIMIT = 20
RATE_PERIOD = 30


class WebRequest:
    """Scrape the RDS OPAC: search for a signature/PPN and collect the
    raw `<pre>` export blocks (RIS/BibTeX/ARRAY/...) from matching pages."""

    def __init__(self, apparat: Optional[int] = None) -> None:
        """Request data from the web, and format it depending on the mode.

        Args:
            apparat: optional Semesterapparat number; passing it here is
                equivalent to calling :meth:`set_apparat` afterwards.
        """
        self.apparat = None
        self.use_any = False  # use any book that matches the search term
        self.signature = None
        self.ppn = None
        self.data = None
        self.timeout = 5  # seconds, applied to every outgoing request
        if apparat is not None:
            # Backward-compatible convenience so callers can write
            # WebRequest(71) instead of WebRequest().set_apparat(71).
            self.set_apparat(apparat)
        log.info("Initialized WebRequest")

    @property
    def use_any_book(self):
        """use any book that matches the search term"""
        self.use_any = True
        log.info("Using any book")
        return self

    def set_apparat(self, apparat: int):
        """Remember the Semesterapparat number, zero-padded to two digits."""
        self.apparat = apparat
        if int(self.apparat) < 10:
            self.apparat = f"0{self.apparat}"
        log.info(f"Set apparat to {self.apparat}")
        return self

    def get_ppn(self, signature: str):
        """Store *signature* and derive the search term for the OPAC.

        '+' is URL-encoded and a DOI link is reduced to its last path
        segment before being stored as the search term.
        """
        self.signature = signature
        if "+" in signature:
            signature = signature.replace("+", "%2B")
        if "doi.org" in signature:
            signature = signature.split("/")[-1]
        self.ppn = signature
        return self

    @sleep_and_retry
    @limits(calls=RATE_LIMIT, period=RATE_PERIOD)
    def search_book(self, searchterm: str) -> str:
        """Run the signature search and return the result page's HTML."""
        response = requests.get(PPN_URL.format(searchterm), timeout=self.timeout)
        return response.text

    @sleep_and_retry
    @limits(calls=RATE_LIMIT, period=RATE_PERIOD)
    def search_ppn(self, ppn: str) -> str:
        """Fetch the record page for *ppn* and return its HTML."""
        response = requests.get(API_URL.format(ppn), timeout=self.timeout)
        return response.text

    def get_book_links(self, searchterm: str) -> list[str]:
        """Return absolute links to all full-record pages for *searchterm*."""
        response: str = self.search_book(searchterm)  # type:ignore
        soup = BeautifulSoup(response, "html.parser")
        links = soup.find_all("a", class_="title getFull")
        return [BASE + link["href"] for link in links]

    @sleep_and_retry
    @limits(calls=RATE_LIMIT, period=RATE_PERIOD)
    def search(self, link: str):
        """GET *link* and return its HTML, or None if the request failed."""
        try:
            response = requests.get(link, timeout=self.timeout)
            return response.text
        except requests.exceptions.RequestException as e:
            log.error(f"Request failed: {e}")
            return None

    @staticmethod
    def _collect_pre_text(soup) -> list[str]:
        """Extract the stripped text of every <pre> tag on a detail page."""
        return [tag.text.strip() for tag in soup.find_all("pre")]

    def get_data(self) -> Union[list[str], None]:
        """Collect the <pre> export blocks of the first hit that sits in
        the wanted Semesterapparat (or of any hit when use_any is set).

        Returns:
            list[str]: one entry per <pre> tag; empty if nothing matched.

        Raises:
            ValueError: in use_any mode when a hit page has no <pre> tag.
        """
        links = self.get_book_links(self.ppn)
        log.debug(f"Links: {links}")
        return_data: list[str] = []
        for link in links:
            result = self.search(link)
            if result is None:
                # Request failed (search() already logged it); try next hit.
                continue
            soup = BeautifulSoup(result, "html.parser")
            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            for location in locations:
                if "1. OG Semesterapparat" in location.text:
                    log.success("Found Semesterapparat, adding entry")
                    return_data = self._collect_pre_text(soup)
                    if not return_data:
                        log.error("No tag found")
                    return return_data
                item_location = location.find(
                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                ).text.strip()
                log.debug(f"Item location: {item_location}")
                if self.use_any:
                    return_data = self._collect_pre_text(soup)
                    if not return_data:
                        log.error("No tag found")
                        raise ValueError("No tag found")
                    return return_data
                if f"Semesterapparat-{self.apparat}" in item_location:
                    return_data = self._collect_pre_text(soup)
                    if not return_data:
                        log.error("No tag found")
                    return return_data
                log.error(
                    f"Signature {self.signature} not found in {item_location}"
                )
        return return_data
def get_data_elsa(self):
links = self.get_book_links(self.ppn)
for link in links:
result = self.search(link)
# in result search for class col-xs-12 rds-dl RDS_LOCATION
# if found, return text of href
soup = BeautifulSoup(result, "html.parser")
locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
if locations:
for _ in locations:
pre_tag = soup.find_all("pre")
return_data = []
if pre_tag:
for tag in pre_tag:
data = tag.text.strip()
return_data.append(data)
return return_data
else:
log.error("No tag found")
return return_data
class BibTextTransformer:
"""Transforms data from the web into a BibText format.
Valid Modes are ARRAY, COinS, BibTeX, RIS, RDS
Raises:
ValueError: Raised if mode is not in valid_modes
"""
valid_modes = ["ARRAY", "COinS", "BibTeX", "RIS", "RDS"]
def __init__(self, mode: str = "ARRAY") -> None:
self.mode = mode
self.field = None
self.signature = None
if mode not in self.valid_modes:
log.error(f"Mode {mode} not valid")
raise ValueError(f"Mode {mode} not valid")
self.data = None
# self.bookdata = BookData(**self.data)
def use_signature(self, signature: str):
"""use the exact signature to search for the book"""
self.signature = signature
return self
def get_data(self, data: Union[list[str]] = None) -> "BibTextTransformer":
RIS_IDENT = "TY -"
ARRAY_IDENT = "[kid]"
COinS_IDENT = "ctx_ver"
BIBTEX_IDENT = "@book"
RDS_IDENT = "RDS ---------------------------------- "
if data is None:
self.data = None
return self
if self.mode == "RIS":
for line in data:
if RIS_IDENT in line:
self.data = line
elif self.mode == "ARRAY":
for line in data:
if ARRAY_IDENT in line:
self.data = line
elif self.mode == "COinS":
for line in data:
if COinS_IDENT in line:
self.data = line
elif self.mode == "BibTeX":
for line in data:
if BIBTEX_IDENT in line:
self.data = line
elif self.mode == "RDS":
for line in data:
if RDS_IDENT in line:
self.data = line
return self
def return_data(
self, option: Any = None
) -> Union[
Optional[BookData],
Optional[RDS_GENERIC_DATA],
Optional[RDS_AVAIL_DATA],
None,
dict[str, Union[RDS_AVAIL_DATA, RDS_GENERIC_DATA]],
]:
"""Return Data to caller.
Args:
option (string, optional): Option for RDS as there are two filetypes. Use rds_availability or rds_data. Anything else gives a dict of both responses. Defaults to None.
Returns:
BookData: a dataclass containing data about the book
"""
if self.data is None:
return None
match self.mode:
case "ARRAY":
return ARRAYData(self.signature).transform(self.data)
case "COinS":
return COinSData().transform(self.data)
case "BibTeX":
return BibTeXData().transform(self.data)
case "RIS":
return RISData().transform(self.data)
case "RDS":
return RDSData().transform(self.data).return_data(option)
case _:
return None
# if self.mode == "ARRAY":
# return ARRAYData().transform(self.data)
# elif self.mode == "COinS":
# return COinSData().transform(self.data)
# elif self.mode == "BibTeX":
# return BibTeXData().transform(self.data)
# elif self.mode == "RIS":
# return RISData().transform(self.data)
# elif self.mode == "RDS":
# return RDSData().transform(self.data).return_data(option)
def cover(isbn):
    """Fetch the medium-size cover image for *isbn* from buchhandel.de.

    Args:
        isbn: the book's ISBN, used twice in the cover URL.

    Returns:
        bytes: raw response body (NOTE(review): the status code is not
        checked, so a missing cover returns the error page's body).
    """
    test_url = f"https://www.buchhandel.de/cover/{isbn}/{isbn}-cover-m.jpg"
    # Timeout added for consistency with the rest of the module; without it
    # a hung connection would block forever.
    data = requests.get(test_url, stream=True, timeout=10)
    return data.content
def get_content(soup, css_class):
    """Return the stripped text of the first <div> carrying *css_class*."""
    div = soup.find("div", class_=css_class)
    return div.text.strip()
if __name__ == "__main__":
    # Smoke test: look up a signature in Semesterapparat 71 and print the
    # parsed ARRAY record.
    signature = "CU 8500 K64"
    # Original called WebRequest(71), which raised TypeError (no such
    # parameter), and BibTextTransformer(...).get_data() without the scraped
    # data, which always yielded None.
    data = WebRequest().set_apparat(71).get_ppn(signature).get_data()
    bib = BibTextTransformer("ARRAY").get_data(data).return_data()
    log.debug(bib)