Files
SemesterapparatsManager/src/logic/webrequest.py

315 lines
12 KiB
Python

from enum import Enum
from typing import Any, Optional, Union
import requests
from bs4 import BeautifulSoup
# import sleep_and_retry decorator to retry requests
from ratelimit import limits, sleep_and_retry
from src.logic.dataclass import BookData
from src.shared.logging import log
from src.transformers import ARRAYData, BibTeXData, COinSData, RDSData, RISData
from src.transformers.transformers import RDS_AVAIL_DATA, RDS_GENERIC_DATA
# logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")
API_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{}/"
PPN_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
BASE = "https://rds.ibs-bw.de"
#
TITLE = "RDS_TITLE"
SIGNATURE = "RDS_SIGNATURE"
EDITION = "RDS_EDITION"
ISBN = "RDS_ISBN"
AUTHOR = "RDS_PERSON"
HEADERS = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 \
(HTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
"Accept-Language": "en-US, en;q=0.5",
}
RATE_LIMIT = 20
RATE_PERIOD = 30
class TransformerType(Enum):
ARRAY = "ARRAY"
COinS = "COinS"
BibTeX = "BibTeX"
RIS = "RIS"
RDS = "RDS"
class WebRequest:
def __init__(self) -> None:
"""Request data from the web, and format it depending on the mode."""
self.apparat = None
self.use_any = False # use any book that matches the search term
self.signature = None
self.ppn = None
self.data = None
self.timeout = 5
log.info("Initialized WebRequest")
@property
def use_any_book(self):
"""use any book that matches the search term"""
self.use_any = True
log.info("Using any book")
return self
def set_apparat(self, apparat: int) -> "WebRequest":
self.apparat = apparat
if int(self.apparat) < 10:
self.apparat = f"0{self.apparat}"
log.info(f"Set apparat to {self.apparat}")
return self
def get_ppn(self, signature: str) -> "WebRequest":
self.signature = signature
if "+" in signature:
signature = signature.replace("+", "%2B")
if "doi.org" in signature:
signature = signature.split("/")[-1]
self.ppn = signature
return self
@sleep_and_retry
@limits(calls=RATE_LIMIT, period=RATE_PERIOD)
def search_book(self, searchterm: str) -> str:
response = requests.get(PPN_URL.format(searchterm), timeout=self.timeout)
return response.text
@sleep_and_retry
@limits(calls=RATE_LIMIT, period=RATE_PERIOD)
def search_ppn(self, ppn: str) -> str:
response = requests.get(API_URL.format(ppn), timeout=self.timeout)
return response.text
def get_book_links(self, searchterm: str) -> list[str]:
response: str = self.search_book(searchterm) # type:ignore
soup = BeautifulSoup(response, "html.parser")
links = soup.find_all("a", class_="title getFull")
res: list[str] = []
for link in links:
res.append(BASE + link["href"])
return res
@sleep_and_retry
@limits(calls=RATE_LIMIT, period=RATE_PERIOD)
def search(self, link: str) -> Optional[str]:
try:
response = requests.get(link, timeout=self.timeout)
return response.text
except requests.exceptions.RequestException as e:
log.error(f"Request failed: {e}")
return None
def get_data(self) -> Optional[list[str]]:
links = self.get_book_links(self.ppn)
log.debug(f"Links: {links}")
return_data: list[str] = []
for link in links:
result: str = self.search(link) # type:ignore
# in result search for class col-xs-12 rds-dl RDS_LOCATION
# if found, return text of href
soup = BeautifulSoup(result, "html.parser")
locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
if locations:
for location in locations:
if "1. OG Semesterapparat" in location.text:
log.success("Found Semesterapparat, adding entry")
pre_tag = soup.find_all("pre")
return_data = []
if pre_tag:
for tag in pre_tag:
data = tag.text.strip()
return_data.append(data)
return return_data
else:
log.error("No <pre> tag found")
return return_data
else:
item_location = location.find(
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
).text.strip()
log.debug(f"Item location: {item_location}")
if self.use_any:
pre_tag = soup.find_all("pre")
if pre_tag:
for tag in pre_tag:
data = tag.text.strip()
return_data.append(data)
return return_data
else:
log.error("No <pre> tag found")
raise ValueError("No <pre> tag found")
elif f"Semesterapparat-{self.apparat}" in item_location:
pre_tag = soup.find_all("pre")
return_data = []
if pre_tag:
for tag in pre_tag:
data = tag.text.strip()
return_data.append(data)
return return_data
else:
log.error("No <pre> tag found")
return return_data
else:
log.error(
f"Signature {self.signature} not found in {item_location}"
)
# return_data = []
return return_data
def get_data_elsa(self) -> Optional[list[str]]:
links = self.get_book_links(self.ppn)
for link in links:
result = self.search(link)
# in result search for class col-xs-12 rds-dl RDS_LOCATION
# if found, return text of href
soup = BeautifulSoup(result, "html.parser")
locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
if locations:
for _ in locations:
pre_tag = soup.find_all("pre")
return_data = []
if pre_tag:
for tag in pre_tag:
data = tag.text.strip()
return_data.append(data)
return return_data
else:
log.error("No <pre> tag found")
return return_data
class BibTextTransformer:
"""Transforms data from the web into a BibText format.
Valid Modes are ARRAY, COinS, BibTeX, RIS, RDS
Raises:
ValueError: Raised if mode is not in valid_modes
"""
valid_modes = [
TransformerType.ARRAY,
TransformerType.COinS,
TransformerType.BibTeX,
TransformerType.RIS,
TransformerType.RDS,
]
def __init__(self, mode: TransformerType = TransformerType.ARRAY) -> None:
self.mode = mode.value
self.field = None
self.signature = None
if mode not in self.valid_modes:
log.error(f"Mode {mode} not valid")
raise ValueError(f"Mode {mode} not valid")
self.data = None
# self.bookdata = BookData(**self.data)
def use_signature(self, signature: str) -> "BibTextTransformer":
"""use the exact signature to search for the book"""
self.signature = signature
return self
def get_data(self, data: Optional[list[str]] = None) -> "BibTextTransformer":
RIS_IDENT = "TY -"
ARRAY_IDENT = "[kid]"
COinS_IDENT = "ctx_ver"
BIBTEX_IDENT = "@book"
RDS_IDENT = "RDS ---------------------------------- "
if data is None:
self.data = None
return self
if self.mode == "RIS":
for line in data:
if RIS_IDENT in line:
self.data = line
elif self.mode == "ARRAY":
for line in data:
if ARRAY_IDENT in line:
self.data = line
elif self.mode == "COinS":
for line in data:
if COinS_IDENT in line:
self.data = line
elif self.mode == "BibTeX":
for line in data:
if BIBTEX_IDENT in line:
self.data = line
elif self.mode == "RDS":
for line in data:
if RDS_IDENT in line:
self.data = line
return self
def return_data(
self, option: Any = None
) -> Union[
Optional[BookData],
Optional[RDS_GENERIC_DATA],
Optional[RDS_AVAIL_DATA],
None,
dict[str, Union[RDS_AVAIL_DATA, RDS_GENERIC_DATA]],
]:
"""Return Data to caller.
Args:
option (string, optional): Option for RDS as there are two filetypes. Use rds_availability or rds_data. Anything else gives a dict of both responses. Defaults to None.
Returns:
BookData: a dataclass containing data about the book
"""
if self.data is None:
return None
match self.mode:
case "ARRAY":
return ARRAYData(self.signature).transform(self.data)
case "COinS":
return COinSData().transform(self.data)
case "BibTeX":
return BibTeXData().transform(self.data)
case "RIS":
return RISData().transform(self.data)
case "RDS":
return RDSData().transform(self.data).return_data(option)
case _:
return None
# if self.mode == "ARRAY":
# return ARRAYData().transform(self.data)
# elif self.mode == "COinS":
# return COinSData().transform(self.data)
# elif self.mode == "BibTeX":
# return BibTeXData().transform(self.data)
# elif self.mode == "RIS":
# return RISData().transform(self.data)
# elif self.mode == "RDS":
# return RDSData().transform(self.data).return_data(option)
def cover(isbn):
test_url = f"https://www.buchhandel.de/cover/{isbn}/{isbn}-cover-m.jpg"
# log.debug(test_url)
data = requests.get(test_url, stream=True)
return data.content
def get_content(soup, css_class):
return soup.find("div", class_=css_class).text.strip()
if __name__ == "__main__":
# log.debug("main")
link = "CU 8500 K64"
data = WebRequest(71).get_ppn(link).get_data()
bib = BibTextTransformer("ARRAY").get_data().return_data()
log.debug(bib)