# SemesterapparatsManager/src/logic/webrequest.py
import sys
from typing import Any, Union

import requests
from bs4 import BeautifulSoup
# sleep_and_retry makes rate-limited calls wait and retry instead of raising
from ratelimit import limits, sleep_and_retry
from loguru import logger

from src.logic.dataclass import BookData
from src.transformers import ARRAYData, BibTeXData, COinSData, RDSData, RISData

# route log output to rotating files and stdout
logger.remove()
logger.add("logs/application.log", rotation="1 week", enqueue=True)
logger.add(
    "logs/webrequest.log",
    rotation="1 day",
    compression="zip",
)
# logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")
logger.add(sys.stdout)
API_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{}/"
PPN_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
BASE = "https://rds.ibs-bw.de"
# CSS classes of the bibliographic fields on the RDS detail page
TITLE = "RDS_TITLE"
SIGNATURE = "RDS_SIGNATURE"
EDITION = "RDS_EDITION"
ISBN = "RDS_ISBN"
AUTHOR = "RDS_PERSON"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36",
    "Accept-Language": "en-US, en;q=0.5",
}
# at most RATE_LIMIT requests per RATE_PERIOD seconds
RATE_LIMIT = 20
RATE_PERIOD = 30


class WebRequest:
    def __init__(self) -> None:
        """Request data from the web, and format it depending on the mode."""
        self.apparat = None
        self.use_any = False  # use any book that matches the search term
        self.signature = None
        self.ppn = None
        self.data = None
        self.timeout = 5
        logger.info("Initialized WebRequest")

    @property
    def use_any_book(self):
        """Use any book that matches the search term, regardless of location."""
        self.use_any = True
        logger.info("Using any book")
        return self

    def set_apparat(self, apparat: int):
        """Set the Semesterapparat number, zero-padded to two digits."""
        self.apparat = apparat
        if int(self.apparat) < 10:
            self.apparat = f"0{self.apparat}"
        logger.info(f"Set apparat to {self.apparat}")
        return self

    def get_ppn(self, signature: str):
        """Normalise the search term: URL-encode '+' and reduce a DOI URL to its suffix."""
        self.signature = signature
        if "+" in signature:
            signature = signature.replace("+", "%2B")
        if "doi.org" in signature:
            signature = signature.split("/")[-1]
        self.ppn = signature
        return self

    @sleep_and_retry
    @limits(calls=RATE_LIMIT, period=RATE_PERIOD)
    def search_book(self, searchterm: str) -> str:
        """Query the OPAC signature search and return the raw HTML of the result page."""
        response = requests.get(PPN_URL.format(searchterm), timeout=self.timeout)
        return response.text

    def get_book_links(self, searchterm: str) -> list[str]:
        response: str = self.search_book(searchterm)  # type:ignore
        soup = BeautifulSoup(response, "html.parser")
        links = soup.find_all("a", class_="title getFull")
        res: list[str] = []
        for link in links:
            res.append(BASE + link["href"])
        return res

    @sleep_and_retry
    @limits(calls=RATE_LIMIT, period=RATE_PERIOD)
    def search(self, link: str):
        try:
            response = requests.get(link, timeout=self.timeout)
            return response.text
        except requests.exceptions.RequestException as e:
            logger.error(f"Request failed: {e}")
            return None

    def get_data(self) -> Union[list[str], None]:
        """Return the text of every <pre> export block for a matching copy, or None."""
        links = self.get_book_links(self.ppn)
        logger.debug(f"Links: {links}")
        for link in links:
            result = self.search(link)
            if result is None:
                # request failed, try the next result link
                continue
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")
            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            if locations:
                for location in locations:
                    item_location = location.find(
                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                    ).text.strip()
                    if self.use_any:
                        pre_tag = soup.find_all("pre")
                        return_data: list[str] = []
                        if pre_tag:
                            for tag in pre_tag:
                                data = tag.text.strip()
                                return_data.append(data)
                            return return_data
                        else:
                            logger.error("No <pre> tag found")
                            raise ValueError("No <pre> tag found")
                    elif f"Semesterapparat-{self.apparat}" in item_location:
                        pre_tag = soup.find_all("pre")
                        return_data = []
                        if pre_tag:
                            for tag in pre_tag:
                                data = tag.text.strip()
                                return_data.append(data)
                            return return_data
                        else:
                            logger.error("No <pre> tag found")
                            return return_data
                    else:
                        logger.error(
                            f"Signature {self.signature} not found in {item_location}"
                        )
                        return_data = []
                        return return_data

    def get_data_elsa(self):
        """Return the <pre> export blocks of the first result that has location data."""
        links = self.get_book_links(self.ppn)
        for link in links:
            result = self.search(link)
            if result is None:
                # request failed, try the next result link
                continue
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")
            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            if locations:
                for _ in locations:
                    pre_tag = soup.find_all("pre")
                    return_data = []
                    if pre_tag:
                        for tag in pre_tag:
                            data = tag.text.strip()
                            return_data.append(data)
                        return return_data
                    else:
                        logger.error("No <pre> tag found")
                        return return_data
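
# Usage sketch (values are illustrative, not from a real Semesterapparat): the fluent
# API chains configuration calls and then fetches the raw <pre> export blocks.
#
#   blocks = WebRequest().set_apparat(7).get_ppn("CU 8500 K64").get_data()
#   any_blocks = WebRequest().use_any_book.get_ppn("CU 8500 K64").get_data()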


class BibTextTransformer:
    """Transforms data from the web into a BibText format.

    Valid Modes are ARRAY, COinS, BibTeX, RIS, RDS.

    Raises:
        ValueError: Raised if mode is not in valid_modes
    """

    valid_modes = ["ARRAY", "COinS", "BibTeX", "RIS", "RDS"]

    def __init__(self, mode: str = "ARRAY") -> None:
        self.mode = mode
        self.field = None
        self.signature = None
        if mode not in self.valid_modes:
            logger.error(f"Mode {mode} not valid")
            raise ValueError(f"Mode {mode} not valid")
        self.data = None
        # self.bookdata = BookData(**self.data)

    def use_signature(self, signature: str):
        """Use the exact signature to search for the book."""
        self.signature = signature
        return self

    def get_data(self, data: Union[list[str], None] = None) -> "BibTextTransformer":
        """Keep only the export block that matches the configured mode."""
        RIS_IDENT = "TY -"
        ARRAY_IDENT = "[kid]"
        COinS_IDENT = "ctx_ver"
        BIBTEX_IDENT = "@book"
        RDS_IDENT = "RDS ---------------------------------- "
        if data is None:
            self.data = None
            return self
        if self.mode == "RIS":
            for line in data:
                if RIS_IDENT in line:
                    self.data = line
        elif self.mode == "ARRAY":
            for line in data:
                if ARRAY_IDENT in line:
                    self.data = line
        elif self.mode == "COinS":
            for line in data:
                if COinS_IDENT in line:
                    self.data = line
        elif self.mode == "BibTeX":
            for line in data:
                if BIBTEX_IDENT in line:
                    self.data = line
        elif self.mode == "RDS":
            for line in data:
                if RDS_IDENT in line:
                    self.data = line
        return self
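
    # Note (illustrative): each entry of `data` is the text of one <pre> export block
    # scraped by WebRequest, e.g. BibTextTransformer("BibTeX").get_data(blocks) keeps
    # the block containing "@book", if one is present.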

    def return_data(self, option: Any = None) -> Union[BookData, None]:
        """Return Data to caller.

        Args:
            option (string, optional): Option for RDS as there are two filetypes.
                Use rds_availability or rds_data. Anything else gives a dict of both
                responses. Defaults to None.

        Returns:
            BookData: a dataclass containing data about the book
        """
        if self.data is None:
            return None
        match self.mode:
            case "ARRAY":
                return ARRAYData(self.signature).transform(self.data)
            case "COinS":
                return COinSData().transform(self.data)
            case "BibTeX":
                return BibTeXData().transform(self.data)
            case "RIS":
                return RISData().transform(self.data)
            case "RDS":
                return RDSData().transform(self.data).return_data(option)
            case _:
                return None
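
# RDS mode carries two payload types; the `option` argument of return_data selects one
# (illustrative calls, `blocks` stands for the list returned by WebRequest.get_data):
#
#   BibTextTransformer("RDS").get_data(blocks).return_data("rds_data")
#   BibTextTransformer("RDS").get_data(blocks).return_data("rds_availability")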


def cover(isbn):
    """Fetch the cover image for an ISBN from buchhandel.de."""
    test_url = f"https://www.buchhandel.de/cover/{isbn}/{isbn}-cover-m.jpg"
    # logger.debug(test_url)
    data = requests.get(test_url, stream=True)
    return data.content


def get_content(soup, css_class):
    """Return the stripped text of the first <div> with the given CSS class."""
    return soup.find("div", class_=css_class).text.strip()


if __name__ == "__main__":
    # logger.debug("main")
    link = "CU 8500 K64"
    data = WebRequest().set_apparat(71).get_ppn(link).get_data()
    bib = BibTextTransformer("ARRAY").use_signature(link).get_data(data).return_data()
    logger.debug(bib)