diff --git a/src/backend/catalogue.py b/src/backend/catalogue.py
new file mode 100644
index 0000000..4f72ec1
--- /dev/null
+++ b/src/backend/catalogue.py
@@ -0,0 +1,112 @@
+import sys
+from datetime import datetime
+from urllib.parse import quote_plus
+
+import loguru
+import requests
+from bs4 import BeautifulSoup
+
+from src import LOG_DIR
+from src.logic import BookData as Book
+
+# RDS OPAC search URL; {} is replaced with the URL-encoded search term.
+URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
+BASE = "https://rds.ibs-bw.de"
+
+log = loguru.logger
+log.remove()
+log.add(sys.stdout, level="INFO")
+log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
+log.add(
+    f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
+    rotation="1 day",
+    retention="1 month",
+)
+
+
+class Catalogue:
+    def __init__(self, timeout=5):
+        self.timeout = timeout
+        if not self.check_connection():
+            log.error("No internet connection available.")
+            raise ConnectionError("No internet connection available.")
+
+    def check_connection(self) -> bool:
+        """Probe a well-known host to verify that the network is reachable."""
+        try:
+            response = requests.get("https://www.google.com", timeout=self.timeout)
+            return response.status_code == 200
+        except requests.exceptions.RequestException as e:
+            log.error(f"Could not connect to google.com: {e}")
+            return False
+
+    def search_book(self, searchterm: str) -> str:
+        """Run a catalogue search and return the raw result-list HTML."""
+        response = requests.get(URL.format(quote_plus(searchterm)), timeout=self.timeout)
+        return response.text
+
+    def search(self, link: str) -> str:
+        """Fetch a single record page and return its HTML."""
+        response = requests.get(link, timeout=self.timeout)
+        return response.text
+
+    def get_book_links(self, searchterm: str) -> list:
+        """Collect the absolute URLs of all records in the result list."""
+        response = self.search_book(searchterm)
+        soup = BeautifulSoup(response, "html.parser")
+        links = soup.find_all("a", class_="title getFull")
+        return [BASE + link["href"] for link in links]
+
+    def get_book(self, searchterm: str):
+        """Return the first matching book held in the Semesterapparat, else False."""
+        log.info(f"Searching for term: {searchterm}")
+        for link in self.get_book_links(searchterm):
+            soup = BeautifulSoup(self.search(link), "html.parser")
+            # Only records shelved in the Semesterapparat on the first
+            # floor ("1. OG") are of interest.
+            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
+            for loc in locations:
+                if "1. OG Semesterapparat" not in loc.text:
+                    continue
+                headline = soup.find("div", class_="headline text")
+                title = headline.text.replace("\n", "").strip() if headline else None
+                # The PPN value sits in the div directly after its header div.
+                ppn = soup.find(
+                    "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
+                )
+                if ppn:
+                    ppn = ppn.find_next("div").text.replace("\n", "").strip()
+                else:
+                    ppn = None
+                # Signature and ISBN values sit two divs after their header div.
+                signature = soup.find("div", class_="col-xs-12 rds-dl RDS_SIGNATURE")
+                if signature:
+                    signature = (
+                        signature.find_next("div")
+                        .find_next("div")
+                        .text.replace("\n", "")
+                        .strip()
+                    )
+                isbn = soup.find(
+                    "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN"
+                )
+                if isbn:
+                    isbn = isbn.find_next("div").find_next("div").text.strip()
+                else:
+                    isbn = None
+                return Book(
+                    title=title, ppn=ppn, signature=signature, isbn=isbn, link=link
+                )
+        return False
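+
+
+if __name__ == "__main__":
+    # Minimal illustrative smoke test showing how Catalogue is meant to
+    # be driven end to end; the search term "Goethe" is an arbitrary
+    # placeholder, not a term the catalogue is guaranteed to match.
+    catalogue = Catalogue()
+    book = catalogue.get_book("Goethe")
+    if book:
+        log.info(f"Found: {book}")
+    else:
+        log.info("No Semesterapparat copy found.")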