"""Query the library web catalogue (OPAC) and return hits as ``Book`` objects."""

from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

from src import config
from src.schemas import Book

URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?lookfor={}+&type=AllFields&limit=10&sort=py+desc%2C+title"
BASE = "https://rds.ibs-bw.de"


class Catalogue:
    """Small scraping client for the catalogue's search and record pages."""

    def __init__(self, timeout=5):
        """Remember the timeout that is handed to every ``requests.get`` call."""
        self.timeout = timeout

    def search_book(self, searchterm: str) -> str:
        """Run a catalogue search and return the result page's HTML.

        The term is URL-encoded before it is inserted into ``URL`` so that
        characters such as ``&``, ``#``, ``+`` or ``%`` are searched for
        literally instead of altering the query string.
        """
        # str() keeps non-string terms (e.g. a numeric ISBN) working, as they
        # did with plain str.format().
        query = quote_plus(str(searchterm))
        response = requests.get(URL.format(query), timeout=self.timeout)
        return response.text

    def search(self, link: str) -> str:
        """Fetch *link* and return the response body as text."""
        response = requests.get(link, timeout=self.timeout)
        return response.text

    def get_book_links(self, searchterm: str) -> list:
        """Return absolute URLs of the record pages listed for *searchterm*.

        Only ``<a class="title getFull">`` anchors that actually carry an
        ``href`` attribute are considered; the list is empty when the result
        page contains none.
        """
        soup = BeautifulSoup(self.search_book(searchterm), "html.parser")
        anchors = soup.find_all("a", class_="title getFull", href=True)
        return [BASE + anchor["href"] for anchor in anchors]

    @staticmethod
    def _following_div(tag, hops):
        """Apply ``find_next("div")`` *hops* times; ``None` if the chain ends."""
        for _ in range(hops):
            if tag is None:
                return None
            tag = tag.find_next("div")
        return tag

    @staticmethod
    def _clean_text(tag):
        """Return *tag*'s text without newlines and outer whitespace, or ``None``."""
        if tag is None:
            return None
        return tag.text.replace("\n", "").strip()

    def get_book(self, searchterm: str):
        """Return the first search hit that is located at the configured institution.

        Each record page found for *searchterm* is fetched in order. A record
        qualifies when its ``RDS_LOCATION`` block mentions
        ``config.institution_name``.

        Returns:
            A ``Book`` built from the record's title, PPN, signature, ISBN and
            the record URL, or ``False`` when no record qualifies. Any field
            whose markup is missing from the page is ``None``. All text fields
            (including the ISBN) have newlines removed and are stripped.
        """
        for link in self.get_book_links(searchterm):
            soup = BeautifulSoup(self.search(link), "html.parser")

            location = soup.find("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            if location is None or config.institution_name not in location.text:
                continue

            title = self._clean_text(soup.find("div", class_="headline text"))

            # The PPN value is in the div right after its label ...
            ppn_label = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
            )
            ppn = self._clean_text(self._following_div(ppn_label, 1))

            # ... while signature and ISBN values are two divs further on.
            signature_block = soup.find(
                "div", class_="col-xs-12 rds-dl RDS_SIGNATURE"
            )
            signature = self._clean_text(self._following_div(signature_block, 2))

            isbn_label = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN"
            )
            isbn = self._clean_text(self._following_div(isbn_label, 2))

            return Book(
                title=title, ppn=ppn, signature=signature, isbn=isbn, link=link
            )
        return False