From ee8862a220a819fa2c18e3d1b54cc09ca3166d8c Mon Sep 17 00:00:00 2001
From: WorldTeacher <41587052+WorldTeacher@users.noreply.github.com>
Date: Fri, 2 Aug 2024 15:22:52 +0200
Subject: [PATCH] add appnr, new url

---
 src/logic/thread_bookgrabber.py   |  5 +-
 src/logic/threads_availchecker.py |  2 +-
 src/logic/webrequest.py           | 96 +++++++++++++++++--------------
 3 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/src/logic/thread_bookgrabber.py b/src/logic/thread_bookgrabber.py
index 38d39cc..2c13874 100644
--- a/src/logic/thread_bookgrabber.py
+++ b/src/logic/thread_bookgrabber.py
@@ -14,7 +14,7 @@ class BookGrabber(QThread):
     updateSignal = Signal(int, int)
     done = Signal()
 
-    def __init__(self):
+    def __init__(self, appnr):
         super(BookGrabber, self).__init__(parent=None)
         self.is_Running = True
         self.logger = MyLogger("Worker")
@@ -24,6 +24,7 @@ class BookGrabber(QThread):
         self.prof_id = None
         self.mode = None
         self.book_id = None
+        self.appnr = appnr
         self.tstate = (self.app_id, self.prof_id, self.mode, self.data)
 
     def add_values(self, app_id, prof_id, mode, data):
@@ -46,7 +47,7 @@ class BookGrabber(QThread):
             signature = str(entry)
             self.logger.log_info("Processing entry: " + signature)
 
-            webdata = WebRequest().get_ppn(entry).get_data()
+            webdata = WebRequest(self.appnr).get_ppn(entry).get_data()
             if webdata == "error":
                 continue
             bd = BibTextTransformer(self.mode).get_data(webdata).return_data()
diff --git a/src/logic/threads_availchecker.py b/src/logic/threads_availchecker.py
index 2dea557..431e3dd 100644
--- a/src/logic/threads_availchecker.py
+++ b/src/logic/threads_availchecker.py
@@ -44,7 +44,7 @@ class AvailChecker(QThread):
         count = 0
         for link in self.links:
             self.logger.log_info("Processing entry: " + str(link))
-            data = WebRequest().get_ppn(link).get_data()
+            data = WebRequest(self.appnumber).get_ppn(link).get_data()
             transformer = BibTextTransformer("RDS")
 
             rds = transformer.get_data(data).return_data("rds_availability")
diff --git a/src/logic/webrequest.py b/src/logic/webrequest.py
index fa47c86..4925d7b 100644
--- a/src/logic/webrequest.py
+++ b/src/logic/webrequest.py
@@ -13,7 +13,9 @@ logger = MyLogger(__name__)
 config = OmegaConf.load("config.yaml")
 
 API_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{}/"
-PPN_URL = 'https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?lookfor="{}"+&type=AllFields&limit=10&sort=py+desc%2C+title'
+PPN_URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
+BASE = "https://rds.ibs-bw.de"
+
 # TITLE = "RDS_TITLE"
 SIGNATURE = "RDS_SIGNATURE"
 EDITION = "RDS_EDITION"
@@ -28,11 +30,15 @@ HEADERS = {
 
 
 class WebRequest:
-    def __init__(self) -> None:
+    def __init__(self, appnr) -> None:
         """Request data from the web, and format it depending on the mode."""
+        self.apparat = appnr
+        if int(self.apparat) < 10:
+            self.apparat = f"0{self.apparat}"
         self.signature = None
         self.ppn = None
         self.data = None
+        self.timeout = 5
         logger.log_info("Initialized WebRequest")
 
     def get_ppn(self, signature):
@@ -41,50 +47,54 @@ class WebRequest:
         signature = signature.replace("+", "%2B")
         if "doi.org" in signature:
             signature = signature.split("/")[-1]
-        url = PPN_URL.format(signature)
-        page = requests.get(url)
-
-        soup = BeautifulSoup(page.content, "html.parser", from_encoding="utf-8")
-        if soup.find("div", class_="media") is None:
-            logger.log_error(f"No data found for {signature}")
-            return self
-        ppn = soup.find("div", class_="media").get("id")
-        self.ppn = ppn
+        self.ppn = signature
         return self
 
-    def get_link_data(self):
-        page = requests.get(PPN_URL.format(self.ppn))
-        soup = BeautifulSoup(page.content, "html.parser")
-        # find div that contains daia_ in the id
-        # find the pre tag in that div
-        # return the text
-        # div = soup.find("div",id=lambda x: x and "daia_" in x)
-        # pre = div.find("pre")
-        return soup
+    def search_book(self, searchterm: str):
+        response = requests.get(PPN_URL.format(searchterm), timeout=self.timeout)
+        return response.text
 
-    def get_data(self) -> list[str] | str:
-        # url = API_URL.format(self.ppn)
-        if self.ppn is None:
-            logger.log_error("No PPN found")
-            return "error"
-        page = requests.get(API_URL.format(self.ppn))
-        logger.log_info(f"Requesting data from {API_URL.format(self.ppn)}")
-        logger.log_info(f"Status code: {page.status_code}")
-        # print(page.content)
-        soup = BeautifulSoup(page.content, "html.parser")
-        pre_tag = soup.find_all("pre")
-        # print(pre_tag)
-        return_data = []
+    def get_book_links(self, searchterm: str):
+        response = self.search_book(searchterm)
+        soup = BeautifulSoup(response, "html.parser")
+        links = soup.find_all("a", class_="title getFull")
+        res = []
+        for link in links:
+            res.append(BASE + link["href"])
+        return res
 
-        if pre_tag:
-            for tag in pre_tag:
-                data = tag.text.strip()
-                return_data.append(data)
-            return return_data
-        else:
-            print("No <pre> tag found")
-            logger.log_error("No <pre> tag found")
-            return return_data
+    def search(self, link: str):
+        response = requests.get(link, timeout=self.timeout)
+        return response.text
+
+    def get_data(
+        self,
+    ):
+        links = self.get_book_links(self.ppn)
+        print(links)
+        for link in links:
+            result = self.search(link)
+            # in result search for class col-xs-12 rds-dl RDS_LOCATION
+            # if found, return text of href
+            soup = BeautifulSoup(result, "html.parser")
+            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
+            if locations:
+                for location in locations:
+                    item_location = location.find(
+                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                    ).text.strip()
+                    print(item_location)
+                    if f"Semesterapparat-{self.apparat}" in item_location:
+                        pre_tag = soup.find_all("pre")
+                        return_data = []
+                        if pre_tag:
+                            for tag in pre_tag:
+                                data = tag.text.strip()
+                                return_data.append(data)
+                            return return_data
+                        else:
+                            logger.log_error("No <pre> tag found")
+                            return return_data
 
 
 class BibTextTransformer:
@@ -109,7 +119,7 @@ class BibTextTransformer:
         self.data = None
         # self.bookdata = BookData(**self.data)
 
-    def get_data(self, data: list) -> str:
+    def get_data(self, data: list):
         RIS_IDENT = "TY -"
         ARRAY_IDENT = "[kid]"
         COinS_IDENT = "ctx_ver"