From 7e2adc5416802dab25856b295d1fc562c0a49fda Mon Sep 17 00:00:00 2001 From: WorldTeacher <41587052+WorldTeacher@users.noreply.github.com> Date: Thu, 10 Oct 2024 14:37:54 +0200 Subject: [PATCH] implement rate limit and retry to prevent ratelimits --- src/logic/webrequest.py | 58 ++++++++++++++++++++++++++++++----------- 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/src/logic/webrequest.py b/src/logic/webrequest.py index c414d79..6f22c3f 100644 --- a/src/logic/webrequest.py +++ b/src/logic/webrequest.py @@ -27,19 +27,25 @@ HEADERS = { (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36", "Accept-Language": "en-US, en;q=0.5", } - +RATE_LIMIT = 20 +RATE_PERIOD = 30 class WebRequest: - def __init__(self, appnr) -> None: + def __init__(self) -> None: """Request data from the web, and format it depending on the mode.""" - self.apparat = appnr - if int(self.apparat) < 10: - self.apparat = f"0{self.apparat}" + self.apparat = None + self.signature = None self.ppn = None self.data = None self.timeout = 5 logger.log_info("Initialized WebRequest") + + def set_apparat(self, apparat): + self.apparat = apparat + if int(self.apparat) < 10: + self.apparat = f"0{self.apparat}" + return self def get_ppn(self, signature): self.signature = signature @@ -49,7 +55,8 @@ class WebRequest: signature = signature.split("/")[-1] self.ppn = signature return self - + @sleep_and_retry + @limits(calls=RATE_LIMIT, period=RATE_PERIOD) def search_book(self, searchterm: str): response = requests.get(PPN_URL.format(searchterm), timeout=self.timeout) return response.text @@ -62,16 +69,17 @@ class WebRequest: for link in links: res.append(BASE + link["href"]) return res - + @sleep_and_retry + @limits(calls=RATE_LIMIT, period=RATE_PERIOD) def search(self, link: str): - response = requests.get(link, timeout=self.timeout) - return response.text - - def get_data( - self - ): + try: + response = requests.get(link, timeout=self.timeout) + return response.text + except 
requests.exceptions.RequestException as e: + logger.log_error(f"Request failed: {e}") + return None + def get_data(self): links = self.get_book_links(self.ppn) - print(links) for link in links: result = self.search(link) # in result search for class col-xs-12 rds-dl RDS_LOCATION @@ -95,6 +103,27 @@ class WebRequest: logger.log_error("No <pre> tag found")
                             return return_data
 
+    def get_data_elsa(self):
+        """Return the stripped text of every <pre> tag on the first located page.
+
+        Walks the links from ``get_book_links(self.ppn)`` and stops at the first
+        page containing an RDS_LOCATION block. Returns a list of strings (empty
+        if that page has no <pre> tag), or None when no page has such a block.
+        """
+        for link in self.get_book_links(self.ppn):
+            result = self.search(link)
+            if result is None:
+                # search() returns None on a failed request; try the next link.
+                continue
+            soup = BeautifulSoup(result, "html.parser")
+            if not soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION"):
+                continue
+            pre_tags = soup.find_all("pre")
+            if not pre_tags:
+                logger.log_error("No <pre> tag found")
+            return [tag.text.strip() for tag in pre_tags]
+        return None
+
 
 class BibTextTransformer:
     """Transforms data from the web into a BibText format.
@@ -114,7 +143,6 @@ class BibTextTransformer:
         if mode not in self.valid_modes:
             logger.log_error(f"Mode {mode} not valid")
             raise ValueError(f"Mode {mode} not valid")
-        # # print(self.field)
         self.data = None
         # self.bookdata = BookData(**self.data)