"""Fetch MARCXML records for a list of PPNs from the SWB SRU interface and
store every URL found in datafield 856, subfield u, in a local database."""

import xml.etree.ElementTree as ET

import requests
from ratelimit import limits, sleep_and_retry  # type: ignore

from src.database import Database

db = Database("lfer.db")


@sleep_and_retry
@limits(calls=10, period=1)  # at most 10 requests per second
def fetch_data(ppn: str) -> str:
    """Queries the SRU API for a single PPN and returns the raw XML response."""
    api_url = (
        "https://sru.bsz-bw.de/swb?version=1.1"
        f"&query=pica.ppn%3D{ppn}"
        "&operation=searchRetrieve&maximumRecords=10&recordSchema=marcxmlk10os"
    )
    # A timeout keeps the scraper from hanging forever on a stalled connection
    response = requests.get(api_url, timeout=30)
    return response.text


def process_response(response: str):
    """Extracts URLs from datafield 856, subfield u."""
    try:
        root = ET.fromstring(response)
        namespace = {
            "zs": "http://www.loc.gov/zing/srw/",
            "marc": "http://www.loc.gov/MARC21/slim",
        }
        # Find the first recordData element
        record_data = root.find(".//zs:recordData", namespace)
        if record_data is None:
            return None
        # Collect the URLs from every datafield 856 / subfield u
        links = []
        for datafield in record_data.findall(
            ".//marc:datafield[@tag='856']", namespace
        ):
            for subfield in datafield.findall("marc:subfield[@code='u']", namespace):
                if subfield.text:  # skip empty subfields instead of storing None
                    links.append(subfield.text)
        return links if links else None
    except ET.ParseError:
        return None


def get_data():
    """Reads PPNs from ppnlist.txt and stores the extracted links via db."""
    with open("ppnlist.txt", "r") as f:
        # Skip blank lines so a trailing newline does not produce an empty PPN
        ppns = [line.strip() for line in f if line.strip()]
    # enumerate avoids the O(n) list.index lookup, which also miscounts
    # when the same PPN appears twice in the list
    for i, ppn in enumerate(ppns, start=1):
        data = fetch_data(ppn)
        links = process_response(data)
        if links is None:
            db.add_data(ppn, "Error: No data found")
        else:
            for link in links:
                db.add_data(ppn, link)
        print(f"Progress: {i}/{len(ppns)}", end="\r")


if __name__ == "__main__":
    print("Hello from webscraper!\nScraping the list of PPNs...")
    get_data()
    print("Done")
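
# ---------------------------------------------------------------------------
# Note: src/database.py is not shown here. The script above only relies on the
# interface Database(path) and Database.add_data(ppn, link). The sketch below
# is a hypothetical SQLite-backed implementation that would satisfy that
# interface; the table name and schema are assumptions, not the project's
# actual code:
#
#     import sqlite3
#
#     class Database:
#         def __init__(self, path: str):
#             self.conn = sqlite3.connect(path)
#             self.conn.execute(
#                 "CREATE TABLE IF NOT EXISTS links (ppn TEXT, link TEXT)"
#             )
#
#         def add_data(self, ppn: str, link: str) -> None:
#             self.conn.execute(
#                 "INSERT INTO links (ppn, link) VALUES (?, ?)", (ppn, link)
#             )
#             self.conn.commit()
# ---------------------------------------------------------------------------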