LinkAvailableChecker/webscraper.py

import xml.etree.ElementTree as ET

import requests
from ratelimit import limits, sleep_and_retry  # type: ignore

from src.database import Database

db = Database("lfer.db")
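# Note: Database comes from src/database.py, which is not shown here. Judging
# from its use below, it is assumed to expose add_data(ppn, link), writing one
# PPN/URL pair (or an error marker) per row into lfer.db.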


@sleep_and_retry
@limits(calls=10, period=1)  # throttle to at most 10 SRU requests per second
def fetch_data(ppn: str) -> str:
    """Fetch the MARCXML record for the given PPN from the SWB SRU API."""
    api_url = f"https://sru.bsz-bw.de/swb?version=1.1&query=pica.ppn%3D{ppn}&operation=searchRetrieve&maximumRecords=10&recordSchema=marcxmlk10os"
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(api_url, timeout=30)
    return response.text
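
# The request is an SRU 1.1 searchRetrieve: pica.ppn%3D{ppn} is the URL-encoded
# form of the query pica.ppn={ppn}, and recordSchema=marcxmlk10os asks for
# MARCXML. A sketch of the same request built with requests' own parameter
# encoding (equivalent, not what the code above does):
#
#     requests.get(
#         "https://sru.bsz-bw.de/swb",
#         params={
#             "version": "1.1",
#             "query": f"pica.ppn={ppn}",
#             "operation": "searchRetrieve",
#             "maximumRecords": "10",
#             "recordSchema": "marcxmlk10os",
#         },
#         timeout=30,
#     )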


def process_response(response):
    """Extracts URLs from datafield 856, subfield u."""
    try:
        root = ET.fromstring(response)
        namespace = {
            "zs": "http://www.loc.gov/zing/srw/",
            "marc": "http://www.loc.gov/MARC21/slim",
        }
        # Find the first recordData element (find returns one match, not all)
        record_data = root.find(".//zs:recordData", namespace)
        if record_data is None:
            return None
        # Collect subfield u (the URL) of every 856 datafield in the record
        links = []
        for datafield in record_data.findall(
            ".//marc:datafield[@tag='856']", namespace
        ):
            for subfield in datafield.findall("marc:subfield[@code='u']", namespace):
                links.append(subfield.text)
        return links if links else None
    except ET.ParseError:
        return None
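
# Illustrative input/output (a hand-made minimal sample, not a live SWB
# response; the URL is a placeholder):
#
#     sample = (
#         '<zs:searchRetrieveResponse xmlns:zs="http://www.loc.gov/zing/srw/">'
#         "<zs:records><zs:record><zs:recordData>"
#         '<record xmlns="http://www.loc.gov/MARC21/slim">'
#         '<datafield tag="856" ind1="4" ind2="0">'
#         '<subfield code="u">https://example.org/resource</subfield>'
#         "</datafield></record>"
#         "</zs:recordData></zs:record></zs:records>"
#         "</zs:searchRetrieveResponse>"
#     )
#     process_response(sample)  # -> ["https://example.org/resource"]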


def get_data():
    with open("ppnlist.txt", "r") as f:
        # Strip whitespace and skip blank lines; a trailing newline would
        # otherwise yield an empty PPN and a pointless API request.
        ppns = [line.strip() for line in f if line.strip()]
    for i, ppn in enumerate(ppns, start=1):
        data = fetch_data(ppn)
        links = process_response(data)
        if links is None:
            db.add_data(ppn, "Error: No data found")
        else:
            for link in links:
                db.add_data(ppn, link)
        # enumerate replaces ppns.index(ppn), which scans the list on every
        # iteration and reports the wrong position for duplicate PPNs.
        print(f"Progress: {i}/{len(ppns)}", end="\r")


if __name__ == "__main__":
    print("Hello from webscraper!\nScraping the list of PPNs...")
    get_data()
    print("Done")