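"""Scrape links for a list of PPNs from the SWB SRU interface.

Reads PPNs from ppnlist.txt (one per line), queries the BSZ/SWB SRU endpoint
for each PPN, extracts URLs from MARC datafield 856, subfield u, and stores
each PPN/link pair via the project-local Database ("lfer.db").
"""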
import xml.etree.ElementTree as ET

import requests
from ratelimit import limits, sleep_and_retry  # type: ignore

from src.database import Database

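# The project-local Database wrapper is not part of this file. From its use
# below it only needs a constructor taking a file path and an
# add_data(ppn, value) method; an illustrative sketch (not the actual
# src.database implementation) would be:
#
#     class Database:
#         def __init__(self, path: str) -> None: ...
#         def add_data(self, ppn: str, value: str) -> None: ...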
db = Database("lfer.db")


@sleep_and_retry
@limits(calls=10, period=1)  # at most 10 calls per 1-second period; sleep_and_retry waits instead of raising
def fetch_data(ppn: str) -> str:
    """Query the SWB SRU endpoint for a single PPN and return the raw XML text."""
    api_url = f"https://sru.bsz-bw.de/swb?version=1.1&query=pica.ppn%3D{ppn}&operation=searchRetrieve&maximumRecords=10&recordSchema=marcxmlk10os"
    # timeout is an addition here so a stalled connection cannot hang the whole run
    response = requests.get(api_url, timeout=30)
    return response.text


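# For reference, a successful SRU response has roughly the following (abridged,
# illustrative) shape; process_response only relies on the zs:recordData
# wrapper and on MARC datafield 856 / subfield u:
#
#   <zs:searchRetrieveResponse xmlns:zs="http://www.loc.gov/zing/srw/">
#     <zs:records>
#       <zs:record>
#         <zs:recordData>
#           <record xmlns="http://www.loc.gov/MARC21/slim">
#             <datafield tag="856">
#               <subfield code="u">https://example.org/fulltext</subfield>
#             </datafield>
#           </record>
#         </zs:recordData>
#       </zs:record>
#     </zs:records>
#   </zs:searchRetrieveResponse>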
def process_response(response: str):
    """Extracts URLs from datafield 856, subfield u."""
    try:
        root = ET.fromstring(response)
        namespace = {
            "zs": "http://www.loc.gov/zing/srw/",
            "marc": "http://www.loc.gov/MARC21/slim",
        }

        # Find the first recordData element (only the first returned record is inspected)
        record_data = root.find(".//zs:recordData", namespace)
        if record_data is None:
            return None

        # Find all datafield 856 elements and collect their 'u' subfields
        links = []
        for datafield in record_data.findall(
            ".//marc:datafield[@tag='856']", namespace
        ):
            for subfield in datafield.findall("marc:subfield[@code='u']", namespace):
                if subfield.text:  # skip empty subfields instead of storing None
                    links.append(subfield.text)

        return links if links else None
    except ET.ParseError:
        return None


def get_data():
    """Fetch and store links for every PPN listed in ppnlist.txt."""
    with open("ppnlist.txt", "r") as f:
        ppns = f.read()
    # Drop blank lines (e.g. a trailing newline) so empty PPNs are never queried
    ppns = [ppn.strip() for ppn in ppns.split("\n") if ppn.strip()]
    for i, ppn in enumerate(ppns, start=1):
        data = fetch_data(ppn)
        links = process_response(data)
        if links is None:
            db.add_data(ppn, "Error: No data found")
        else:
            for link in links:
                db.add_data(ppn, link)
        # enumerate replaces ppns.index(ppn), which was O(n) and wrong for duplicate PPNs
        print("Progress: ", i, "/", len(ppns), end="\r")


if __name__ == "__main__":
    print("Hello from webscraper!\nScraping the list of PPNs...")
    get_data()
    print("Done")
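# Note: `from src.database import Database` resolves when the project root
# (the directory containing the src/ package) is on sys.path, e.g. when this
# script sits in the project root or is run as a module from there.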