update code, add all features
This commit is contained in:
63
webscraper.py
Normal file
63
webscraper.py
Normal file
@@ -0,0 +1,63 @@
|
||||
import requests
|
||||
from ratelimit import limits, sleep_and_retry
|
||||
from src.database import Database
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
|
||||
# Module-level database handle; get_data() writes one row per (PPN, link)
# pair through db.add_data().
db = Database("lfer.db")
|
||||
|
||||
|
||||
@sleep_and_retry
@limits(calls=10, period=1)
def fetch_data(ppn, timeout=30):
    """Fetch the raw SRU response for a single PPN.

    The decorators throttle this function to 10 calls per period of 1
    (seconds, per the ``ratelimit`` package); ``sleep_and_retry`` is expected
    to wait and retry rather than raise when the limit is exceeded.

    Args:
        ppn: Identifier inserted into the ``pica.ppn`` clause of the SRU query.
        timeout: Seconds ``requests`` waits on the server before raising a
            timeout error. Defaults to 30.

    Returns:
        The response body as text. The status code and content are not
        validated here; callers parse the text themselves.

    Raises:
        requests.RequestException: On connection failures or timeouts.
    """
    api_url = f"https://sru.bsz-bw.de/swb?version=1.1&query=pica.ppn%3D{ppn}&operation=searchRetrieve&maximumRecords=10&recordSchema=marcxmlk10os"
    # Without an explicit timeout requests waits indefinitely, so a single
    # stalled connection would hang the whole scrape.
    response = requests.get(api_url, timeout=timeout)
    return response.text
|
||||
|
||||
|
||||
def process_response(response):
    """Extract the URLs stored in MARC datafield 856, subfield u.

    Args:
        response: SRU response body as an XML string.

    Returns:
        A list of non-empty URL strings taken from the first ``recordData``
        element of the response, or ``None`` when the XML cannot be parsed,
        no ``recordData`` element exists, or no usable 856$u value is found.
    """
    namespace = {
        "zs": "http://www.loc.gov/zing/srw/",
        "marc": "http://www.loc.gov/MARC21/slim",
    }

    # Only parsing can raise ParseError, so keep the try body minimal.
    try:
        root = ET.fromstring(response)
    except ET.ParseError:
        return None

    # Only the first recordData element is inspected.
    record_data = root.find(".//zs:recordData", namespace)
    if record_data is None:
        return None

    links = []
    for datafield in record_data.findall(
        ".//marc:datafield[@tag='856']", namespace
    ):
        for subfield in datafield.findall("marc:subfield[@code='u']", namespace):
            # An empty <subfield/> has text None; skip it (and blank values)
            # so callers never receive a non-URL entry.
            url = (subfield.text or "").strip()
            if url:
                links.append(url)

    return links if links else None
|
||||
|
||||
|
||||
def get_data():
    """Look up every PPN listed in ``ppnlist.txt`` and store its links.

    Each non-blank line of the file is treated as one PPN. For every PPN the
    SRU response is fetched and parsed; each extracted link is written with
    ``db.add_data(ppn, link)``. When no links are found, a single row with
    the text ``"Error: No data found"`` is written instead. A progress
    counter is printed on one console line.
    """
    with open("ppnlist.txt", "r", encoding="utf-8") as f:
        # Strip line endings and drop blank lines (e.g. a trailing newline)
        # so no request is made for an empty PPN.
        stripped_lines = (line.strip() for line in f)
        ppns = [ppn for ppn in stripped_lines if ppn]

    total = len(ppns)
    # enumerate gives the true position even when a PPN appears twice, and
    # avoids a linear list search on every iteration.
    for position, ppn in enumerate(ppns, start=1):
        data = fetch_data(ppn)
        links = process_response(data)
        if links is None:
            db.add_data(ppn, "Error: No data found")
        else:
            for link in links:
                db.add_data(ppn, link)
        print("Progress: ", position, "/", total, end="\r")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: announce the run, scrape every PPN, report the end.
    print("Hello from webscraper!\nScraping the list of PPNs...")
    get_data()
    print("Done")
|
||||
Reference in New Issue
Block a user