291 lines
12 KiB
Python
291 lines
12 KiB
Python
from urllib.parse import quote_plus

import regex
import requests
from bs4 import BeautifulSoup

from src.logic import BookData as Book
from src.shared.logging import log
|
|
|
|
# Advanced-search URL template. The single ``{}`` placeholder is the
# "allfields" term; every other search field is sent empty. One field per
# line: type0[] = field code, lookfor0[] = value, joined with AND.
URL = (
    "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search"
    "?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
    "&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
)

# Scheme and host that the relative result links are appended to.
BASE = "https://rds.ibs-bw.de"
|
|
|
|
|
|
class Catalogue:
|
|
def __init__(self, timeout=15):
|
|
self.timeout = timeout
|
|
reachable = self.check_connection()
|
|
if not reachable:
|
|
log.error("No internet connection available.")
|
|
raise ConnectionError("No internet connection available.")
|
|
|
|
def check_connection(self):
|
|
try:
|
|
response = requests.get("https://www.google.com", timeout=self.timeout)
|
|
if response.status_code == 200:
|
|
return True
|
|
except requests.exceptions.RequestException as e:
|
|
log.error(f"Could not connect to google.com: {e}")
|
|
|
|
def search_book(self, searchterm: str):
|
|
response = requests.get(URL.format(searchterm), timeout=self.timeout)
|
|
return response.text
|
|
|
|
def search(self, link: str):
|
|
response = requests.get(link, timeout=self.timeout)
|
|
return response.text
|
|
|
|
def get_book_links(self, searchterm: str):
|
|
response = self.search_book(searchterm)
|
|
soup = BeautifulSoup(response, "html.parser")
|
|
links = soup.find_all("a", class_="title getFull")
|
|
res = []
|
|
for link in links:
|
|
res.append(BASE + link["href"])
|
|
return res
|
|
|
|
def get_book(self, searchterm: str):
|
|
log.info(f"Searching for term: {searchterm}")
|
|
|
|
links = self.get_book_links(searchterm)
|
|
print(links)
|
|
for elink in links:
|
|
result = self.search(elink)
|
|
# in result search for class col-xs-12 rds-dl RDS_LOCATION
|
|
# if found, return text of href
|
|
soup = BeautifulSoup(result, "html.parser")
|
|
|
|
# Optional (unchanged): title and ppn if you need them
|
|
title_el = soup.find("div", class_="headline text")
|
|
title = title_el.get_text(strip=True) if title_el else None
|
|
|
|
ppn_el = soup.find(
|
|
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
|
|
)
|
|
# in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
|
|
ppn = (
|
|
ppn_el.find_next_sibling(
|
|
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
|
).get_text(strip=True)
|
|
if ppn_el
|
|
else None
|
|
)
|
|
|
|
# get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION
|
|
edition_el = soup.find(
|
|
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
|
|
)
|
|
edition = (
|
|
edition_el.find_next_sibling(
|
|
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
|
).get_text(strip=True)
|
|
if edition_el
|
|
else None
|
|
)
|
|
|
|
authors = soup.find_all(
|
|
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
|
|
)
|
|
author = None
|
|
if authors:
|
|
# get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
|
|
author_names = []
|
|
for author in authors:
|
|
panel = author.find_next_sibling(
|
|
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
|
)
|
|
if panel:
|
|
links = panel.find_all("a")
|
|
for link in links:
|
|
author_names.append(link.text.strip())
|
|
author = (
|
|
";".join(author_names) if len(author_names) > 1 else author_names[0]
|
|
)
|
|
signature = None
|
|
|
|
panel = soup.select_one("div.panel-body")
|
|
if panel:
|
|
# Collect the RDS_* blocks in order, using the 'space' divs as separators
|
|
groups = []
|
|
cur = {}
|
|
for node in panel.select(
|
|
"div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
|
|
):
|
|
classes = node.get("class", [])
|
|
# Separator between entries
|
|
if "space" in classes:
|
|
if cur:
|
|
groups.append(cur)
|
|
cur = {}
|
|
continue
|
|
|
|
# Read the value from the corresponding panel cell
|
|
val_el = node.select_one(".rds-dl-panel")
|
|
val = (
|
|
val_el.get_text(" ", strip=True)
|
|
if val_el
|
|
else node.get_text(" ", strip=True)
|
|
)
|
|
|
|
if "RDS_SIGNATURE" in classes:
|
|
cur["signature"] = val
|
|
elif "RDS_STATUS" in classes:
|
|
cur["status"] = val
|
|
elif "RDS_LOCATION" in classes:
|
|
cur["location"] = val
|
|
|
|
if cur: # append the last group if not followed by a space
|
|
groups.append(cur)
|
|
|
|
# Find the signature for the entry whose location mentions "Semesterapparat"
|
|
for g in groups:
|
|
loc = g.get("location", "").lower()
|
|
if "semesterapparat" in loc:
|
|
signature = g.get("signature")
|
|
return Book(
|
|
title=title,
|
|
ppn=ppn,
|
|
signature=signature,
|
|
library_location=loc.split("-")[-1],
|
|
link=elink,
|
|
author=author,
|
|
edition=edition,
|
|
)
|
|
else:
|
|
return Book(
|
|
title=title,
|
|
ppn=ppn,
|
|
signature=signature,
|
|
library_location=loc.split("\n\n")[-1],
|
|
link=elink,
|
|
author=author,
|
|
edition=edition,
|
|
)
|
|
|
|
def get(self, ppn: str) -> Book | None:
|
|
# based on PPN, get title, people, edition, year, language, pages, isbn,
|
|
link = f"https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{ppn}"
|
|
result = self.search(link)
|
|
soup = BeautifulSoup(result, "html.parser")
|
|
|
|
def get_ppn(self, searchterm: str) -> str | None:
|
|
links = self.get_book_links(searchterm)
|
|
ppn = None
|
|
for link in links:
|
|
result = self.search(link)
|
|
soup = BeautifulSoup(result, "html.parser")
|
|
print(link)
|
|
ppn = link.split("/")[-1]
|
|
if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
|
|
return ppn
|
|
return ppn
|
|
|
|
def get_semesterapparat_number(self, searchterm: str) -> int:
|
|
links = self.get_book_links(searchterm)
|
|
for link in links:
|
|
result = self.search(link)
|
|
# in result search for class col-xs-12 rds-dl RDS_LOCATION
|
|
# if found, return text of href
|
|
soup = BeautifulSoup(result, "html.parser")
|
|
|
|
locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
|
|
for location_el in locations:
|
|
if "Semesterapparat-" in location_el.text:
|
|
match = regex.search(r"Semesterapparat-(\d+)", location_el.text)
|
|
if match:
|
|
return int(match.group(1))
|
|
if "Handbibliothek-" in location_el.text:
|
|
return location_el.text.strip().split("\n\n")[-1].strip()
|
|
return location_el.text.strip().split("\n\n")[-1].strip()
|
|
return 0
|
|
|
|
def get_author(self, link: str) -> str:
|
|
links = self.get_book_links(f"kid:{link}")
|
|
author = None
|
|
for link in links:
|
|
# print(link)
|
|
result = self.search(link)
|
|
soup = BeautifulSoup(result, "html.parser")
|
|
# get all authors, return them as a string seperated by ;
|
|
authors = soup.find_all(
|
|
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
|
|
)
|
|
if authors:
|
|
# get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
|
|
author_names = []
|
|
for author in authors:
|
|
panel = author.find_next_sibling(
|
|
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
|
)
|
|
if panel:
|
|
links = panel.find_all("a")
|
|
for link in links:
|
|
author_names.append(link.text.strip())
|
|
author = "; ".join(author_names)
|
|
return author
|
|
|
|
def get_signature(self, isbn: str):
|
|
links = self.get_book_links(f"{isbn}")
|
|
signature = None
|
|
for link in links:
|
|
result = self.search(link)
|
|
soup = BeautifulSoup(result, "html.parser")
|
|
panel = soup.select_one("div.panel-body")
|
|
if panel:
|
|
# Collect the RDS_* blocks in order, using the 'space' divs as separators
|
|
groups = []
|
|
cur = {}
|
|
for node in panel.select(
|
|
"div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
|
|
):
|
|
classes = node.get("class", [])
|
|
# Separator between entries
|
|
if "space" in classes:
|
|
if cur:
|
|
groups.append(cur)
|
|
cur = {}
|
|
continue
|
|
|
|
# Read the value from the corresponding panel cell
|
|
val_el = node.select_one(".rds-dl-panel")
|
|
val = (
|
|
val_el.get_text(" ", strip=True)
|
|
if val_el
|
|
else node.get_text(" ", strip=True)
|
|
)
|
|
|
|
if "RDS_SIGNATURE" in classes:
|
|
cur["signature"] = val
|
|
elif "RDS_STATUS" in classes:
|
|
cur["status"] = val
|
|
elif "RDS_LOCATION" in classes:
|
|
cur["location"] = val
|
|
|
|
if cur: # append the last group if not followed by a space
|
|
groups.append(cur)
|
|
|
|
# Find the signature for the entry whose location mentions "Semesterapparat"
|
|
for g in groups:
|
|
print(g)
|
|
loc = g.get("location", "").lower()
|
|
if "semesterapparat" in loc:
|
|
signature = g.get("signature")
|
|
return signature
|
|
else:
|
|
signature = g.get("signature")
|
|
return signature
|
|
print("No signature found")
|
|
return signature
|
|
|
|
def in_library(self, ppn: str) -> bool:
|
|
if ppn is None:
|
|
return False
|
|
links = self.get_book_links(f"kid:{ppn}")
|
|
return len(links) > 0
|
|
|
|
def get_location(self, ppn: str) -> str | None:
|
|
if ppn is None:
|
|
return None
|
|
link = self.get_book(f"{ppn}")
|
|
if link is None:
|
|
return None
|
|
return link.library_location
|