minor and major reworks: rename swb to SRU, add a test for PDF parsing

major: rework mail to send messages as plaintext instead of HTML, preventing HTML markup from bleeding into the message text
2025-10-07 14:15:10 +02:00
parent 0df7fd9fe6
commit 06965db26a
25 changed files with 1174 additions and 303 deletions
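
The mail rework itself is not visible in the hunks below. As a rough illustration of the change described in the commit message, here is a minimal sketch of sending plaintext instead of HTML with Python's standard library; the helper name, SMTP host and addresses are assumptions, not the repository's actual mail code:

# Hypothetical sketch of the plaintext-mail approach named in the commit message.
# smtplib/EmailMessage are stdlib; the send_plaintext helper and "localhost" host are assumed.
import smtplib
from email.message import EmailMessage

def send_plaintext(subject: str, body: str, sender: str, recipient: str) -> None:
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = sender
    msg["To"] = recipient
    # set_content() defaults to text/plain, so HTML markup cannot bleed
    # into the rendered message body.
    msg.set_content(body)
    with smtplib.SMTP("localhost") as smtp:
        smtp.send_message(msg)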


@@ -2,6 +2,7 @@ import sys
from datetime import datetime
import loguru
import regex
import requests
from bs4 import BeautifulSoup
@@ -24,7 +25,7 @@ log.add(
class Catalogue:
def __init__(self, timeout=5):
def __init__(self, timeout=15):
self.timeout = timeout
reachable = self.check_connection()
if not reachable:
@@ -61,8 +62,8 @@ class Catalogue:
links = self.get_book_links(searchterm)
print(links)
for link in links:
result = self.search(link)
for elink in links:
result = self.search(elink)
# in result search for class col-xs-12 rds-dl RDS_LOCATION
# if found, return text of href
soup = BeautifulSoup(result, "html.parser")
@@ -74,8 +75,45 @@ class Catalogue:
ppn_el = soup.find(
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
)
ppn = ppn_el.get_text(strip=True) if ppn_el else None
# in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
ppn = (
ppn_el.find_next_sibling(
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
).get_text(strip=True)
if ppn_el
else None
)
# get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION
edition_el = soup.find(
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
)
edition = (
edition_el.find_next_sibling(
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
).get_text(strip=True)
if edition_el
else None
)
authors = soup.find_all(
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
)
author = None
if authors:
# get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
author_names = []
for author in authors:
panel = author.find_next_sibling(
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
)
if panel:
links = panel.find_all("a")
for link in links:
author_names.append(link.text.strip())
author = (
";".join(author_names) if len(author_names) > 1 else author_names[0]
)
signature = None
panel = soup.select_one("div.panel-body")
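The PPN, edition and author lookups added above all follow the same pattern: find the RDS_* head div, then read the value from its rds-dl-panel sibling. A minimal sketch of that lookup as a standalone helper (the helper name and standalone usage are assumptions; the class strings come from the diff):

# Hypothetical helper illustrating the head/panel sibling lookup used above.
from bs4 import BeautifulSoup

PANEL_CLASS = "col-xs-12 col-md-7 col-lg-8 rds-dl-panel"

def read_rds_field(soup: BeautifulSoup, head_class: str) -> str | None:
    """Find the RDS_* head div and return the text of its panel sibling."""
    head = soup.find("div", class_=head_class)
    if head is None:
        return None
    panel = head.find_next_sibling("div", class_=PANEL_CLASS)
    return panel.get_text(strip=True) if panel else None

# e.g. read_rds_field(soup, "col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION")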
@@ -121,4 +159,147 @@ class Catalogue:
title=title,
ppn=ppn,
signature=signature,
library_location=loc.split("-")[-1],
link=elink,
author=author,
edition=edition,
)
else:
return Book(
title=title,
ppn=ppn,
signature=signature,
library_location=loc.split("\n\n")[-1],
link=elink,
author=author,
edition=edition,
)
def get(self, ppn: str) -> Book | None:
# based on PPN, get title, people, edition, year, language, pages, isbn,
link = f"https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{ppn}"
result = self.search(link)
soup = BeautifulSoup(result, "html.parser")
def get_ppn(self, searchterm: str) -> str | None:
links = self.get_book_links(searchterm)
ppn = None
for link in links:
result = self.search(link)
soup = BeautifulSoup(result, "html.parser")
print(link)
ppn = link.split("/")[-1]
if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
return ppn
return ppn
def get_semesterapparat_number(self, searchterm: str) -> int:
links = self.get_book_links(searchterm)
for link in links:
result = self.search(link)
# in result search for class col-xs-12 rds-dl RDS_LOCATION
# if found, return text of href
soup = BeautifulSoup(result, "html.parser")
locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
for location_el in locations:
if "Semesterapparat-" in location_el.text:
match = regex.search(r"Semesterapparat-(\d+)", location_el.text)
if match:
return int(match.group(1))
if "Handbibliothek-" in location_el.text:
return location_el.text.strip().split("\n\n")[-1].strip()
return location_el.text.strip().split("\n\n")[-1].strip()
return 0
def get_author(self, link: str) -> str:
links = self.get_book_links(f"kid:{link}")
author = None
for link in links:
# print(link)
result = self.search(link)
soup = BeautifulSoup(result, "html.parser")
# get all authors, return them as a string separated by ;
authors = soup.find_all(
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
)
if authors:
# get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
author_names = []
for author in authors:
panel = author.find_next_sibling(
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
)
if panel:
links = panel.find_all("a")
for link in links:
author_names.append(link.text.strip())
author = "; ".join(author_names)
return author
def get_signature(self, isbn: str):
links = self.get_book_links(f"{isbn}")
signature = None
for link in links:
result = self.search(link)
soup = BeautifulSoup(result, "html.parser")
panel = soup.select_one("div.panel-body")
if panel:
# Collect the RDS_* blocks in order, using the 'space' divs as separators
groups = []
cur = {}
for node in panel.select(
"div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
):
classes = node.get("class", [])
# Separator between entries
if "space" in classes:
if cur:
groups.append(cur)
cur = {}
continue
# Read the value from the corresponding panel cell
val_el = node.select_one(".rds-dl-panel")
val = (
val_el.get_text(" ", strip=True)
if val_el
else node.get_text(" ", strip=True)
)
if "RDS_SIGNATURE" in classes:
cur["signature"] = val
elif "RDS_STATUS" in classes:
cur["status"] = val
elif "RDS_LOCATION" in classes:
cur["location"] = val
if cur: # append the last group if not followed by a space
groups.append(cur)
# Find the signature for the entry whose location mentions "Semesterapparat"
for g in groups:
print(g)
loc = g.get("location", "").lower()
if "semesterapparat" in loc:
signature = g.get("signature")
return signature
else:
signature = g.get("signature")
return signature
print("No signature found")
return signature
def in_library(self, ppn: str) -> bool:
if ppn is None:
return False
links = self.get_book_links(f"kid:{ppn}")
return len(links) > 0
def get_location(self, ppn: str) -> str | None:
if ppn is None:
return None
link = self.get_book(f"{ppn}")
if link is None:
return None
return link.library_location
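
Taken together, a minimal usage sketch of the reworked class; the Catalogue methods come from this diff, while the search term and printed output are made up:

# Hypothetical usage of the Catalogue scraper shown in this diff.
catalogue = Catalogue(timeout=15)

ppn = catalogue.get_ppn("statistik einführung")   # made-up search term
if ppn and catalogue.in_library(ppn):
    print(catalogue.get_location(ppn))            # library location, or None
    print(catalogue.get_semesterapparat_number("statistik einführung"))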