Minor and major reworks: rename swb to SRU; add a test for PDF parsing.
Major: rework mail to send plaintext instead of HTML, preventing HTML markup from bleeding into the message body.
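(Note on the mail change: the mail module itself is not shown in the hunks below. As an illustration only, a minimal sketch of the plaintext approach, assuming Python's standard smtplib and email.message APIs; the function name and parameters here are hypothetical, not the project's actual code.)

import smtplib
from email.message import EmailMessage

def send_plaintext(host: str, sender: str, to: str, subject: str, body: str) -> None:
    msg = EmailMessage()
    msg["From"] = sender
    msg["To"] = to
    msg["Subject"] = subject
    msg.set_content(body)  # text/plain by default; no text/html part is attached
    with smtplib.SMTP(host) as smtp:
        smtp.send_message(msg)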
@@ -1,6 +1,5 @@
__all__ = [
    "AdminCommands",
    "Semester",
    "AutoAdder",
    "AvailChecker",
    "BookGrabber",
@@ -9,16 +8,15 @@ __all__ = [
    "NewEditionCheckerThread",
    "recreateElsaFile",
    "recreateFile",
-    "Catalogue"
+    "Catalogue",
]

from .admin_console import AdminCommands
from .catalogue import Catalogue
from .create_file import recreateElsaFile, recreateFile
from .database import Database
from .documentation_thread import DocumentationThread
from .semester import Semester
from .thread_bookgrabber import BookGrabber
from .thread_neweditions import NewEditionCheckerThread
from .threads_autoadder import AutoAdder
from .threads_availchecker import AvailChecker
-from .catalogue import Catalogue
@@ -2,6 +2,7 @@ import sys
from datetime import datetime

import loguru
import regex
import requests
from bs4 import BeautifulSoup
@@ -24,7 +25,7 @@ log.add(


class Catalogue:
-    def __init__(self, timeout=5):
+    def __init__(self, timeout=15):
        self.timeout = timeout
        reachable = self.check_connection()
        if not reachable:
@@ -61,8 +62,8 @@ class Catalogue:

        links = self.get_book_links(searchterm)
        print(links)
-        for link in links:
-            result = self.search(link)
+        for elink in links:
+            result = self.search(elink)
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")
@@ -74,8 +75,45 @@ class Catalogue:
            ppn_el = soup.find(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
            )
-            ppn = ppn_el.get_text(strip=True) if ppn_el else None
+            # in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
+            ppn = (
+                ppn_el.find_next_sibling(
+                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                ).get_text(strip=True)
+                if ppn_el
+                else None
+            )
+
+            # get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION
+            edition_el = soup.find(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
+            )
+            edition = (
+                edition_el.find_next_sibling(
+                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                ).get_text(strip=True)
+                if edition_el
+                else None
+            )
+
+            authors = soup.find_all(
+                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
+            )
+            author = None
+            if authors:
+                # get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
+                author_names = []
+                for author in authors:
+                    panel = author.find_next_sibling(
+                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                    )
+                    if panel:
+                        links = panel.find_all("a")
+                        for link in links:
+                            author_names.append(link.text.strip())
+                author = (
+                    ";".join(author_names) if len(author_names) > 1 else author_names[0]
+                )
            signature = None

            panel = soup.select_one("div.panel-body")
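(For review context, not part of the diff: the head/panel pairing used above, where each RDS_* label div is followed by an rds-dl-panel sibling holding the value, can be reproduced on made-up HTML. A minimal sketch:)

from bs4 import BeautifulSoup

html = """
<div class="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN">PPN</div>
<div class="col-xs-12 col-md-7 col-lg-8 rds-dl-panel">123456789</div>
"""
soup = BeautifulSoup(html, "html.parser")
head = soup.find("div", class_="RDS_PPN")
value = head.find_next_sibling("div", class_="rds-dl-panel").get_text(strip=True)
print(value)  # 123456789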
@@ -121,4 +159,147 @@ class Catalogue:
                        title=title,
                        ppn=ppn,
                        signature=signature,
                        library_location=loc.split("-")[-1],
                        link=elink,
                        author=author,
                        edition=edition,
                    )
                else:
                    return Book(
                        title=title,
                        ppn=ppn,
                        signature=signature,
                        library_location=loc.split("\n\n")[-1],
                        link=elink,
                        author=author,
                        edition=edition,
                    )

    def get(self, ppn: str) -> Book | None:
        # based on PPN, get title, people, edition, year, language, pages, isbn,
        link = f"https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{ppn}"
        result = self.search(link)
        soup = BeautifulSoup(result, "html.parser")

    def get_ppn(self, searchterm: str) -> str | None:
        links = self.get_book_links(searchterm)
        ppn = None
        for link in links:
            result = self.search(link)
            soup = BeautifulSoup(result, "html.parser")
            print(link)
            ppn = link.split("/")[-1]
            if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
                return ppn
        return ppn
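(Not part of the diff: a quick check of the PPN shape accepted above, 8 to 10 digits plus an optional X check character. The candidate strings are made up.)

import regex

for cand in ["123456789", "03188972X", "not-a-ppn"]:
    print(cand, bool(regex.match(r"^\d{8,10}[X\d]?$", cand)))
# 123456789 True / 03188972X True / not-a-ppn False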
    def get_semesterapparat_number(self, searchterm: str) -> int:
        links = self.get_book_links(searchterm)
        for link in links:
            result = self.search(link)
            # in result search for class col-xs-12 rds-dl RDS_LOCATION
            # if found, return text of href
            soup = BeautifulSoup(result, "html.parser")

            locations = soup.find_all("div", class_="col-xs-12 rds-dl RDS_LOCATION")
            for location_el in locations:
                if "Semesterapparat-" in location_el.text:
                    match = regex.search(r"Semesterapparat-(\d+)", location_el.text)
                    if match:
                        return int(match.group(1))
                if "Handbibliothek-" in location_el.text:
                    return location_el.text.strip().split("\n\n")[-1].strip()
                return location_el.text.strip().split("\n\n")[-1].strip()
        return 0
    def get_author(self, link: str) -> str:
        links = self.get_book_links(f"kid:{link}")
        author = None
        for link in links:
            # print(link)
            result = self.search(link)
            soup = BeautifulSoup(result, "html.parser")
            # get all authors, return them as a string separated by ;
            authors = soup.find_all(
                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
            )
            if authors:
                # get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
                author_names = []
                for author in authors:
                    panel = author.find_next_sibling(
                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
                    )
                    if panel:
                        links = panel.find_all("a")
                        for link in links:
                            author_names.append(link.text.strip())
                author = "; ".join(author_names)
        return author

    def get_signature(self, isbn: str):
        links = self.get_book_links(f"{isbn}")
        signature = None
        for link in links:
            result = self.search(link)
            soup = BeautifulSoup(result, "html.parser")
            panel = soup.select_one("div.panel-body")
            if panel:
                # Collect the RDS_* blocks in order, using the 'space' divs as separators
                groups = []
                cur = {}
                for node in panel.select(
                    "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
                ):
                    classes = node.get("class", [])
                    # Separator between entries
                    if "space" in classes:
                        if cur:
                            groups.append(cur)
                            cur = {}
                        continue

                    # Read the value from the corresponding panel cell
                    val_el = node.select_one(".rds-dl-panel")
                    val = (
                        val_el.get_text(" ", strip=True)
                        if val_el
                        else node.get_text(" ", strip=True)
                    )

                    if "RDS_SIGNATURE" in classes:
                        cur["signature"] = val
                    elif "RDS_STATUS" in classes:
                        cur["status"] = val
                    elif "RDS_LOCATION" in classes:
                        cur["location"] = val

                if cur:  # append the last group if not followed by a space
                    groups.append(cur)

                # Find the signature for the entry whose location mentions "Semesterapparat"
                for g in groups:
                    print(g)
                    loc = g.get("location", "").lower()
                    if "semesterapparat" in loc:
                        signature = g.get("signature")
                        return signature
                    else:
                        signature = g.get("signature")
                        return signature
        print("No signature found")
        return signature
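(For review context, not part of the diff: the separator-grouping idea in get_signature, collecting label/value blocks until a 'space' div closes the entry, works like this on made-up HTML. A simplified sketch with only two field kinds:)

from bs4 import BeautifulSoup

html = """
<div class="panel-body">
  <div class="rds-dl RDS_SIGNATURE"><div class="rds-dl-panel">Sig A</div></div>
  <div class="rds-dl RDS_LOCATION"><div class="rds-dl-panel">Semesterapparat-7</div></div>
  <div class="col-xs-12 space"></div>
  <div class="rds-dl RDS_SIGNATURE"><div class="rds-dl-panel">Sig B</div></div>
</div>
"""
panel = BeautifulSoup(html, "html.parser").select_one("div.panel-body")
groups, cur = [], {}
for node in panel.select("div.rds-dl, div.space"):
    if "space" in node.get("class", []):
        if cur:
            groups.append(cur)
            cur = {}
        continue
    key = "signature" if "RDS_SIGNATURE" in node["class"] else "location"
    cur[key] = node.select_one(".rds-dl-panel").get_text(strip=True)
if cur:
    groups.append(cur)
print(groups)
# [{'signature': 'Sig A', 'location': 'Semesterapparat-7'}, {'signature': 'Sig B'}]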
    def in_library(self, ppn: str) -> bool:
        if ppn is None:
            return False
        links = self.get_book_links(f"kid:{ppn}")
        return len(links) > 0

    def get_location(self, ppn: str) -> str | None:
        if ppn is None:
            return None
        link = self.get_book(f"{ppn}")
        if link is None:
            return None
        return link.library_location
@@ -30,10 +30,9 @@ from src.backend.db import (
from src.errors import AppPresentError, NoResultError
from src.logic import ELSA, Apparat, ApparatData, BookData, Prof
from src.logic.constants import SEMAP_MEDIA_ACCOUNTS
+from src.logic.semester import Semester
from src.utils.blob import create_blob

-from .semester import Semester

log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
@@ -1873,7 +1872,7 @@ class Database:
        Returns:
            list[tuple]: A list of tuples containing the new editions data
        """
-        query = "SELECT * FROM neweditions WHERE for_apparat=?"
+        query = "SELECT * FROM neweditions WHERE for_apparat=? AND ordered=0"
        results = self.query_db(query, (apparat_id,))
        res = []
        for result in results:
@@ -1887,9 +1886,25 @@ class Database:
        query = "UPDATE neweditions SET ordered=1 WHERE id=?"
        self.query_db(query, (newBook_id,))

+    def getBooksWithNewEditions(self, app_id) -> List[BookData]:
+        # select all bookdata from media, based on the old_edition_id in neweditions where for_apparat = app_id; also get the new_edition bookdata
+
+        query = "SELECT m.bookdata, new_bookdata FROM media m JOIN neweditions n ON m.id = n.old_edition_id WHERE n.for_apparat = ?"
+        results = self.query_db(query, (app_id,))
+        # store results in tuple old,new
+        res = []
+        for result in results:
+            oldedition = BookData().from_string(result[0])
+            newedition = BookData().from_string(result[1])
+            res.append((oldedition, newedition))
+        return res
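(Not part of the diff: the new JOIN can be exercised against a throwaway in-memory database. The table shapes below are assumptions reduced to the columns the query touches:)

import sqlite3

con = sqlite3.connect(":memory:")
con.executescript(
    """
    CREATE TABLE media (id INTEGER PRIMARY KEY, bookdata TEXT);
    CREATE TABLE neweditions (
        id INTEGER PRIMARY KEY,
        new_bookdata TEXT,
        old_edition_id INTEGER,
        for_apparat INTEGER,
        ordered INTEGER DEFAULT 0
    );
    INSERT INTO media (id, bookdata) VALUES (1, 'old edition blob');
    INSERT INTO neweditions (new_bookdata, old_edition_id, for_apparat)
        VALUES ('new edition blob', 1, 42);
    """
)
rows = con.execute(
    "SELECT m.bookdata, new_bookdata FROM media m "
    "JOIN neweditions n ON m.id = n.old_edition_id WHERE n.for_apparat = ?",
    (42,),
).fetchall()
print(rows)  # [('old edition blob', 'new edition blob')]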
    def getNewEditionId(self, newBook: BookData):
-        query = "SELECT id FROM neweditions WHERE new_bookdata=?"
-        params = (newBook.to_dict,)
+        query = "SELECT id FROM neweditions WHERE new_bookdata LIKE ?"
+        args = (
+            newBook.isbn[0] if newBook.isbn and len(newBook.isbn) > 0 else newBook.ppn
+        )
+        params = (f"%{args}%",)
        data = self.query_db(query, params, one=True)
        if data:
            return data[0]
@@ -1897,6 +1912,14 @@ class Database:
        return None

    def insertNewEdition(self, newBook: BookData, oldBookId: int, for_apparat: int):
+        # check if new edition already in table, check based on newBook.ppn
+        check_query = "SELECT id FROM neweditions WHERE new_bookdata LIKE ?"
+        check_params = (f"%{newBook.ppn}%",)
+        data = self.query_db(check_query, check_params, one=True)
+        if data:
+            log.info("New edition already in table, skipping insert")
+            return
+
        query = "INSERT INTO neweditions (new_bookdata, old_edition_id, for_apparat) VALUES (?,?,?)"
        params = (newBook.to_dict, oldBookId, for_apparat)
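(Also for context, not part of the diff: the LIKE-based lookup in getNewEditionId and insertNewEdition matches the serialized bookdata blob on a PPN or ISBN substring. A throwaway demonstration with made-up data:)

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE neweditions (id INTEGER PRIMARY KEY, new_bookdata TEXT)")
con.execute(
    "INSERT INTO neweditions (new_bookdata) VALUES (?)",
    ('{"title": "Example", "isbn": ["9783161484100"], "ppn": "123456789"}',),
)
# the dedup check and id lookup both boil down to:
row = con.execute(
    "SELECT id FROM neweditions WHERE new_bookdata LIKE ?",
    ("%9783161484100%",),
).fetchone()
print(row)  # (1,)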