13 Commits

Author SHA1 Message Date
34939474b4 chore(deps): update python docker tag to v3.14
Some checks failed: typecheck (pull_request) failing after 2m59s
2025-11-29 00:02:29 +00:00
9556588d9d Merge pull request 'Configure Renovate' (#11) from renovate/configure into main
Reviewed-on: #11
2025-11-28 07:58:02 +00:00
14ec61d209 Add renovate.json
Some checks failed: typecheck (pull_request) failing after 51s
2025-11-27 17:27:14 +00:00
Gitea CI cb470c2850 Bump version: 0.0.5 → 0.0.6 2025-11-25 09:13:33 +00:00
4eb3856c36 Merge pull request 'dev' (#10) from dev into main
Reviewed-on: #10
2025-11-25 09:13:04 +00:00
97a1becc86 chore: remove debug print statements
Some checks failed: typecheck (pull_request) failing after 1m1s
2025-11-25 10:11:06 +01:00
3a83ef27da move schemas to schema folder 2025-11-21 09:50:09 +01:00
14f9748957 feat: rework pyproject to allow installing only parts 2025-11-21 09:46:38 +01:00
Gitea CI c8b3590355 Bump version: 0.0.4 → 0.0.5 2025-11-19 13:50:08 +00:00
cf8ec8b07e Merge pull request 'feat: get additional data from catalogue:' (#9) from dev into main
Reviewed-on: #9
2025-11-19 13:49:16 +00:00
d74b94b769 feat: get additional data from catalogue:
- signature
- isbn (bit broken rn)
- pages (only for print books)
Some checks failed: typecheck (pull_request) failing after 37s
2025-11-19 14:48:42 +01:00
Gitea CI 2d08c2959a Bump version: 0.0.3 → 0.0.4
Some checks failed: continuous-integration/drone/push, build is failing
2025-11-13 20:20:31 +00:00
5da3050da6 Update pyproject.toml
Some checks failed: continuous-integration/drone/push, build is failing
2025-11-13 09:58:44 +00:00
6 changed files with 97 additions and 118 deletions

View File

@@ -8,7 +8,7 @@ trigger:
 
 steps:
   - name: setup+deps
-    image: python:3.12-slim
+    image: python:3.14-slim
     environment:
       UV_NO_SYNC_PROGRESS: "1"
     commands:
@@ -32,7 +32,7 @@ steps:
       - uv pip install pytest pytest-cov mypy ruff
 
   - name: lint-typecheck
-    image: python:3.12-slim
+    image: python:3.14-slim
    commands:
       - export PATH="$HOME/.local/bin:$PATH"
       - . .venv/bin/activate
@@ -40,7 +40,7 @@ steps:
       - mypy --ignore-missing-imports .
 
   - name: test
-    image: python:3.12-slim
+    image: python:3.14-slim
     commands:
       - export PATH="$HOME/.local/bin:$PATH"
       - . .venv/bin/activate
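
All three steps move from python:3.12-slim to python:3.14-slim in one go. A rough sketch for reproducing the currently failing lint-typecheck step locally, assuming Docker is available and that the setup+deps step (whose remaining commands are elided above) has already created .venv:

    docker run --rm -it -v "$PWD:/src" -w /src python:3.14-slim bash
    # inside the container, mirroring the lint-typecheck commands above:
    export PATH="$HOME/.local/bin:$PATH"
    . .venv/bin/activate
    mypy --ignore-missing-imports .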

View File

@@ -1 +1 @@
-3.13
+3.14

View File

@@ -1,6 +1,6 @@
 [project]
 name = "bibapi"
-version = "0.1.0"
+version = "0.0.6"
 description = "Add your description here"
 readme = "README.md"
 authors = [
@@ -8,19 +8,32 @@ authors = [
 ]
 requires-python = ">=3.13"
 dependencies = [
-    "beautifulsoup4>=4.14.2",
-    "cloudscraper>=1.2.71",
-    "playwright>=1.55.0",
     "regex>=2025.9.18",
+]
+
+[project.optional-dependencies]
+# SRU API feature: for accessing library catalogs via SRU protocol
+sru = [
     "requests>=2.32.5",
 ]
+
+# Catalogue feature: web scraping local library catalog
+catalogue = [
+    "requests>=2.32.5",
+    "beautifulsoup4>=4.12.0",
+]
+
+# Install all features
+all = [
+    "bibapi[sru,catalogue]",
+]
 
 [build-system]
 requires = ["uv_build >= 0.9.5, <0.10.0"]
 build-backend = "uv_build"
 
 [tool.bumpversion]
-current_version = "0.0.3"
+current_version = "0.0.6"
 parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
 serialize = ["{major}.{minor}.{patch}"]
 search = "{current_version}"

renovate.json (new file, 3 additions)
View File

@@ -0,0 +1,3 @@
+{
+  "$schema": "https://docs.renovatebot.com/renovate-schema.json"
+}
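
As committed, the file only declares the JSON schema, so Renovate runs with its built-in defaults and no preset. A hypothetical next step (not part of this PR) would be to extend the recommended preset:

    {
      "$schema": "https://docs.renovatebot.com/renovate-schema.json",
      "extends": ["config:recommended"]
    }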

View File

@@ -1,5 +1,4 @@
-from enum import Enum
-
+from .schemas.api_types import *
 from .sru import Api as _Api
 
 __all__ = [
@@ -12,86 +11,6 @@ __all__ = [
 ]
 
-class PicaSchema(Enum):
-    TITLE = "pica.tit"
-    CALLSIGN = "pica.abr"
-    ALL = "pica.all"
-    DATE_FIRST_CREATION = "pica.ser"
-    DATE_LAST_MODIFIED = "pica.aed"
-    ISBN = "pica.isb"
-    ISSN = "pica.isn"
-    ISMN = "pica.ism"
-    PPN = "pica.ppn"
-    AUTHOR = "pica.per"
-    YEAR = "pica.jhr"
-    AUTHOR_SCHEMA = "NoSpaceAfterComma"
-    ENCLOSE_TITLE_IN_QUOTES = False
-
-class ALMASchema(Enum):
-    TITLE = "alma.title"
-    AUTHOR = "alma.author"
-    ENCLOSE_TITLE_IN_QUOTES = True
-    AUTHOR_SCHEMA = "NoSpaceAfterComma"
-    YEAR = "date_of_publication"
-
-class DublinCoreSchema(Enum):
-    TITLE = "dc.title"
-    AUTHOR = "dc.creator"
-    AUTHOR_SCHEMA = "SpaceAfterComma"
-    ENCLOSE_TITLE_IN_QUOTES = False
-    YEAR = "dc.date"
-
-class CQLSchema(Enum):
-    pass
-
-class SWBSchema(Enum):
-    URL = "https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml"
-    ARGSCHEMA = PicaSchema
-    NAME = "SWB"
-    LIBRARY_NAME_LOCATION_FIELD = "924$b"
-
-class DNBSchema(Enum):
-    URL = "https://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=MARC21-xml"
-    ARGSCHEMA = DublinCoreSchema
-    NAME = "DNB"
-
-class KOBVSchema(Enum):
-    URL = "https://sru.kobv.de/k2?version=1.1&operation=searchRetrieve&query={}&startRecord=1&maximumRecords=100&recordSchema=marcxml"
-    ARGSCHEMA = DublinCoreSchema
-    NAME = "KOBV"
-    LIBRARY_NAME_LOCATION_FIELD = "924$b"
-
-class HebisSchema(Enum):
-    URL = "http://sru.hebis.de/sru/DB=2.1?query={}&version=1.1&operation=searchRetrieve&stylesheet=http%3A%2F%2Fsru.hebis.de%2Fsru%2F%3Fxsl%3DsearchRetrieveResponse&recordSchema=marc21&maximumRecords=100&startRecord=1&recordPacking=xml&sortKeys=LST_Y%2Cpica%2C0%2C%2C"
-    ARGSCHEMA = PicaSchema
-    NOTSUPPORTEDARGS = ["YEAR"]
-    NAME = "HEBIS"
-    REPLACE = {" ": "+", "&": "%26", "=": "+%3D+"}
-    LIBRARY_NAME_LOCATION_FIELD = "924$b"
-
-class OEVKSchema(Enum):
-    URL = "https://sru.k10plus.de/opac-de-627-2?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml"
-    ARGSCHEMA = PicaSchema
-    NAME = "OEVK"
-    LIBRARY_NAME_LOCATION_FIELD = "924$b"
-
-class HBZSchema(Enum):
-    URL = "https://eu04.alma.exlibrisgroup.com/view/sru/49HBZ_NETWORK?version=1.2&operation=searchRetrieve&recordSchema=marcxml&query={}&maximumRecords=100&recordSchema=marcxml"
-    ARGSCHEMA = ALMASchema
-    NAME = "HBZ"
-    LIBRARY_NAME_LOCATION_FIELD = "852$a"
-    NOTSUPPORTEDARGS = ["PPN"]
-
 class SWB(_Api):
     def __init__(self):
         self.site = SWBSchema.NAME.value
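
The star import keeps this module's public names stable while the Enum definitions move out. A minimal sketch of the assumed new layout; the target file schemas/api_types.py is implied by the import line but is not part of this diff:

    # schemas/api_types.py (assumed): the classes deleted above live here unchanged.
    from enum import Enum

    class PicaSchema(Enum):
        TITLE = "pica.tit"
        ISBN = "pica.isb"
        # ...remaining members exactly as in the removed block

Existing call sites keep working because "from .schemas.api_types import *" re-binds the same class names here, provided api_types does not hide them behind a restrictive __all__.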

View File

@@ -4,6 +4,8 @@ import regex
 import requests
 from bs4 import BeautifulSoup
 
+from .schemas.bookdata import BookData as Book
+
 URL = "https://rds.ibs-bw.de/phfreiburg/opac/RDSIndex/Search?type0%5B%5D=allfields&lookfor0%5B%5D={}&join=AND&bool0%5B%5D=AND&type0%5B%5D=au&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ti&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ct&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=isn&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=ta&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=co&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=py&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pp&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=pu&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=si&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=zr&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND&type0%5B%5D=cc&lookfor0%5B%5D=&join=AND&bool0%5B%5D=AND"
 BASE = "https://rds.ibs-bw.de"
@@ -42,7 +44,6 @@ class Catalogue:
 
     def get_book(self, searchterm: str):
         links = self.get_book_links(searchterm)
-        print(links)
        for elink in links:
             result = self.search(elink)
             # in result search for class col-xs-12 rds-dl RDS_LOCATION
@@ -54,12 +55,14 @@ class Catalogue:
         title = title_el.get_text(strip=True) if title_el else None
         ppn_el = soup.find(
-            "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
+            "div",
+            class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN",
         )
         # in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
         ppn = (
             ppn_el.find_next_sibling(
-                "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                "div",
+                class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
             ).get_text(strip=True)
             if ppn_el
             else None
         )
@@ -67,18 +70,21 @@ class Catalogue:
         # get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION
         edition_el = soup.find(
-            "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
+            "div",
+            class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION",
         )
         edition = (
             edition_el.find_next_sibling(
-                "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                "div",
+                class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
             ).get_text(strip=True)
             if edition_el
             else None
         )
         authors = soup.find_all(
-            "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
+            "div",
+            class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON",
         )
         author = None
         if authors:
@@ -86,7 +92,8 @@ class Catalogue:
             author_names = []
             for author in authors:
                 panel = author.find_next_sibling(
-                    "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                    "div",
+                    class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
                 )
                 if panel:
                     links = panel.find_all("a")
@@ -103,7 +110,7 @@ class Catalogue:
             groups = []
             cur = {}
             for node in panel.select(
-                "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+                "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space",
             ):
                 classes = node.get("class", [])
                 # Separator between entries
@@ -145,7 +152,6 @@ class Catalogue:
                 author=author,
                 edition=edition,
             )
-        else:
         return Book(
             title=title,
             ppn=ppn,
@@ -156,6 +162,46 @@ class Catalogue:
             edition=edition,
         )
 
+    def get_book_with_data(self, searchterm: str) -> Book | None:
+        book = self.get_book(searchterm)
+        if book:
+            # request data from book.link and parse for additional data
+            result = self.search(book.link)
+            soup = BeautifulSoup(result, "html.parser")
+            # from div col-xs-12 rds-dl RDS_SIGNATURE get signature (second div in this div)
+            signature = None
+            signature_el = soup.find("div", class_="RDS_SIGNATURE")
+            if signature_el:
+                signature = signature_el.find("div", class_="rds-dl-panel").get_text(
+                    strip=True,
+                )
+            book.signature = signature
+            # from div col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN get isbn (second div in this div)
+            isbn = None
+            isbn_el = soup.find("div", class_="RDS_ISBN")
+            if isbn_el:
+                isbn = isbn_el.find_next_sibling(
+                    "div",
+                    class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
+                ).get_text(strip=True)
+            book.isbn = isbn
+            # from div col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_SCOPE get pages (second div in this div)
+            pages = None
+            pages_el = soup.find("div", class_="RDS_SCOPE")
+            if pages_el:
+                pages = pages_el.find_next_sibling(
+                    "div",
+                    class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
+                ).get_text(strip=True)
+                # regex match to get pages by grabbing the first number in the string
+                match = regex.search(r"(\d+)", pages)
+                if match:
+                    pages = match.group(1)
+            book.pages = pages
+            return book
+        return None
+
     def get(self, ppn: str) -> Book | None:
         # based on PPN, get title, people, edition, year, language, pages, isbn,
         link = f"https://rds.ibs-bw.de/phfreiburg/opac/RDSIndexrecord/{ppn}"
@@ -168,7 +214,6 @@ class Catalogue:
         for link in links:
             result = self.search(link)
             soup = BeautifulSoup(result, "html.parser")
-            print(link)
             ppn = link.split("/")[-1]
             if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
                 return ppn
@@ -197,19 +242,20 @@ class Catalogue:
         links = self.get_book_links(f"kid:{link}")
         author = None
         for link in links:
-            # print(link)
             result = self.search(link)
             soup = BeautifulSoup(result, "html.parser")
             # get all authors, return them as a string seperated by ;
             authors = soup.find_all(
-                "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
+                "div",
+                class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON",
             )
             if authors:
                 # get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
                 author_names = []
                 for author in authors:
                     panel = author.find_next_sibling(
-                        "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
+                        "div",
+                        class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
                     )
                     if panel:
                         links = panel.find_all("a")
@@ -230,7 +276,7 @@ class Catalogue:
             groups = []
             cur = {}
             for node in panel.select(
-                "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
+                "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space",
             ):
                 classes = node.get("class", [])
                 # Separator between entries
@@ -260,12 +306,10 @@ class Catalogue:
         # Find the signature for the entry whose location mentions "Semesterapparat"
         for g in groups:
-            print(g)
             loc = g.get("location", "").lower()
             if "semesterapparat" in loc:
                 signature = g.get("signature")
                 return signature
-        else:
         signature = g.get("signature")
         return signature
         print("No signature found")