Compare commits
5 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
cb470c2850 | ||
| 4eb3856c36 | |||
|
97a1becc86
|
|||
|
3a83ef27da
|
|||
|
14f9748957
|
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "bibapi"
|
name = "bibapi"
|
||||||
version = "0.0.5"
|
version = "0.0.6"
|
||||||
description = "Add your description here"
|
description = "Add your description here"
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
authors = [
|
authors = [
|
||||||
@@ -8,19 +8,32 @@ authors = [
|
|||||||
]
|
]
|
||||||
requires-python = ">=3.13"
|
requires-python = ">=3.13"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"beautifulsoup4>=4.14.2",
|
|
||||||
"cloudscraper>=1.2.71",
|
|
||||||
"playwright>=1.55.0",
|
|
||||||
"regex>=2025.9.18",
|
"regex>=2025.9.18",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
# SRU API feature: for accessing library catalogs via SRU protocol
|
||||||
|
sru = [
|
||||||
"requests>=2.32.5",
|
"requests>=2.32.5",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Catalogue feature: web scraping local library catalog
|
||||||
|
catalogue = [
|
||||||
|
"requests>=2.32.5",
|
||||||
|
"beautifulsoup4>=4.12.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Install all features
|
||||||
|
all = [
|
||||||
|
"bibapi[sru,catalogue]",
|
||||||
|
]
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["uv_build >= 0.9.5, <0.10.0"]
|
requires = ["uv_build >= 0.9.5, <0.10.0"]
|
||||||
build-backend = "uv_build"
|
build-backend = "uv_build"
|
||||||
|
|
||||||
[tool.bumpversion]
|
[tool.bumpversion]
|
||||||
current_version = "0.0.5"
|
current_version = "0.0.6"
|
||||||
parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
|
parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
|
||||||
serialize = ["{major}.{minor}.{patch}"]
|
serialize = ["{major}.{minor}.{patch}"]
|
||||||
search = "{current_version}"
|
search = "{current_version}"
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
from enum import Enum
|
from .schemas.api_types import *
|
||||||
|
|
||||||
from .sru import Api as _Api
|
from .sru import Api as _Api
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@@ -12,86 +11,6 @@ __all__ = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
class PicaSchema(Enum):
|
|
||||||
TITLE = "pica.tit"
|
|
||||||
CALLSIGN = "pica.abr"
|
|
||||||
ALL = "pica.all"
|
|
||||||
DATE_FIRST_CREATION = "pica.ser"
|
|
||||||
DATE_LAST_MODIFIED = "pica.aed"
|
|
||||||
ISBN = "pica.isb"
|
|
||||||
ISSN = "pica.isn"
|
|
||||||
ISMN = "pica.ism"
|
|
||||||
PPN = "pica.ppn"
|
|
||||||
AUTHOR = "pica.per"
|
|
||||||
YEAR = "pica.jhr"
|
|
||||||
AUTHOR_SCHEMA = "NoSpaceAfterComma"
|
|
||||||
ENCLOSE_TITLE_IN_QUOTES = False
|
|
||||||
|
|
||||||
|
|
||||||
class ALMASchema(Enum):
|
|
||||||
TITLE = "alma.title"
|
|
||||||
AUTHOR = "alma.author"
|
|
||||||
ENCLOSE_TITLE_IN_QUOTES = True
|
|
||||||
AUTHOR_SCHEMA = "NoSpaceAfterComma"
|
|
||||||
YEAR = "date_of_publication"
|
|
||||||
|
|
||||||
|
|
||||||
class DublinCoreSchema(Enum):
|
|
||||||
TITLE = "dc.title"
|
|
||||||
AUTHOR = "dc.creator"
|
|
||||||
AUTHOR_SCHEMA = "SpaceAfterComma"
|
|
||||||
ENCLOSE_TITLE_IN_QUOTES = False
|
|
||||||
YEAR = "dc.date"
|
|
||||||
|
|
||||||
|
|
||||||
class CQLSchema(Enum):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class SWBSchema(Enum):
|
|
||||||
URL = "https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml"
|
|
||||||
ARGSCHEMA = PicaSchema
|
|
||||||
NAME = "SWB"
|
|
||||||
LIBRARY_NAME_LOCATION_FIELD = "924$b"
|
|
||||||
|
|
||||||
|
|
||||||
class DNBSchema(Enum):
|
|
||||||
URL = "https://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=MARC21-xml"
|
|
||||||
ARGSCHEMA = DublinCoreSchema
|
|
||||||
NAME = "DNB"
|
|
||||||
|
|
||||||
|
|
||||||
class KOBVSchema(Enum):
|
|
||||||
URL = "https://sru.kobv.de/k2?version=1.1&operation=searchRetrieve&query={}&startRecord=1&maximumRecords=100&recordSchema=marcxml"
|
|
||||||
ARGSCHEMA = DublinCoreSchema
|
|
||||||
NAME = "KOBV"
|
|
||||||
LIBRARY_NAME_LOCATION_FIELD = "924$b"
|
|
||||||
|
|
||||||
|
|
||||||
class HebisSchema(Enum):
|
|
||||||
URL = "http://sru.hebis.de/sru/DB=2.1?query={}&version=1.1&operation=searchRetrieve&stylesheet=http%3A%2F%2Fsru.hebis.de%2Fsru%2F%3Fxsl%3DsearchRetrieveResponse&recordSchema=marc21&maximumRecords=100&startRecord=1&recordPacking=xml&sortKeys=LST_Y%2Cpica%2C0%2C%2C"
|
|
||||||
ARGSCHEMA = PicaSchema
|
|
||||||
NOTSUPPORTEDARGS = ["YEAR"]
|
|
||||||
NAME = "HEBIS"
|
|
||||||
REPLACE = {" ": "+", "&": "%26", "=": "+%3D+"}
|
|
||||||
LIBRARY_NAME_LOCATION_FIELD = "924$b"
|
|
||||||
|
|
||||||
|
|
||||||
class OEVKSchema(Enum):
|
|
||||||
URL = "https://sru.k10plus.de/opac-de-627-2?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml"
|
|
||||||
ARGSCHEMA = PicaSchema
|
|
||||||
NAME = "OEVK"
|
|
||||||
LIBRARY_NAME_LOCATION_FIELD = "924$b"
|
|
||||||
|
|
||||||
|
|
||||||
class HBZSchema(Enum):
|
|
||||||
URL = "https://eu04.alma.exlibrisgroup.com/view/sru/49HBZ_NETWORK?version=1.2&operation=searchRetrieve&recordSchema=marcxml&query={}&maximumRecords=100&recordSchema=marcxml"
|
|
||||||
ARGSCHEMA = ALMASchema
|
|
||||||
NAME = "HBZ"
|
|
||||||
LIBRARY_NAME_LOCATION_FIELD = "852$a"
|
|
||||||
NOTSUPPORTEDARGS = ["PPN"]
|
|
||||||
|
|
||||||
|
|
||||||
class SWB(_Api):
|
class SWB(_Api):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.site = SWBSchema.NAME.value
|
self.site = SWBSchema.NAME.value
|
||||||
|
|||||||
@@ -44,7 +44,6 @@ class Catalogue:
|
|||||||
|
|
||||||
def get_book(self, searchterm: str):
|
def get_book(self, searchterm: str):
|
||||||
links = self.get_book_links(searchterm)
|
links = self.get_book_links(searchterm)
|
||||||
print(links)
|
|
||||||
for elink in links:
|
for elink in links:
|
||||||
result = self.search(elink)
|
result = self.search(elink)
|
||||||
# in result search for class col-xs-12 rds-dl RDS_LOCATION
|
# in result search for class col-xs-12 rds-dl RDS_LOCATION
|
||||||
@@ -56,12 +55,14 @@ class Catalogue:
|
|||||||
title = title_el.get_text(strip=True) if title_el else None
|
title = title_el.get_text(strip=True) if title_el else None
|
||||||
|
|
||||||
ppn_el = soup.find(
|
ppn_el = soup.find(
|
||||||
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN",
|
||||||
)
|
)
|
||||||
# in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
|
# in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
|
||||||
ppn = (
|
ppn = (
|
||||||
ppn_el.find_next_sibling(
|
ppn_el.find_next_sibling(
|
||||||
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
|
||||||
).get_text(strip=True)
|
).get_text(strip=True)
|
||||||
if ppn_el
|
if ppn_el
|
||||||
else None
|
else None
|
||||||
@@ -69,18 +70,21 @@ class Catalogue:
|
|||||||
|
|
||||||
# get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION
|
# get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION
|
||||||
edition_el = soup.find(
|
edition_el = soup.find(
|
||||||
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION",
|
||||||
)
|
)
|
||||||
edition = (
|
edition = (
|
||||||
edition_el.find_next_sibling(
|
edition_el.find_next_sibling(
|
||||||
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
|
||||||
).get_text(strip=True)
|
).get_text(strip=True)
|
||||||
if edition_el
|
if edition_el
|
||||||
else None
|
else None
|
||||||
)
|
)
|
||||||
|
|
||||||
authors = soup.find_all(
|
authors = soup.find_all(
|
||||||
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON",
|
||||||
)
|
)
|
||||||
author = None
|
author = None
|
||||||
if authors:
|
if authors:
|
||||||
@@ -88,7 +92,8 @@ class Catalogue:
|
|||||||
author_names = []
|
author_names = []
|
||||||
for author in authors:
|
for author in authors:
|
||||||
panel = author.find_next_sibling(
|
panel = author.find_next_sibling(
|
||||||
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
|
||||||
)
|
)
|
||||||
if panel:
|
if panel:
|
||||||
links = panel.find_all("a")
|
links = panel.find_all("a")
|
||||||
@@ -105,7 +110,7 @@ class Catalogue:
|
|||||||
groups = []
|
groups = []
|
||||||
cur = {}
|
cur = {}
|
||||||
for node in panel.select(
|
for node in panel.select(
|
||||||
"div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
|
"div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space",
|
||||||
):
|
):
|
||||||
classes = node.get("class", [])
|
classes = node.get("class", [])
|
||||||
# Separator between entries
|
# Separator between entries
|
||||||
@@ -147,16 +152,15 @@ class Catalogue:
|
|||||||
author=author,
|
author=author,
|
||||||
edition=edition,
|
edition=edition,
|
||||||
)
|
)
|
||||||
else:
|
return Book(
|
||||||
return Book(
|
title=title,
|
||||||
title=title,
|
ppn=ppn,
|
||||||
ppn=ppn,
|
signature=signature,
|
||||||
signature=signature,
|
library_location=loc.split("\n\n")[-1],
|
||||||
library_location=loc.split("\n\n")[-1],
|
link=elink,
|
||||||
link=elink,
|
author=author,
|
||||||
author=author,
|
edition=edition,
|
||||||
edition=edition,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
def get_book_with_data(self, searchterm: str) -> Book | None:
|
def get_book_with_data(self, searchterm: str) -> Book | None:
|
||||||
book = self.get_book(searchterm)
|
book = self.get_book(searchterm)
|
||||||
@@ -168,19 +172,18 @@ class Catalogue:
|
|||||||
# from div col-xs-12 rds-dl RDS_SIGNATURE get signature (second div in this div)
|
# from div col-xs-12 rds-dl RDS_SIGNATURE get signature (second div in this div)
|
||||||
signature = None
|
signature = None
|
||||||
signature_el = soup.find("div", class_="RDS_SIGNATURE")
|
signature_el = soup.find("div", class_="RDS_SIGNATURE")
|
||||||
print(signature_el)
|
|
||||||
if signature_el:
|
if signature_el:
|
||||||
signature = signature_el.find("div", class_="rds-dl-panel").get_text(
|
signature = signature_el.find("div", class_="rds-dl-panel").get_text(
|
||||||
strip=True
|
strip=True,
|
||||||
)
|
)
|
||||||
print(signature)
|
|
||||||
book.signature = signature
|
book.signature = signature
|
||||||
# from div col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN get isbn (second div in this div)
|
# from div col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN get isbn (second div in this div)
|
||||||
isbn = None
|
isbn = None
|
||||||
isbn_el = soup.find("div", class_="RDS_ISBN")
|
isbn_el = soup.find("div", class_="RDS_ISBN")
|
||||||
if isbn_el:
|
if isbn_el:
|
||||||
isbn = isbn_el.find_next_sibling(
|
isbn = isbn_el.find_next_sibling(
|
||||||
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
|
||||||
).get_text(strip=True)
|
).get_text(strip=True)
|
||||||
book.isbn = isbn
|
book.isbn = isbn
|
||||||
# from div col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_SCOPE get pages (second div in this div)
|
# from div col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_SCOPE get pages (second div in this div)
|
||||||
@@ -188,7 +191,8 @@ class Catalogue:
|
|||||||
pages_el = soup.find("div", class_="RDS_SCOPE")
|
pages_el = soup.find("div", class_="RDS_SCOPE")
|
||||||
if pages_el:
|
if pages_el:
|
||||||
pages = pages_el.find_next_sibling(
|
pages = pages_el.find_next_sibling(
|
||||||
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
|
||||||
).get_text(strip=True)
|
).get_text(strip=True)
|
||||||
# regex match to get pages by grabbing the first number in the string
|
# regex match to get pages by grabbing the first number in the string
|
||||||
match = regex.search(r"(\d+)", pages)
|
match = regex.search(r"(\d+)", pages)
|
||||||
@@ -210,7 +214,6 @@ class Catalogue:
|
|||||||
for link in links:
|
for link in links:
|
||||||
result = self.search(link)
|
result = self.search(link)
|
||||||
soup = BeautifulSoup(result, "html.parser")
|
soup = BeautifulSoup(result, "html.parser")
|
||||||
print(link)
|
|
||||||
ppn = link.split("/")[-1]
|
ppn = link.split("/")[-1]
|
||||||
if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
|
if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn):
|
||||||
return ppn
|
return ppn
|
||||||
@@ -239,19 +242,20 @@ class Catalogue:
|
|||||||
links = self.get_book_links(f"kid:{link}")
|
links = self.get_book_links(f"kid:{link}")
|
||||||
author = None
|
author = None
|
||||||
for link in links:
|
for link in links:
|
||||||
# print(link)
|
|
||||||
result = self.search(link)
|
result = self.search(link)
|
||||||
soup = BeautifulSoup(result, "html.parser")
|
soup = BeautifulSoup(result, "html.parser")
|
||||||
# get all authors, return them as a string separated by ;
|
# get all authors, return them as a string separated by ;
|
||||||
authors = soup.find_all(
|
authors = soup.find_all(
|
||||||
"div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON",
|
||||||
)
|
)
|
||||||
if authors:
|
if authors:
|
||||||
# get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
|
# get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel
|
||||||
author_names = []
|
author_names = []
|
||||||
for author in authors:
|
for author in authors:
|
||||||
panel = author.find_next_sibling(
|
panel = author.find_next_sibling(
|
||||||
"div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel"
|
"div",
|
||||||
|
class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel",
|
||||||
)
|
)
|
||||||
if panel:
|
if panel:
|
||||||
links = panel.find_all("a")
|
links = panel.find_all("a")
|
||||||
@@ -272,7 +276,7 @@ class Catalogue:
|
|||||||
groups = []
|
groups = []
|
||||||
cur = {}
|
cur = {}
|
||||||
for node in panel.select(
|
for node in panel.select(
|
||||||
"div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space"
|
"div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space",
|
||||||
):
|
):
|
||||||
classes = node.get("class", [])
|
classes = node.get("class", [])
|
||||||
# Separator between entries
|
# Separator between entries
|
||||||
@@ -302,14 +306,12 @@ class Catalogue:
|
|||||||
|
|
||||||
# Find the signature for the entry whose location mentions "Semesterapparat"
|
# Find the signature for the entry whose location mentions "Semesterapparat"
|
||||||
for g in groups:
|
for g in groups:
|
||||||
print(g)
|
|
||||||
loc = g.get("location", "").lower()
|
loc = g.get("location", "").lower()
|
||||||
if "semesterapparat" in loc:
|
if "semesterapparat" in loc:
|
||||||
signature = g.get("signature")
|
signature = g.get("signature")
|
||||||
return signature
|
return signature
|
||||||
else:
|
signature = g.get("signature")
|
||||||
signature = g.get("signature")
|
return signature
|
||||||
return signature
|
|
||||||
print("No signature found")
|
print("No signature found")
|
||||||
return signature
|
return signature
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user