diff --git a/pyproject.toml b/pyproject.toml index 7bcfe56..860a6d3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,13 +8,26 @@ authors = [ ] requires-python = ">=3.13" dependencies = [ - "beautifulsoup4>=4.14.2", - "cloudscraper>=1.2.71", - "playwright>=1.55.0", "regex>=2025.9.18", +] + +[project.optional-dependencies] +# SRU API feature: for accessing library catalogs via SRU protocol +sru = [ "requests>=2.32.5", ] +# Catalogue feature: web scraping local library catalog +catalogue = [ + "requests>=2.32.5", + "beautifulsoup4>=4.12.0", +] + +# Install all features +all = [ + "bibapi[sru,catalogue]", +] + [build-system] requires = ["uv_build >= 0.9.5, <0.10.0"] build-backend = "uv_build" diff --git a/src/bibapi/__init__.py b/src/bibapi/__init__.py index 0dded92..7f3f30a 100644 --- a/src/bibapi/__init__.py +++ b/src/bibapi/__init__.py @@ -1,5 +1,4 @@ -from enum import Enum - +from .schemas.api_types import * from .sru import Api as _Api __all__ = [ @@ -12,86 +11,6 @@ __all__ = [ ] -class PicaSchema(Enum): - TITLE = "pica.tit" - CALLSIGN = "pica.abr" - ALL = "pica.all" - DATE_FIRST_CREATION = "pica.ser" - DATE_LAST_MODIFIED = "pica.aed" - ISBN = "pica.isb" - ISSN = "pica.isn" - ISMN = "pica.ism" - PPN = "pica.ppn" - AUTHOR = "pica.per" - YEAR = "pica.jhr" - AUTHOR_SCHEMA = "NoSpaceAfterComma" - ENCLOSE_TITLE_IN_QUOTES = False - - -class ALMASchema(Enum): - TITLE = "alma.title" - AUTHOR = "alma.author" - ENCLOSE_TITLE_IN_QUOTES = True - AUTHOR_SCHEMA = "NoSpaceAfterComma" - YEAR = "date_of_publication" - - -class DublinCoreSchema(Enum): - TITLE = "dc.title" - AUTHOR = "dc.creator" - AUTHOR_SCHEMA = "SpaceAfterComma" - ENCLOSE_TITLE_IN_QUOTES = False - YEAR = "dc.date" - - -class CQLSchema(Enum): - pass - - -class SWBSchema(Enum): - URL = "https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml" - ARGSCHEMA = PicaSchema - NAME = "SWB" - LIBRARY_NAME_LOCATION_FIELD = "924$b" - - -class DNBSchema(Enum): - URL = "https://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=MARC21-xml" - ARGSCHEMA = DublinCoreSchema - NAME = "DNB" - - -class KOBVSchema(Enum): - URL = "https://sru.kobv.de/k2?version=1.1&operation=searchRetrieve&query={}&startRecord=1&maximumRecords=100&recordSchema=marcxml" - ARGSCHEMA = DublinCoreSchema - NAME = "KOBV" - LIBRARY_NAME_LOCATION_FIELD = "924$b" - - -class HebisSchema(Enum): - URL = "http://sru.hebis.de/sru/DB=2.1?query={}&version=1.1&operation=searchRetrieve&stylesheet=http%3A%2F%2Fsru.hebis.de%2Fsru%2F%3Fxsl%3DsearchRetrieveResponse&recordSchema=marc21&maximumRecords=100&startRecord=1&recordPacking=xml&sortKeys=LST_Y%2Cpica%2C0%2C%2C" - ARGSCHEMA = PicaSchema - NOTSUPPORTEDARGS = ["YEAR"] - NAME = "HEBIS" - REPLACE = {" ": "+", "&": "%26", "=": "+%3D+"} - LIBRARY_NAME_LOCATION_FIELD = "924$b" - - -class OEVKSchema(Enum): - URL = "https://sru.k10plus.de/opac-de-627-2?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml" - ARGSCHEMA = PicaSchema - NAME = "OEVK" - LIBRARY_NAME_LOCATION_FIELD = "924$b" - - -class HBZSchema(Enum): - URL = "https://eu04.alma.exlibrisgroup.com/view/sru/49HBZ_NETWORK?version=1.2&operation=searchRetrieve&recordSchema=marcxml&query={}&maximumRecords=100&recordSchema=marcxml" - ARGSCHEMA = ALMASchema - NAME = "HBZ" - LIBRARY_NAME_LOCATION_FIELD = "852$a" - NOTSUPPORTEDARGS = ["PPN"] - - class SWB(_Api): def __init__(self): self.site = SWBSchema.NAME.value diff --git a/src/bibapi/catalogue.py b/src/bibapi/catalogue.py index 74aea10..c9babe6 100644 --- a/src/bibapi/catalogue.py +++ b/src/bibapi/catalogue.py @@ -44,7 +44,6 @@ class Catalogue: def get_book(self, searchterm: str): links = self.get_book_links(searchterm) - print(links) for elink in links: result = self.search(elink) # in result search for class col-xs-12 rds-dl RDS_LOCATION @@ -56,12 +55,14 @@ class Catalogue: title = title_el.get_text(strip=True) if title_el else None ppn_el = soup.find( - "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN" + "div", + class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PPN", ) # in ppn_el, get text of div col-xs-12 col-md-7 col-lg-8 rds-dl-panel ppn = ( ppn_el.find_next_sibling( - "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel" + "div", + class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel", ).get_text(strip=True) if ppn_el else None @@ -69,18 +70,21 @@ class Catalogue: # get edition text at div class col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION edition_el = soup.find( - "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION" + "div", + class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_EDITION", ) edition = ( edition_el.find_next_sibling( - "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel" + "div", + class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel", ).get_text(strip=True) if edition_el else None ) authors = soup.find_all( - "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON" + "div", + class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON", ) author = None if authors: @@ -88,7 +92,8 @@ class Catalogue: author_names = [] for author in authors: panel = author.find_next_sibling( - "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel" + "div", + class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel", ) if panel: links = panel.find_all("a") @@ -105,7 +110,7 @@ class Catalogue: groups = [] cur = {} for node in panel.select( - "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space" + "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space", ): classes = node.get("class", []) # Separator between entries @@ -147,16 +152,15 @@ class Catalogue: author=author, edition=edition, ) - else: - return Book( - title=title, - ppn=ppn, - signature=signature, - library_location=loc.split("\n\n")[-1], - link=elink, - author=author, - edition=edition, - ) + return Book( + title=title, + ppn=ppn, + signature=signature, + library_location=loc.split("\n\n")[-1], + link=elink, + author=author, + edition=edition, + ) def get_book_with_data(self, searchterm: str) -> Book | None: book = self.get_book(searchterm) @@ -168,19 +172,18 @@ class Catalogue: # from div col-xs-12 rds-dl RDS_SIGNATURE get signature (second div in this div) signature = None signature_el = soup.find("div", class_="RDS_SIGNATURE") - print(signature_el) if signature_el: signature = signature_el.find("div", class_="rds-dl-panel").get_text( - strip=True + strip=True, ) - print(signature) book.signature = signature # from div col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_ISBN get isbn (second div in this div) isbn = None isbn_el = soup.find("div", class_="RDS_ISBN") if isbn_el: isbn = isbn_el.find_next_sibling( - "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel" + "div", + class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel", ).get_text(strip=True) book.isbn = isbn # from div col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_SCOPE get pages (second div in this div) @@ -188,7 +191,8 @@ class Catalogue: pages_el = soup.find("div", class_="RDS_SCOPE") if pages_el: pages = pages_el.find_next_sibling( - "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel" + "div", + class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel", ).get_text(strip=True) # regex match to get pages by grabbing the first number in the string match = regex.search(r"(\d+)", pages) @@ -210,7 +214,6 @@ class Catalogue: for link in links: result = self.search(link) soup = BeautifulSoup(result, "html.parser") - print(link) ppn = link.split("/")[-1] if ppn and regex.match(r"^\d{8,10}[X\d]?$", ppn): return ppn @@ -239,19 +242,20 @@ class Catalogue: links = self.get_book_links(f"kid:{link}") author = None for link in links: - # print(link) result = self.search(link) soup = BeautifulSoup(result, "html.parser") # get all authors, return them as a string seperated by ; authors = soup.find_all( - "div", class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON" + "div", + class_="col-xs-12 col-md-5 col-lg-4 rds-dl-head RDS_PERSON", ) if authors: # get the names of the a href links in the div col-xs-12 col-md-7 col-lg-8 rds-dl-panel author_names = [] for author in authors: panel = author.find_next_sibling( - "div", class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel" + "div", + class_="col-xs-12 col-md-7 col-lg-8 rds-dl-panel", ) if panel: links = panel.find_all("a") @@ -272,7 +276,7 @@ class Catalogue: groups = [] cur = {} for node in panel.select( - "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space" + "div.rds-dl.RDS_SIGNATURE, div.rds-dl.RDS_STATUS, div.rds-dl.RDS_LOCATION, div.col-xs-12.space", ): classes = node.get("class", []) # Separator between entries @@ -302,14 +306,12 @@ class Catalogue: # Find the signature for the entry whose location mentions "Semesterapparat" for g in groups: - print(g) loc = g.get("location", "").lower() if "semesterapparat" in loc: signature = g.get("signature") return signature - else: - signature = g.get("signature") - return signature + signature = g.get("signature") + return signature print("No signature found") return signature