From 4d8c5da4f525a5abdb3a166bb320b962652cdb05 Mon Sep 17 00:00:00 2001 From: WorldTeacher Date: Tue, 11 Nov 2025 14:00:42 +0100 Subject: [PATCH] feat: - add retries, timeouts - add max time of 30 seconds - add library identifier parsing add more keys to picaschema add keys to alma, dc shema - update querytransformer to format author, title based on API requirements - update Api class to drop not supported arguments before the query is created - add empty returns for results which raised timeout exceptions chore: - remove unneeded ArgumentSchema enum --- src/bibapi/sru.py | 164 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 131 insertions(+), 33 deletions(-) diff --git a/src/bibapi/sru.py b/src/bibapi/sru.py index fab50e3..a9415cf 100644 --- a/src/bibapi/sru.py +++ b/src/bibapi/sru.py @@ -1,4 +1,5 @@ import re +import time import xml.etree.ElementTree as ET from enum import Enum from typing import Any, Dict, Iterable, List, Optional, Tuple, Type, Union @@ -77,6 +78,7 @@ def parse_record(zs_record_el: ET.Element) -> Record: if marc_record_el is None: # If the MARC record uses default ns (xmlns="...") ElementTree still needs the ns-qualified name # We already searched with prefix; this covers both default and prefixed cases. + return None raise ValueError("Missing MARC21 record inside zs:recordData") marc_record = parse_marc_record(marc_record_el) @@ -129,7 +131,9 @@ def parse_search_retrieve_response( records: List[Record] = [] if records_parent is not None: for r in records_parent.findall("zs:record", NS): - records.append(parse_record(r)) + record = parse_record(r) + if record is not None: + records.append(record) echoed = parse_echoed_request(root) @@ -323,13 +327,15 @@ def subfield_value_pairs_from_fields( return out -def book_from_marc(rec: MarcRecord) -> BookData: +def book_from_marc(rec: MarcRecord, library_identifier: str) -> BookData: + library_ident_tag = library_identifier.split("$")[0] + library_ident_code = library_identifier.split("$")[1] # PPN from controlfield 001 ppn = controlfield_value(rec, "001") # Title = 245 $a + 245 $b (if present) t_a = first_subfield_value(rec, "245", "a") - t_b = first_subfield_value(rec, "245", "b") + t_b = "" # first_subfield_value(rec, "245", "b") title = _smart_join_title(t_a, t_b) if t_a else None # Signature = 924 where $9 == "Frei 129" → take that field's $g @@ -349,6 +355,7 @@ def book_from_marc(rec: MarcRecord) -> BookData: author = None if authors: author = "; ".join(authors) + libraries = subfield_values(rec, library_ident_tag, library_ident_code) return BookData( ppn=ppn, @@ -363,6 +370,7 @@ def book_from_marc(rec: MarcRecord) -> BookData: link="", author=author, media_type=mediatype, + libraries=libraries, ) @@ -377,14 +385,25 @@ class PicaSchema(Enum): ISMN = "pica.ism" PPN = "pica.ppn" AUTHOR = "pica.per" + YEAR = "pica.jhr" + AUTHOR_SCHEMA = "NoSpaceAfterComma" + ENCLOSE_TITLE_IN_QUOTES = False class ALMASchema(Enum): - pass + TITLE = "alma.title" + AUTHOR = "alma.author" + ENCLOSE_TITLE_IN_QUOTES = True + AUTHOR_SCHEMA = "NoSpaceAfterComma" + YEAR = "date_of_publication" class DublinCoreSchema(Enum): - pass + TITLE = "dc.title" + AUTHOR = "dc.creator" + AUTHOR_SCHEMA = "SpaceAfterComma" + ENCLOSE_TITLE_IN_QUOTES = False + YEAR = "dc.date" class CQLSchema(Enum): @@ -395,6 +414,7 @@ class SWBSchema(Enum): URL = "https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml" ARGSCHEMA = PicaSchema NAME = "SWB" + LIBRARY_NAME_LOCATION_FIELD = "924$b" class DNBSchema(Enum): @@ -407,32 +427,30 @@ class KOBVSchema(Enum): URL = "https://sru.kobv.de/k2?version=1.1&operation=searchRetrieve&query={}&startRecord=1&maximumRecords=100&recordSchema=marcxml" ARGSCHEMA = DublinCoreSchema NAME = "KOBV" + LIBRARY_NAME_LOCATION_FIELD = "924$b" class HebisSchema(Enum): URL = "http://sru.hebis.de/sru/DB=2.1?query={}&version=1.1&operation=searchRetrieve&stylesheet=http%3A%2F%2Fsru.hebis.de%2Fsru%2F%3Fxsl%3DsearchRetrieveResponse&recordSchema=marc21&maximumRecords=100&startRecord=1&recordPacking=xml&sortKeys=LST_Y%2Cpica%2C0%2C%2C" ARGSCHEMA = PicaSchema + NOTSUPPORTEDARGS = ["YEAR"] NAME = "HEBIS" REPLACE = {" ": "+", "&": "%26", "=": "+%3D+"} + LIBRARY_NAME_LOCATION_FIELD = "924$b" class OEVKSchema(Enum): URL = "https://sru.k10plus.de/opac-de-627-2?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml" ARGSCHEMA = PicaSchema NAME = "OEVK" + LIBRARY_NAME_LOCATION_FIELD = "924$b" class HBZSchema(Enum): URL = "https://eu04.alma.exlibrisgroup.com/view/sru/49HBZ_NETWORK?version=1.2&operation=searchRetrieve&recordSchema=marcxml&query={}&maximumRecords=100&recordSchema=marcxml" ARGSCHEMA = ALMASchema NAME = "HBZ" - - -class ArgumentSchema(Enum): - TITLE = ( - "title", - "tit", - ) + LIBRARY_NAME_LOCATION_FIELD = "852$a" RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative char set typically seen in RVK notations @@ -538,25 +556,38 @@ def find_newer_edition( class QueryTransformer: - def __init__( - self, api_schema: Type[PicaSchema], arguments: Union[Iterable[str], str] - ): + def __init__(self, api_schema: Type[Enum], arguments: Union[Iterable[str], str]): self.api_schema = api_schema if isinstance(arguments, str): self.arguments = [arguments] else: self.arguments = arguments + self.drop_empty = True def transform(self) -> Dict[str, Any]: arguments: List[str] = [] schema = self.api_schema - print(schema.TITLE.name) for arg in self.arguments: if "=" not in arg: continue + if self.drop_empty and arg.endswith("="): + continue key, value = arg.split("=", 1) if hasattr(schema, key.upper()): api_key = getattr(schema, key.upper()).value + if key.upper() == "AUTHOR" and hasattr(schema, "AUTHOR_SCHEMA"): + author_schema = getattr(schema, "AUTHOR_SCHEMA").value + if author_schema == "SpaceAfterComma": + value = value.replace(",", ", ") + elif author_schema == "NoSpaceAfterComma": + value = value.replace(", ", ",") + value = value.replace(" ", " ") + if key.upper() == "TITLE" and hasattr( + schema, "ENCLOSE_TITLE_IN_QUOTES" + ): + if getattr(schema, "ENCLOSE_TITLE_IN_QUOTES"): + value = f'"{value}"' + arguments.append(f"{api_key}={value}") return arguments @@ -564,18 +595,33 @@ class QueryTransformer: class Api: def __init__( - self, site: str, url: str, prefix: str, replace: Optional[Dict[str, str]] = None + self, + site: str, + url: str, + prefix: Type[Enum], + library_identifier: str, + notsupported_args: Optional[List[str]] = None, + replace: Optional[Dict[str, str]] = None, ): self.site = site self.url = url self.prefix = prefix self.replace = replace or {} + self.library_identifier = library_identifier + self.notsupported_args = notsupported_args or [] # Reuse TCP connections across requests for better performance self._session = requests.Session() # Slightly larger connection pool for concurrent calls adapter = HTTPAdapter(pool_connections=10, pool_maxsize=20) self._session.mount("http://", adapter) self._session.mount("https://", adapter) + # Rate limiting and retry config + self._last_request_time: float = 0.0 + self._rate_limit_seconds: float = 1.0 + self._max_retries: int = 5 + self._overall_timeout_seconds: float = ( + 30.0 # max total time per logical request + ) def close(self): try: @@ -588,36 +634,88 @@ class Api: self.close() def get(self, query_args: Union[Iterable[str], str]) -> List[Record]: + start_time = time.monotonic() # if any query_arg ends with =, remove it if isinstance(query_args, str): query_args = [query_args] - if self.site == "DNB": - args = [arg for arg in query_args if not arg.startswith("pica.")] - if args == []: - raise ValueError("DNB queries must include at least one search term") - query_args = args - # query_args = [f"{self.prefix}{arg}" for arg in query_args] + if self.notsupported_args: + query_args = [ + qa + for qa in query_args + if not any(qa.startswith(na + "=") for na in self.notsupported_args) + ] + query_args = QueryTransformer( + api_schema=self.prefix, arguments=query_args + ).transform() query = "+and+".join(query_args) for old, new in self.replace.items(): query = query.replace(old, new) url = self.url.format(query) - print(url) headers = { "User-Agent": f"{self.site} SRU Client, ", "Accept": "application/xml", "Accept-Charset": "latin1,utf-8;q=0.7,*;q=0.3", } - # Use persistent session and set timeouts to avoid hanging - resp = self._session.get(url, headers=headers, timeout=(3.05, 60)) - if resp.status_code != 200: - raise Exception(f"Error fetching data from SWB: {resp.status_code}") - # Parse using raw bytes (original behavior) to preserve encoding edge cases - sr = parse_search_retrieve_response(resp.content) - return sr.records + # Use persistent session, enforce 1 req/sec, and retry up to 5 times + last_error: Optional[Exception] = None + for attempt in range(1, self._max_retries + 1): + # Abort if overall timeout exceeded before starting attempt + if time.monotonic() - start_time > self._overall_timeout_seconds: + last_error = requests.exceptions.Timeout( + f"Overall timeout {self._overall_timeout_seconds}s exceeded before attempt {attempt}" + ) + break + # Enforce rate limit relative to last request end + now = time.monotonic() + elapsed = now - self._last_request_time + if elapsed < self._rate_limit_seconds: + time.sleep(self._rate_limit_seconds - elapsed) + + try: + # Per-attempt read timeout capped at remaining overall budget (but at most 30s) + remaining = max( + 0.0, self._overall_timeout_seconds - (time.monotonic() - start_time) + ) + read_timeout = min(30.0, remaining if remaining > 0 else 0.001) + resp = self._session.get( + url, headers=headers, timeout=(3.05, read_timeout) + ) + self._last_request_time = time.monotonic() + if resp.status_code == 200: + # Parse using raw bytes (original behavior) to preserve encoding edge cases + sr = parse_search_retrieve_response(resp.content) + return sr.records + else: + last_error = Exception( + f"Error fetching data from {self.site}: HTTP {resp.status_code} (attempt {attempt}/{self._max_retries})" + ) + except requests.exceptions.ReadTimeout as e: + last_error = e + except requests.exceptions.Timeout as e: + last_error = e + except Exception as e: + last_error = e + + # Wait before the next attempt to respect rate limit between retries as well + if attempt < self._max_retries: + time.sleep(self._rate_limit_seconds) + + # If we exit the loop, all attempts failed + raise last_error if last_error else Exception("Unknown request failure") def getBooks(self, query_args: Union[Iterable[str], str]) -> List[BookData]: - records: List[Record] = self.get(query_args) + try: + records: List[Record] = self.get(query_args) + except requests.exceptions.ReadTimeout: + # Return a list with a single empty BookData object on read timeout + return [BookData()] + except requests.exceptions.Timeout: + # Overall timeout exceeded + return [BookData()] + except Exception: + # Propagate other errors (could also choose to return empty list) + raise # Avoid printing on hot paths; rely on logger if needed books: List[BookData] = [] # extract title from query_args if present @@ -627,7 +725,7 @@ class Api: title = arg.split("=")[1] break for rec in records: - book = book_from_marc(rec.recordData) + book = book_from_marc(rec.recordData, self.library_identifier) books.append(book) if title: books = [