more AI optimizations, reworked logger

2025-10-09 12:35:15 +02:00
parent 7e07bdea0c
commit 3cc6e793d2
22 changed files with 186 additions and 320 deletions

View File

@@ -1,28 +1,17 @@
import re
import sys
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Dict, Iterable, List, Optional, Tuple
from typing import Dict, Iterable, List, Optional, Tuple, Union
import loguru
import requests
from requests.adapters import HTTPAdapter
from src import LOG_DIR
# centralized logging used via src.shared.logging
from src.logic.dataclass import BookData
from src.shared.logging import log
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
log.add(
f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
rotation="1 day",
retention="1 month",
)
log # ensure imported logger is referenced
# -----------------------
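The shared logger that replaces this per-module setup is not shown in the commit; a minimal sketch of what src/shared/logging.py could look like, assuming it simply centralizes the sinks that were configured here before:

import sys
from datetime import datetime

import loguru

from src import LOG_DIR

# Single shared loguru logger: stdout at INFO plus two rotating file sinks.
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
log.add(
    f"{LOG_DIR}/{datetime.now().strftime('%Y-%m-%d')}.log",
    rotation="1 day",
    retention="1 month",
)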
@@ -186,7 +175,9 @@ def parse_echoed_request(root: ET.Element) -> Optional[EchoedSearchRequest]:
)
def parse_search_retrieve_response(xml_str: str) -> SearchRetrieveResponse:
def parse_search_retrieve_response(
xml_str: Union[str, bytes],
) -> SearchRetrieveResponse:
root = ET.fromstring(xml_str)
# Root is zs:searchRetrieveResponse
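The widened xml_str annotation matches the call site further down, which now passes raw bytes; ElementTree accepts either form. A tiny illustrative check, not part of the commit:

import xml.etree.ElementTree as ET

sample = "<searchRetrieveResponse><numberOfRecords>0</numberOfRecords></searchRetrieveResponse>"
# fromstring handles both text and bytes, so Union[str, bytes] is accurate.
assert ET.fromstring(sample).tag == "searchRetrieveResponse"
assert ET.fromstring(sample.encode("utf-8")).tag == "searchRetrieveResponse"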
@@ -598,12 +589,12 @@ class Api:
"Accept-Charset": "latin1,utf-8;q=0.7,*;q=0.3",
}
# Use persistent session and set timeouts to avoid hanging
response = self._session.get(url, headers=headers, timeout=(3.05, 20))
if response.status_code != 200:
raise Exception(f"Error fetching data from SWB: {response.status_code}")
# extract top-level response (decode to text for the XML parser)
response = parse_search_retrieve_response(response.text)
return response.records
resp = self._session.get(url, headers=headers, timeout=(3.05, 60))
if resp.status_code != 200:
raise Exception(f"Error fetching data from SWB: {resp.status_code}")
# Parse using raw bytes (original behavior) to preserve encoding edge cases
sr = parse_search_retrieve_response(resp.content)
return sr.records
def getBooks(self, query_args: Iterable[str]) -> List[BookData]:
records: List[Record] = self.get(query_args)
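The file imports HTTPAdapter, so the persistent _session presumably mounts one; its construction is not shown in this hunk. A sketch under that assumption (make_session and the retry values are illustrative, not taken from the commit):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session() -> requests.Session:
    # Pooled connections with a small retry budget for transient SWB errors.
    retry = Retry(total=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504))
    adapter = HTTPAdapter(max_retries=retry, pool_maxsize=10)
    session = requests.Session()
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session

The timeout tuple (3.05, 60) is a (connect, read) pair: fail fast if SWB is unreachable, but allow up to 60 seconds for a slow response body.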

View File

@@ -18,16 +18,8 @@ from __future__ import annotations
import datetime
import re
import sys
import loguru
from src import LOG_DIR
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
from src.shared.logging import log
class Semester:
@@ -124,21 +116,22 @@ class Semester:
# ------------------------------------------------------------------
# Comparison helpers
# ------------------------------------------------------------------
def isPastSemester(self, other: "Semester") -> bool:
if self.year < other.year:
def isPastSemester(self, current: "Semester") -> bool:
log.debug(f"Comparing {self} < {current}")
if self.year < current.year:
return True
if self.year == other.year:
if self.year == current.year:
return (
self.semester == "WiSe" and other.semester == "SoSe"
self.semester == "WiSe" and current.semester == "SoSe"
) # WiSe before next SoSe
return False
def isFutureSemester(self, other: "Semester") -> bool:
if self.year > other.year:
def isFutureSemester(self, current: "Semester") -> bool:
if self.year > current.year:
return True
if self.year == other.year:
if self.year == current.year:
return (
self.semester == "SoSe" and other.semester == "WiSe"
self.semester == "SoSe" and current.semester == "WiSe"
) # SoSe after WiSe of same year
return False
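A usage illustration of the two renamed helpers (the constructor shown here is an assumption; the diff only exposes the .year and .semester attributes):

# WiSe 2023 lies before SoSe 2024, viewed from the current semester.
wise_2023 = Semester(year=2023, semester="WiSe")
sose_2024 = Semester(year=2024, semester="SoSe")
assert wise_2023.isPastSemester(sose_2024)
assert sose_2024.isFutureSemester(wise_2023)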

View File

@@ -1,23 +1,16 @@
import sys
from typing import Any, Optional, Union
import loguru
import requests
from bs4 import BeautifulSoup
# sleep_and_retry makes rate-limited calls wait for the next window instead of raising
from ratelimit import limits, sleep_and_retry
from src import LOG_DIR
from src.logic.dataclass import BookData
from src.shared.logging import log
from src.transformers import ARRAYData, BibTeXData, COinSData, RDSData, RISData
from src.transformers.transformers import RDS_AVAIL_DATA, RDS_GENERIC_DATA
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
# logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")
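How the two imported decorators are typically combined (the calls/period values are assumptions, not taken from this file):

import requests
from ratelimit import limits, sleep_and_retry

@sleep_and_retry
@limits(calls=10, period=60)  # at most 10 requests per minute; sleep when exceeded
def fetch(url: str) -> requests.Response:
    return requests.get(url, timeout=(3.05, 20))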

View File

@@ -1,20 +1,13 @@
import sys
import zipfile
from typing import Any
import fitz # PyMuPDF
import loguru
import pandas as pd
from bs4 import BeautifulSoup
from docx import Document
from src import LOG_DIR
from src.logic.dataclass import Book, SemapDocument
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
from src.shared.logging import log
def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
@@ -50,7 +43,6 @@ def get_fach(path: str) -> str:
soup = BeautifulSoup(xml_data, "xml")
# text we need is in <w:p w14:paraId="12456A32" ... > -> w:r -> w:t
paragraphs = soup.find_all("w:p")
names = []
for para in paragraphs:
para_id = para.get("w14:paraId")
if para_id == "12456A32":
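The hunk is cut off before the paragraph text is collected; the traversal the comment above describes (w:p -> w:r -> w:t) would look roughly like this (illustrative, not the committed code):

def paragraph_text(para) -> str:
    # Concatenate all w:t text runs inside one w:p paragraph element.
    return "".join(t.get_text() for t in para.find_all("w:t"))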