minor and major reworks: rename swb to SRU, add a test for pdf parsing

major: rework mail to send mail as plaintext instead of html, preventing the bleed-in of html text
This commit is contained in:
2025-10-07 14:15:10 +02:00
parent 0df7fd9fe6
commit 06965db26a
25 changed files with 1174 additions and 303 deletions

View File

@@ -2,6 +2,7 @@ import sys
import xml.etree.ElementTree as ET
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Dict, Iterable, List, Optional, Tuple
import loguru
@@ -97,7 +98,7 @@ def _text(elem: Optional[ET.Element]) -> str:
def _req_text(parent: ET.Element, path: str) -> str:
el = parent.find(path, NS)
if el is None or el.text is None:
raise ValueError(f"Required element not found or empty: {path}")
return None
return el.text
@@ -188,7 +189,7 @@ def parse_search_retrieve_response(xml_str: str) -> SearchRetrieveResponse:
# Root is zs:searchRetrieveResponse
version = _req_text(root, "zs:version")
numberOfRecords = int(_req_text(root, "zs:numberOfRecords"))
numberOfRecords = int(_req_text(root, "zs:numberOfRecords") or "0")
records_parent = root.find("zs:records", NS)
records: List[Record] = []
@@ -408,8 +409,12 @@ def book_from_marc(rec: MarcRecord) -> BookData:
rec, "264", "c"
)
isbn = subfield_values(rec, "020", "a")
mediatype = first_subfield_value(rec, "338", "a")
lang = subfield_values(rec, "041", "a")
authors = subfield_values(rec, "700", "a")
author = None
if authors:
author = "; ".join(authors)
return BookData(
ppn=ppn,
@@ -422,32 +427,162 @@ def book_from_marc(rec: MarcRecord) -> BookData:
isbn=isbn,
language=lang,
link="",
author=author,
media_type=mediatype,
)
class SWB:
def __init__(self):
self.url = "https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query={}&maximumRecords=10&recordSchema=marcxml"
self.bib_id = 20735
class SWBData(Enum):
    """Settings for the SRU endpoint hosted at sru.k10plus.de (site label "SWB")."""

    # Request template; ``{}`` is replaced with the search query via str.format.
    URL = (
        "https://sru.k10plus.de/opac-de-627!rec=1"
        "?version=1.1&operation=searchRetrieve&query={}"
        "&maximumRecords=100&recordSchema=marcxml"
    )
    # Prefix string handed to ``Api`` as ``prefix`` for this site.
    ARGSCHEMA = "pica."
    # Site label handed to ``Api`` as ``site``.
    NAME = "SWB"
class DNBData(Enum):
    """Settings for the SRU endpoint hosted at services.dnb.de (site label "DNB")."""

    # Request template; ``{}`` is replaced with the search query via str.format.
    URL = (
        "https://services.dnb.de/sru/dnb"
        "?version=1.1&operation=searchRetrieve&query={}"
        "&maximumRecords=100&recordSchema=MARC21-xml"
    )
    # No argument prefix is configured for this site.
    ARGSCHEMA = ""
    # Site label.
    NAME = "DNB"
class SRUSite(Enum):
    """Known SRU sites; each member's value is the settings enum of that site."""

    SWB = SWBData
    DNB = DNBData
RVK_ALLOWED = r"[A-Z0-9.\-\/]" # conservative char set typically seen in RVK notations
def find_newer_edition(
    swb_result: "BookData", dnb_result: "List[BookData]"
) -> "Optional[List[BookData]]":
    """Return the single newest edition of ``swb_result`` found in ``dnb_result``.

    A candidate counts as newer when its year is greater than the reference
    year OR its edition number is greater than the reference edition number.

    Additional guards and preferences:
      - If both the candidate and the reference carry a signature and the
        normalised signatures differ, the candidate is skipped (different work).
      - Candidates sharing a PPN are deduplicated, preferring one whose
        signature matches the reference, then one that has any signature.
        Candidates without a PPN are never merged with each other.
      - Of the remaining candidates only the latest is kept, ordered by
        (year, edition number, signature match, signature present).

    Years and edition numbers are compared numerically; values that cannot be
    read as integers are treated as unknown.

    Args:
        swb_result: The reference book.
        dnb_result: Candidate books to compare against the reference.

    Returns:
        A one-element list with the best candidate, or ``None`` when no
        candidate is newer.
    """

    def as_int(value):
        # Years may arrive as digit strings or ints; unknown/unparsable -> None.
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def norm_sig(s):
        if not s:
            return ""
        # normalise: lowercase, collapse whitespace, keep alnum + a few separators
        s = re.sub(r"\s+", " ", s.lower()).strip()
        return re.sub(r"[^a-z0-9\-_/\. ]+", "", s)

    def has_sig(b) -> bool:
        return bool(getattr(b, "signature", None))

    ref_has_sig = has_sig(swb_result)
    ref_sig = norm_sig(getattr(swb_result, "signature", None))
    ref_year = as_int(getattr(swb_result, "year", None))
    ref_edition = as_int(getattr(swb_result, "edition_number", None))

    def sig_matches_swb(b) -> bool:
        return ref_has_sig and has_sig(b) and norm_sig(b.signature) == ref_sig

    def strictly_newer(b) -> bool:
        year = as_int(getattr(b, "year", None))
        edition = as_int(getattr(b, "edition_number", None))
        by_year = year is not None and ref_year is not None and year > ref_year
        by_edition = (
            edition is not None
            and ref_edition is not None
            and edition > ref_edition
        )
        return by_year or by_edition

    def ppn_pref_score(b) -> tuple:
        # (signature matches reference, has signature)
        return (int(sig_matches_swb(b)), int(has_sig(b)))

    def sort_key(b) -> tuple:
        # Unknown year/edition sort below every real value; all parts are ints,
        # so tuples stay comparable even when some candidates lack a year.
        year = as_int(getattr(b, "year", None))
        edition = as_int(getattr(b, "edition_number", None))
        return (
            -1 if year is None else year,
            -1 if edition is None else edition,
            int(sig_matches_swb(b)),
            int(has_sig(b)),
        )

    # 1) Keep candidates that are the same work AND newer.
    candidates = []
    for b in dnb_result:
        if has_sig(b) and ref_has_sig and norm_sig(b.signature) != ref_sig:
            continue  # both signed but different -> not the same work
        if strictly_newer(b):
            candidates.append(b)
    if not candidates:
        return None

    # 2) Dedupe by PPN. A missing PPN gets a unique key so that unrelated
    #    PPN-less records are not collapsed into one entry.
    by_ppn = {}
    for index, b in enumerate(candidates):
        key = getattr(b, "ppn", None) or (None, index)
        prev = by_ppn.get(key)
        if prev is None or ppn_pref_score(b) > ppn_pref_score(prev):
            by_ppn[key] = b

    # 3) Only the latest candidate is returned.
    return [max(by_ppn.values(), key=sort_key)]
class Api:
def __init__(self, site: str, url: str, prefix: str):
self.site = site
self.url = url
self.prefix = prefix
pass
def get(self, query_args: Iterable[str]) -> List[Record]:
# if any query_arg ends with =, remove it
query_args = [arg for arg in query_args if not arg.endswith("=")]
if self.site == "DNB":
args = [arg for arg in query_args if not arg.startswith("pica.")]
if args == []:
raise ValueError("DNB queries must include at least one search term")
query_args = args
# query_args = [f"{self.prefix}{arg}" for arg in query_args]
query = "+and+".join(query_args)
query = query.replace(" ", "%20").replace("&", "%26")
# query_args = [arg for arg in query_args if not arg.endswith("=")]
# query = "+and+".join(query_args)
# query = query.replace(" ", "%20").replace("&", "%26")
# insert the query into the url url is
url = self.url.format(query)
log.debug(url)
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
"User-Agent": f"{self.site} SRU Client, <alexander.kirchner@ph-freiburg.de>",
"Accept": "application/xml",
"Accept-Charset": "latin1,utf-8;q=0.7,*;q=0.3",
}
response = requests.get(url, headers=headers)
if response.status_code != 200:
raise Exception(f"Error fetching data from SWB: {response.status_code}")
# #print(response.text)
data = response.content
# extract top-level response
@@ -456,6 +591,7 @@ class SWB:
def getBooks(self, query_args: Iterable[str]) -> List[BookData]:
records: List[Record] = self.get(query_args)
print(f"{self.site} found {len(records)} records")
books: List[BookData] = []
# extract title from query_args if present
title = None
@@ -476,3 +612,11 @@ class SWB:
def getLinkForBook(self, book: BookData) -> str:
results = self.getBooks()
class SWB(Api):
    """``Api`` client preconfigured with the values of ``SWBData``."""

    def __init__(self):
        # Api.__init__ stores site/url/prefix on the instance, so they are
        # passed straight through instead of being assigned twice.
        super().__init__(
            SWBData.NAME.value,
            SWBData.URL.value,
            SWBData.ARGSCHEMA.value,
        )

View File

@@ -1,6 +1,35 @@
from .dataclass import ApparatData, BookData, Prof, Apparat, ELSA
__all__ = [
"custom_sort",
"sort_semesters_list",
"APP_NRS",
"PROF_TITLES",
"SEMAP_MEDIA_ACCOUNTS",
"csv_to_list",
"ELSA",
"Apparat",
"ApparatData",
"BookData",
"Prof",
"Semester",
"SemapDocument",
"elsa_word_to_csv",
"pdf_to_semap",
"word_docx_to_csv",
"word_to_semap",
"ZoteroController",
"eml_to_semap",
]
from .c_sort import custom_sort, sort_semesters_list
from .constants import APP_NRS, PROF_TITLES, SEMAP_MEDIA_ACCOUNTS
from .csvparser import csv_to_list
from .wordparser import elsa_word_to_csv, word_docx_to_csv, word_to_semap, SemapDocument
from .dataclass import ELSA, Apparat, ApparatData, BookData, Prof
from .semester import Semester
from .wordparser import (
SemapDocument,
elsa_word_to_csv,
pdf_to_semap,
word_docx_to_csv,
word_to_semap,
)
from .xmlparser import eml_to_semap
from .zotero import ZoteroController

View File

@@ -83,4 +83,4 @@ if __name__ == "__main__":
"SoSe 25",
]
print(sort_semesters_list(unsorted))
# print(sort_semesters_list(unsorted))

View File

@@ -1,4 +1,5 @@
import csv
from charset_normalizer import detect
@@ -19,4 +20,4 @@ def csv_to_list(path: str) -> list[str]:
if __name__ == "__main__":
text = csv_to_list("C:/Users/aky547/Desktop/semap/71.csv")
# remove linebreaks
# print(text)
# #print(text)

View File

@@ -3,6 +3,11 @@ from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Optional, Union
import regex
from src.logic.openai import name_tester, run_shortener, semester_converter
from src.logic.semester import Semester
@dataclass
class Prof:
@@ -67,21 +72,63 @@ class BookData:
language: Union[str, list[str], None] = field(default_factory=list)
publisher: str | None = None
place: str | None = None
year: str | None = None
year: int | None = None
pages: str | None = None
library_location: int | None = None
library_location: str | None = None
in_apparat: bool | None = False
adis_idn: str | None = None
old_book: Any | None = None
media_type: str | None = None #
in_library: bool | None = None # whether the book is in the library or not
def __post_init__(self):
    """Normalise a few fields right after the dataclass ``__init__`` ran."""
    # Any truthy location is stored as text; falsy values become None.
    if self.library_location:
        self.library_location = str(self.library_location)
    else:
        self.library_location = None

    # A non-empty language list is flattened to one comma-separated string
    # of the stripped, non-blank entries.
    if isinstance(self.language, list) and self.language:
        cleaned = [lang.strip() for lang in self.language if lang.strip()]
        self.language = ",".join(cleaned)

    # Keep only the digits of the year.
    # NOTE(review): the result is a string although the field is annotated
    # ``int | None`` - confirm which type callers expect.
    if self.year:
        self.year = regex.sub(r"[^\d]", "", str(self.year))
    else:
        self.year = None

    self.in_library = bool(self.signature)
def from_dict(self, data: dict) -> "BookData":
    """Set every key of ``data`` as an attribute on this object and return ``self``."""
    for attr_name in data:
        setattr(self, attr_name, data[attr_name])
    return self
def merge(self, other: "BookData") -> "BookData":
    """Fill gaps in this object with values from ``other`` and return ``self``.

    For every attribute of ``other``:
      - list values are merged into this object's value (which is turned into
        a list if necessary), appending only entries not already present;
      - any other non-None value is copied when this object's value is
        ``None`` or ``""``.

    Attributes that exist only on ``other`` are treated as missing here
    instead of raising ``AttributeError``.
    """
    for key, value in vars(other).items():
        if isinstance(value, list):
            current = getattr(self, key, None)
            if current is None or current == "":
                # an empty placeholder must not end up as a list entry
                current = []
            elif not isinstance(current, list):
                current = [current]
            # extend with the new values, skipping duplicates
            for item in value:
                if item not in current:
                    current.append(item)
            setattr(self, key, current)
            continue
        if value is None:
            continue
        existing = getattr(self, key, None)
        if existing is None or existing == "":
            setattr(self, key, value)

    # In language, keep only short entries (at most 4 characters).
    # NOTE(review): an earlier comment said "longer than 3 characters" are
    # dropped, but the code has always kept length <= 4 - confirm the limit.
    language = getattr(self, "language", None)
    if isinstance(language, list):
        self.language = [lang for lang in language if len(lang) <= 4]
    return self
@property
def to_dict(self) -> str:
"""Convert the dataclass to a dictionary."""
return json.dumps(self.__dict__, ensure_ascii=False)
data_dict = {
key: value for key, value in self.__dict__.items() if value is not None
}
# remove old_book from data_dict
if "old_book" in data_dict:
del data_dict["old_book"]
return json.dumps(data_dict, ensure_ascii=False)
def from_dataclass(self, dataclass: Optional[Any]) -> None:
if dataclass is None:
@@ -89,8 +136,15 @@ class BookData:
for key, value in dataclass.__dict__.items():
setattr(self, key, value)
def get_book_type(self) -> str:
    """Return ``"eBook"`` if ``pages`` contains "Online", else ``"Druckausgabe"``.

    ``pages`` defaults to ``None``; a missing or non-text value is treated as
    a print edition instead of raising ``TypeError``.
    """
    pages = self.pages
    if isinstance(pages, str) and "Online" in pages:
        return "eBook"
    return "Druckausgabe"
def from_string(self, data: str) -> "BookData":
    """Create a new ``BookData`` from a JSON object string.

    The decoded keys are passed as keyword arguments to ``BookData``; this
    instance itself is not modified.
    """
    return BookData(**json.loads(data))
def from_LehmannsSearchResult(self, result: Any) -> "BookData":
@@ -111,6 +165,15 @@ class BookData:
# self.pages = str(result.pages) if result.pages else None
return self
@property
def edition_number(self) -> Optional[int]:
    """First run of digits in ``edition`` as an int; ``0`` if there is none."""
    if self.edition is not None:
        found = regex.search(r"(\d+)", self.edition)
        if found:
            return int(found.group(1))
    # no edition text, or no digits in it
    return 0
@dataclass
class MailData:
@@ -222,3 +285,124 @@ class ELSA:
class ApparatData:
prof: Prof = field(default_factory=Prof)
apparat: Apparat = field(default_factory=Apparat)
@dataclass
class XMLMailSubmission:
    """Data carried by one XML mail submission; every field defaults to None."""

    # sender details
    name: str | None = None
    lastname: str | None = None
    title: str | None = None
    telno: int | None = None
    email: str | None = None
    # apparatus details
    app_name: str | None = None
    subject: str | None = None
    semester: Semester | None = None
    books: list[BookData] | None = None
@dataclass
class Book:
author: str = None
year: str = None
edition: str = None
title: str = None
location: str = None
publisher: str = None
signature: str = None
internal_notes: str = None
@property
def has_signature(self) -> bool:
return self.signature is not None and self.signature != ""
@property
def is_empty(self) -> bool:
return all(
[
self.author == "",
self.year == "",
self.edition == "",
self.title == "",
self.location == "",
self.publisher == "",
self.signature == "",
self.internal_notes == "",
]
)
def from_dict(self, data: dict[str, Any]):
for key, value in data.items():
value = value.strip()
if value == "\u2002\u2002\u2002\u2002\u2002":
value = ""
if key == "Autorenname(n):Nachname, Vorname":
self.author = value
elif key == "Jahr/Auflage":
self.year = value.split("/")[0] if "/" in value else value
self.edition = value.split("/")[1] if "/" in value else ""
elif key == "Titel":
self.title = value
elif key == "Ort und Verlag":
self.location = value.split(",")[0] if "," in value else value
self.publisher = value.split(",")[1] if "," in value else ""
elif key == "Standnummer":
self.signature = value.strip()
elif key == "Interne Vermerke":
self.internal_notes = value
@dataclass
class SemapDocument:
subject: str = None
phoneNumber: int = None
mail: str = None
title: str = None
title_suggestions: list[str] = None
semester: Union[str, Semester] = None
books: list[Book] = None
eternal: bool = False
personName: str = None
personTitle: str = None
title_length = 0
title_max_length = 0
def __post_init__(self):
self.title_suggestions = []
@property
def nameSetter(self):
data = name_tester(self.personTitle)
name = f"{data['last_name']}, {data['first_name']}"
if data["title"] is not None:
title = data["title"]
self.personTitle = title
self.personName = name
self.title_length = len(self.title) + 3 + len(self.personName.split(",")[0])
if self.title_length > 40:
name_len = len(self.personName.split(",")[0])
self.title_max_length = 38 - name_len
suggestions = run_shortener(self.title, self.title_max_length)
for suggestion in suggestions:
self.title_suggestions.append(suggestion["shortened_string"])
else:
self.title_suggestions = []
pass
@property
def renameSemester(self) -> None:
if self.semester:
if ", Dauer" in self.semester:
self.semester = self.semester.split(",")[0]
self.eternal = True
self.semester = Semester().from_string(self.semester)
else:
self.semester = Semester().from_string(
semester_converter(self.semester)
)
@property
def signatures(self) -> list[str]:
if self.books is not None:
return [book.signature for book in self.books if book.has_signature]
return []

View File

@@ -1,13 +1,15 @@
from __future__ import annotations
import re
from dataclasses import dataclass, asdict, field
from typing import Optional, List, Iterable
from urllib.parse import urljoin, quote_plus
from dataclasses import asdict, dataclass, field
from typing import Iterable, List, Optional
from urllib.parse import quote_plus, urljoin
import httpx
from bs4 import BeautifulSoup
from src.logic.dataclass import BookData
BASE = "https://www.lehmanns.de"
SEARCH_URL = "https://www.lehmanns.de/search/quick?mediatype_id=&q="
@@ -33,9 +35,11 @@ class LehmannsSearchResult:
image: Optional[str] = None
# From detail page:
pages: Optional[str] = None # "<N> Seiten"
buyable: bool = True # set in enrich_pages (detail page)
unavailable_hint: Optional[str] = None # e.g. "Titel ist leider vergriffen; keine Neuauflage"
pages: Optional[str] = None # "<N> Seiten"
buyable: bool = True # set in enrich_pages (detail page)
unavailable_hint: Optional[str] = (
None # e.g. "Titel ist leider vergriffen; keine Neuauflage"
)
def to_dict(self) -> dict:
return asdict(self)
@@ -73,31 +77,45 @@ class LehmannsClient:
# spaces -> '+'
return SEARCH_URL + quote_plus(title)
def search_by_title(self, title: str, limit: Optional[int] = None, strict: bool = False) -> List[LehmannsSearchResult]:
def search_by_title(
self,
title: str,
limit: Optional[int] = None,
strict: bool = False,
only_latest: bool = True,
) -> List[BookData]:
"""
Parse the listing page only (no availability check here).
Use enrich_pages(...) afterwards to fetch detail pages, add 'pages',
and drop unbuyable items.
"""
url = self.build_search_url(title)
url = self.build_search_url(title=title)
html = self._get(url)
if not html:
return []
results = self._parse_results(html)
self.enrich_pages(results)
results = [BookData().from_LehmannsSearchResult(r) for r in results]
if strict:
# filter results to only those with exact title match (case-insensitive)
title_lower = title.lower()
results = [r for r in results if r.title and r.title.lower() == title_lower]
results = [r for r in results if r.buyable]
# results = [r for r in results if r.buyable]
return results
if limit is not None:
results = results[:max(0, limit)]
results = results[: max(0, limit)]
if only_latest and len(results) > 1:
# keep only the latest edition (highest edition number)
results.sort(key=lambda r: (r.edition_number or 0), reverse=True)
results = [results[0]]
return results
# ------------------- Detail enrichment & filtering -------------------
def enrich_pages(self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True) -> List[LehmannsSearchResult]:
def enrich_pages(
self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True
) -> List[LehmannsSearchResult]:
"""
Fetch each result.url, extract:
- pages: from <span class="book-meta meta-seiten" itemprop="numberOfPages">...</span>
@@ -135,11 +153,15 @@ class LehmannsClient:
# Availability via li.availability-3
avail_li = soup.select_one("li.availability-3")
if avail_li:
avail_text = " ".join(avail_li.get_text(" ", strip=True).split()).lower()
avail_text = " ".join(
avail_li.get_text(" ", strip=True).split()
).lower()
if "titel ist leider vergriffen" in avail_text:
r.buyable = False
if "keine neuauflage" in avail_text:
r.unavailable_hint = "Titel ist leider vergriffen; keine Neuauflage"
r.unavailable_hint = (
"Titel ist leider vergriffen; keine Neuauflage"
)
else:
r.unavailable_hint = "Titel ist leider vergriffen"
@@ -161,7 +183,9 @@ class LehmannsClient:
try:
r = self.client.get(url)
r.encoding = "utf-8"
if r.status_code == 200 and "text/html" in (r.headers.get("content-type") or ""):
if r.status_code == 200 and "text/html" in (
r.headers.get("content-type") or ""
):
return r.text
except httpx.HTTPError:
pass
@@ -176,12 +200,18 @@ class LehmannsClient:
if not a:
continue
url = urljoin(BASE, a["href"].strip())
base_title = (block.select_one(".title [itemprop='name']") or a).get_text(strip=True)
base_title = (block.select_one(".title [itemprop='name']") or a).get_text(
strip=True
)
# Alternative headline => extend title
alt_tag = block.select_one(".description[itemprop='alternativeHeadline']")
alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
title = f"{base_title} : {alternative_headline}" if alternative_headline else base_title
title = (
f"{base_title} : {alternative_headline}"
if alternative_headline
else base_title
)
description = alternative_headline
# Authors from .author
@@ -227,7 +257,9 @@ class LehmannsClient:
# Publisher
publisher = None
pub = block.select_one(".publisherprop [itemprop='name']") or block.select_one(".publisher [itemprop='name']")
pub = block.select_one(
".publisherprop [itemprop='name']"
) or block.select_one(".publisher [itemprop='name']")
if pub:
publisher = pub.get_text(strip=True)

View File

@@ -21,4 +21,4 @@ if __name__ == "__main__":
text = pdf_to_csv("54_pdf.pdf")
# remove linebreaks
text = text.replace("\n", "")
print(text)
# print(text)

View File

@@ -1,16 +1,15 @@
import sys
import zipfile
from dataclasses import dataclass
from typing import Any, Union
from typing import Any
import fitz # PyMuPDF
import loguru
import pandas as pd
from bs4 import BeautifulSoup
from docx import Document
from src import LOG_DIR
from src.backend.semester import Semester
from src.logic.openai import name_tester, run_shortener, semester_converter
from src.logic.dataclass import Book, SemapDocument
log = loguru.logger
log.remove()
@@ -18,116 +17,6 @@ log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
@dataclass
class Book:
author: str = None
year: str = None
edition: str = None
title: str = None
location: str = None
publisher: str = None
signature: str = None
internal_notes: str = None
@property
def has_signature(self) -> bool:
return self.signature is not None and self.signature != ""
@property
def is_empty(self) -> bool:
return all(
[
self.author == "",
self.year == "",
self.edition == "",
self.title == "",
self.location == "",
self.publisher == "",
self.signature == "",
self.internal_notes == "",
]
)
def from_dict(self, data: dict[str, Any]):
for key, value in data.items():
value = value.strip()
if value == "\u2002\u2002\u2002\u2002\u2002":
value = ""
if key == "Autorenname(n):Nachname, Vorname":
self.author = value
elif key == "Jahr/Auflage":
self.year = value.split("/")[0] if "/" in value else value
self.edition = value.split("/")[1] if "/" in value else ""
elif key == "Titel":
self.title = value
elif key == "Ort und Verlag":
self.location = value.split(",")[0] if "," in value else value
self.publisher = value.split(",")[1] if "," in value else ""
elif key == "Standnummer":
self.signature = value.strip()
elif key == "Interne Vermerke":
self.internal_notes = value
@dataclass
class SemapDocument:
subject: str = None
phoneNumber: int = None
mail: str = None
title: str = None
title_suggestions: list[str] = None
semester: Union[str, Semester] = None
books: list[Book] = None
eternal: bool = False
personName: str = None
personTitle: str = None
title_length = 0
title_max_length = 0
def __post_init__(self):
self.title_suggestions = []
@property
def nameSetter(self):
data = name_tester(self.personTitle)
name = f"{data['last_name']}, {data['first_name']}"
if data["title"] is not None:
title = data["title"]
self.personTitle = title
self.personName = name
self.title_length = len(self.title) + 3 + len(self.personName.split(",")[0])
if self.title_length > 40:
log.warning("Title is too long")
name_len = len(self.personName.split(",")[0])
self.title_max_length = 38 - name_len
suggestions = run_shortener(self.title, self.title_max_length)
for suggestion in suggestions:
self.title_suggestions.append(suggestion["shortened_string"])
else:
self.title_suggestions = []
pass
@property
def renameSemester(self) -> None:
if ", Dauer" in self.semester:
self.semester = self.semester.split(",")[0]
self.eternal = True
self.semester = Semester().from_string(self.semester)
else:
log.warning("Semester {} is not valid", self.semester)
self.semester = Semester().from_string(semester_converter(self.semester))
@property
def signatures(self) -> list[str]:
if self.books is not None:
return [book.signature for book in self.books if book.has_signature]
return []
def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
doc = Document(path)
tables = doc.tables
@@ -272,7 +161,7 @@ def word_to_semap(word_path: str, ai: bool = True) -> SemapDocument:
apparatdata = df[0]
apparatdata = apparatdata.to_dict()
keys = list(apparatdata.keys())
print(apparatdata, keys)
# print(apparatdata, keys)
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys) - 1, 2)}
semap.phoneNumber = appdata["Telefon:"]
@@ -309,6 +198,182 @@ def word_to_semap(word_path: str, ai: bool = True) -> SemapDocument:
return semap
def pdf_to_semap(pdf_path: str, ai: bool = True) -> SemapDocument:
    """Parse a Semesterapparat PDF form into a ``SemapDocument``.

    Uses only PyMuPDF. The top-of-form fields are read from the first page by
    locating their labels; the book table is read from every page using the
    column headers found on the first page. Wrapped field values and wrapped
    table cells are joined back together.

    Args:
        pdf_path: Path of the PDF file.
        ai: When true, ``renameSemester`` and ``nameSetter`` are evaluated on
            the result afterwards.

    Returns:
        The populated document; ``books`` contains only rows that are not
        empty and have a signature.
    """
    semap = SemapDocument()
    author_key = "Autorenname(n):Nachname, Vorname"

    # ---------- helpers ----------
    def _join_tokens(tokens: list[str]) -> str:
        """Join tokens with spaces, gluing a token onto a predecessor ending in '-', '/' or ':'."""
        parts = []
        for tok in tokens:
            if parts and parts[-1].endswith(("-", "/", ":")):
                parts[-1] = parts[-1] + tok  # no space after '-', '/' or ':'
            else:
                parts.append(tok)
        return " ".join(parts).strip()

    def _extract_row_values_multiline(
        page, labels: list[str], y_window: float = 24
    ) -> dict[str, str]:
        """For a row of inline labels, collect the text to the right of each label."""
        rects = []
        for lab in labels:
            hits = page.search_for(lab)
            if hits:
                rects.append((lab, hits[0]))
        if not rects:
            return {}
        rects.sort(key=lambda t: t[1].x0)
        words = page.get_text("words")
        out = {}
        for i, (lab, r) in enumerate(rects):
            # horizontal band: from this label's right edge to the next label
            x0 = r.x1 + 1
            x1 = rects[i + 1][1].x0 - 1 if i + 1 < len(rects) else page.rect.width - 5
            # vertical band: slightly above the label down to y_window below it
            y0 = r.y0 - 3
            y1 = r.y0 + y_window
            toks = [w for w in words if x0 <= w[0] <= x1 and y0 <= w[1] <= y1]
            toks.sort(key=lambda w: (w[1], w[0]))  # line, then x
            out[lab] = _join_tokens([w[4] for w in toks])
        return out

    def _compute_columns_from_headers(page0):
        """Locate the table headers and return ``(columns, header_y)``.

        Each column is ``(canonical_name, x0, x1, x_center)``; ``header_y`` is
        the smallest header y0, or 0 when no header was found.
        """
        headers = [
            ("Autorenname(n):", author_key),
            ("Jahr/Auflage", "Jahr/Auflage"),
            ("Titel", "Titel"),
            ("Ort und Verlag", "Ort und Verlag"),
            ("Standnummer", "Standnummer"),
            ("Interne Vermerke", "Interne Vermerke"),
        ]
        found = []
        for label, canon in headers:
            rects = [
                r for r in page0.search_for(label) if r.y0 > 200
            ]  # skip top-of-form duplicates
            if rects:
                found.append((canon, rects[0]))
        found.sort(key=lambda t: t[1].x0)
        cols = [(canon, r.x0, r.x1, (r.x0 + r.x1) / 2.0) for canon, r in found]
        header_y = min(r.y0 for _, r in found) if found else 0
        return cols, header_y

    def _extract_table_rows_from_page(
        page, cols, header_y, y_top_margin=5, y_bottom_margin=40, y_tol=26.0
    ):
        """Group the page's words into rows and assign each word to the nearest column.

        Returns one ``{column_name: text}`` dict per non-empty row.
        """
        if not cols:
            # no table header was found, so there is no column to map words to
            return []
        words = [
            w
            for w in page.get_text("words")
            if w[1] > header_y + y_top_margin
            and w[3] < page.rect.height - y_bottom_margin
        ]
        # group into row bands by y (tolerance wide enough for wrapped lines)
        rows = []
        for w in sorted(words, key=lambda w: w[1]):
            y = w[1]
            for row in rows:
                if abs(row["y_mean"] - y) <= y_tol:
                    row["ys"].append(y)
                    row["y_mean"] = sum(row["ys"]) / len(row["ys"])
                    row["words"].append(w)
                    break
            else:
                # no existing band is close enough -> start a new row
                rows.append({"y_mean": y, "ys": [y], "words": [w]})

        # map to columns + join
        joined_rows = []
        for row in rows:
            words_by_col = {canon: [] for canon, *_ in cols}
            for w in sorted(row["words"], key=lambda w: (w[1], w[0])):
                xmid = (w[0] + w[2]) / 2.0
                canon = min(cols, key=lambda c: abs(xmid - c[3]))[0]
                words_by_col[canon].append(w[4])
            rowdict = {canon: _join_tokens(toks) for canon, toks in words_by_col.items()}
            if any(rowdict.values()):
                joined_rows.append(rowdict)
        return joined_rows

    # The document is closed again as soon as all text has been extracted.
    with fitz.open(pdf_path) as doc:
        # ---------- top-of-form fields ----------
        p0 = doc[0]
        row1 = _extract_row_values_multiline(
            p0,
            ["Ihr Name und Titel:", "Ihr Fach:", "Telefon:", "Mailadresse:"],
            y_window=22,
        )
        row2 = _extract_row_values_multiline(
            p0, ["Veranstaltung:", "Semester:"], y_window=20
        )

        # ---------- table extraction (all pages) ----------
        # NOTE(review): header_y comes from the first page and is applied to
        # every page, so text above that y on later pages is ignored - confirm
        # this matches multi-page forms.
        cols, header_y = _compute_columns_from_headers(p0)
        all_rows: list[dict[str, Any]] = []
        for page in doc:
            all_rows.extend(_extract_table_rows_from_page(page, cols, header_y))

    name_title = row1.get("Ihr Name und Titel:", "") or ""
    semap.subject = row1.get("Ihr Fach:", None)
    semap.phoneNumber = row1.get("Telefon:", None)  # keep as-is (string like "682-308")
    semap.mail = row1.get("Mailadresse:", None)
    # everything before the last comma is the name, the remainder the title
    semap.personName = ",".join(name_title.split(",")[:-1]) if name_title else None
    semap.personTitle = (
        ",".join(name_title.split(",")[-1:]).strip() if name_title else None
    )
    semap.title = row2.get("Veranstaltung:", None)
    semap.semester = row2.get("Semester:", None)

    # drop the sub-header line "Nachname, Vorname" etc.
    filtered = []
    for r in all_rows:
        if r.get(author_key, "").strip() in ("", "Nachname, Vorname"):
            # skip only when every other column is empty as well
            if all(not r[c] for c in r if c != author_key):
                continue
        filtered.append(r)

    # build Book objects; rows that are empty or lack a signature are dropped
    booklist: list[Book] = []
    for row in filtered:
        b = Book()
        b.from_dict(row)
        if b.is_empty:
            continue
        if not b.has_signature:
            continue
        booklist.append(b)
    semap.books = booklist

    # post-processing is implemented as properties with side effects
    if ai:
        _ = semap.renameSemester
        _ = semap.nameSetter
    return semap
if __name__ == "__main__":
else_df = word_to_semap("C:/Users/aky547/Desktop/semap/db/temp/tmpzsz_hgdr.docx")
print(else_df)
else_df = pdf_to_semap("C:/Users/aky547/Dokumente/testsemap.pdf")
# print(else_df)

67
src/logic/xmlparser.py Normal file
View File

@@ -0,0 +1,67 @@
import xml.etree.ElementTree as ET
from src.logic.dataclass import Apparat, BookData, SemapDocument, XMLMailSubmission
from src.logic.semester import Semester
def parse_xml_submission(xml_string: str) -> XMLMailSubmission:
    """Parse the XML of a mail submission into an ``XMLMailSubmission``.

    Expected layout: a ``static`` element whose children hold the sender and
    apparatus data, and a ``books`` element with one child per book. Missing
    elements or values yield ``None`` fields instead of an exception; a
    semester is only set when it has the form "<term> <year>".

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_string`` is not well-formed.
    """
    # NOTE: xml.etree.ElementTree is not hardened against maliciously
    # constructed XML (see the security notes in the Python XML docs); the
    # mail body is external input.
    root = ET.fromstring(xml_string)

    def _children_text(parent) -> dict:
        # tag -> text for the direct children; an absent parent gives {}
        if parent is None:
            return {}
        return {child.tag: child.text for child in parent}

    def _as_int(text):
        try:
            return int(text)
        except (TypeError, ValueError):
            return None

    static_info = _children_text(root.find("static"))

    books_info = []
    books_node = root.find("books")
    for book_node in books_node if books_node is not None else ():
        details = _children_text(book_node)
        # the year element may carry "<year>/<edition>"
        year_parts = (details.get("year") or "").split("/")
        year = year_parts[0].strip() or None
        edition = None
        if len(year_parts) > 1:
            edition = year_parts[1].strip() or None
        books_info.append(
            BookData(
                author=details.get("authorname"),
                year=year,
                edition=edition,
                title=details.get("title"),
                signature=details.get("signature"),
            )
        )

    submission = XMLMailSubmission()
    submission.name = static_info.get("name")
    submission.lastname = static_info.get("lastname")
    submission.title = static_info.get("title")
    submission.telno = _as_int(static_info.get("telno"))
    submission.email = static_info.get("mail")
    submission.app_name = static_info.get("apparatsname")
    submission.subject = static_info.get("subject")

    # "<term> <year>", e.g. two whitespace-separated tokens
    semester_parts = (static_info.get("semester") or "").split()
    if len(semester_parts) >= 2:
        sem_year = _as_int(semester_parts[1])
        if sem_year is not None:
            submission.semester = Semester(semester=semester_parts[0], year=sem_year)

    submission.books = books_info
    return submission
def eml_parser(path: str) -> XMLMailSubmission:
    """Read an .eml file and parse the XML found in its body.

    Everything up to the first blank line is treated as mail headers and
    skipped; the remainder is handed to ``parse_xml_submission``.

    Raises:
        ValueError: if the file contains no blank line separating headers
            from the body.
    """
    with open(path, "r", encoding="utf-8") as file:
        raw = file.read()
    _headers, separator, xml_content = raw.partition("\n\n")
    if not separator:
        raise ValueError(f"No header/body separator found in mail file: {path}")
    # The mail body is intentionally not printed: it contains personal data.
    return parse_xml_submission(xml_content)
def eml_to_semap(path: str) -> SemapDocument:
    """Parse an .eml submission and convert it into a ``SemapDocument``.

    ``SemapDocument`` has no ``apparat`` field, so the submission values are
    mapped onto the fields it does define.
    """
    submission = eml_parser(path)
    # "<last>, <first>", leaving out whichever part is missing
    name_parts = [part for part in (submission.lastname, submission.name) if part]
    return SemapDocument(
        subject=submission.subject,
        phoneNumber=submission.telno,
        mail=submission.email,
        # NOTE(review): the apparatus name is used as the document title -
        # confirm this is the intended mapping.
        title=submission.app_name,
        semester=submission.semester,
        # NOTE(review): these are BookData objects, whereas the other parsers
        # fill ``books`` with Book objects - verify downstream consumers.
        books=submission.books,
        personName=", ".join(name_parts) or None,
        personTitle=submission.title,
    )

View File

@@ -1,7 +1,9 @@
from pyzotero import zotero
from dataclasses import dataclass
from src.logic.webrequest import WebRequest, BibTextTransformer
from pyzotero import zotero
from src import settings
from src.logic.webrequest import BibTextTransformer, WebRequest
@dataclass
@@ -187,7 +189,7 @@ class ZoteroController:
book = bib.return_data()
return book
# # print(zot.item_template("bookSection"))
# # #print(zot.item_template("bookSection"))
def createBook(self, isbn):
book = self.__get_data(isbn)
@@ -210,7 +212,7 @@ class ZoteroController:
def createItem(self, item):
resp = self.zot.create_items([item])
if "successful" in resp.keys():
# print(resp["successful"]["0"]["key"])
# #print(resp["successful"]["0"]["key"])
return resp["successful"]["0"]["key"]
else:
return None
@@ -220,7 +222,7 @@ class ZoteroController:
for item in items:
if item["key"] == key:
self.zot.delete_item(item)
# print(item)
# #print(item)
break
def createHGSection(self, book: Book, data: dict):
@@ -241,7 +243,7 @@ class ZoteroController:
]
chapter.creators += authors
# print(chapter.to_dict())
# #print(chapter.to_dict())
return self.createItem(chapter.to_dict())
pass
@@ -257,7 +259,7 @@ class ZoteroController:
# chapter.creators
def createJournalArticle(self, journal, article):
# print(type(article))
# #print(type(article))
journalarticle = JournalArticle()
journalarticle.assign(journal)
journalarticle.itemType = "journalArticle"
@@ -273,7 +275,7 @@ class ZoteroController:
journalarticle.issue = article["issue"]
journalarticle.url = article["isbn"]
# print(journalarticle.to_dict())
# #print(journalarticle.to_dict())
return self.createItem(journalarticle.to_dict())
@@ -319,16 +321,16 @@ if __name__ == "__main__":
# if isinstance(publishers, str):
# publishers = [publishers]
# for publisher in publishers:
# # print(publisher)
# # #print(publisher)
# creator = Creator().from_string(publisher)
# creator.creatorType = "editor"
# authors.append(creator.__dict__)
# chapter.creators = authors
# chapter.publisher = book.publisher
# # print(chapter.to_dict())
# # #print(chapter.to_dict())
# createBookSection(chapter.to_dict())
# get_citation("9ZXH8DDE")
# # # print()
# # print(get_books())
# # print(zot.item_creator_types("bookSection"))
# # # #print()
# # #print(get_books())
# # #print(zot.item_creator_types("bookSection"))