Minor and major reworks: rename SWB to SRU, add a test for PDF parsing

major: rework mail to send mail as plaintext instead of HTML, preventing HTML text from bleeding into the message
2025-10-07 14:15:10 +02:00
parent 0df7fd9fe6
commit 06965db26a
25 changed files with 1174 additions and 303 deletions
--- a/src/logic/SRU.py
+++ b/src/logic/SRU.py
@@ -2,6 +2,7 @@ import sys
 import xml.etree.ElementTree as ET
 from dataclasses import dataclass, field
 from datetime import datetime
+from enum import Enum
 from typing import Dict, Iterable, List, Optional, Tuple

 import loguru
@@ -97,7 +98,7 @@ def _text(elem: Optional[ET.Element]) -> str:
 def _req_text(parent: ET.Element, path: str) -> str:
    el = parent.find(path, NS)
    if el is None or el.text is None:
-        raise ValueError(f"Required element not found or empty: {path}")
+        return None
    return el.text


@@ -188,7 +189,7 @@ def parse_search_retrieve_response(xml_str: str) -> SearchRetrieveResponse:

    # Root is zs:searchRetrieveResponse
    version = _req_text(root, "zs:version")
-    numberOfRecords = int(_req_text(root, "zs:numberOfRecords"))
+    numberOfRecords = int(_req_text(root, "zs:numberOfRecords") or "0")

    records_parent = root.find("zs:records", NS)
    records: List[Record] = []
@@ -408,8 +409,12 @@ def book_from_marc(rec: MarcRecord) -> BookData:
        rec, "264", "c"
    )
    isbn = subfield_values(rec, "020", "a")
-
+    mediatype = first_subfield_value(rec, "338", "a")
    lang = subfield_values(rec, "041", "a")
+    authors = subfield_values(rec, "700", "a")
+    author = None
+    if authors:
+        author = "; ".join(authors)

    return BookData(
        ppn=ppn,
@@ -422,32 +427,162 @@ def book_from_marc(rec: MarcRecord) -> BookData:
        isbn=isbn,
        language=lang,
        link="",
+        author=author,
+        media_type=mediatype,
    )


-class SWB:
-    def __init__(self):
-        self.url = "https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query={}&maximumRecords=10&recordSchema=marcxml"
-        self.bib_id = 20735
+class SWBData(Enum):
+    URL = "https://sru.k10plus.de/opac-de-627!rec=1?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=marcxml"
+    ARGSCHEMA = "pica."
+    NAME = "SWB"
+
+
+class DNBData(Enum):
+    URL = "https://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query={}&maximumRecords=100&recordSchema=MARC21-xml"
+    ARGSCHEMA = ""
+    NAME = "DNB"
+
+
+class SRUSite(Enum):
+    SWB = SWBData
+    DNB = DNBData
+
+
+RVK_ALLOWED = r"[A-Z0-9.\-\/]"  # conservative char set typically seen in RVK notations
+
+
+def find_newer_edition(
+    swb_result: BookData, dnb_result: List[BookData]
+) -> Optional[List[BookData]]:
+    """
+    New edition if:
+      - year > swb.year OR
+      - edition_number > swb.edition_number
+
+    Additional guards & preferences:
+      - If both have signatures and they differ, skip (not the same work).
+      - For duplicates (same ppn): keep the one that has a signature, and
+        prefer a signature that matches swb_result.signature.
+      - If multiple remain: keep the single 'latest' by (year desc,
+        edition_number desc, best-signature-match desc, has-signature desc).
+    """
+
+    def norm_sig(s: Optional[str]) -> str:
+        if not s:
+            return ""
+        # normalize: lowercase, collapse whitespace, keep alnum + a few separators
+        s = s.lower()
+        s = re.sub(r"\s+", " ", s).strip()
+        # remove obvious noise; adjust if your signature format differs
+        s = re.sub(r"[^a-z0-9\-_/\. ]+", "", s)
+        return s
+
+    def has_sig(b: BookData) -> bool:
+        return bool(getattr(b, "signature", None))
+
+    def sig_matches_swb(b: BookData) -> bool:
+        if not has_sig(b) or not has_sig(swb_result):
+            return False
+        return norm_sig(b.signature) == norm_sig(swb_result.signature)
+
+    def strictly_newer(b: BookData) -> bool:
+        by_year = (
+            b.year is not None
+            and swb_result.year is not None
+            and b.year > swb_result.year
+        )
+        by_edition = (
+            b.edition_number is not None
+            and swb_result.edition_number is not None
+            and b.edition_number > swb_result.edition_number
+        )
+        return by_year or by_edition
+
+    swb_sig_norm = norm_sig(getattr(swb_result, "signature", None))
+
+    # 1) Filter to same-work AND newer
+    candidates: List[BookData] = []
+    for b in dnb_result:
+        # Skip if both signatures exist and don't match (different work)
+        b_sig = getattr(b, "signature", None)
+        if b_sig and swb_result.signature:
+            if norm_sig(b_sig) != swb_sig_norm:
+                continue  # not the same work
+
+        # Keep only if newer by rules
+        if strictly_newer(b):
+            candidates.append(b)
+
+    if not candidates:
+        return None
+
+    # 2) Dedupe by PPN, preferring signature (and matching signature if possible)
+    by_ppn: dict[Optional[str], BookData] = {}
+    for b in candidates:
+        key = getattr(b, "ppn", None)
+        prev = by_ppn.get(key)
+        if prev is None:
+            by_ppn[key] = b
+            continue
+
+        # Compute preference score for both
+        def ppn_pref_score(x: BookData) -> tuple[int, int]:
+            # (signature matches swb, has signature)
+            return (1 if sig_matches_swb(x) else 0, 1 if has_sig(x) else 0)
+
+        if ppn_pref_score(b) > ppn_pref_score(prev):
+            by_ppn[key] = b
+
+    deduped = list(by_ppn.values())
+    if not deduped:
+        return None
+
+    # 3) If multiple remain, keep only the latest one.
+    # Order: year desc, edition_number desc, signature-match desc, has-signature desc
+    def sort_key(b: BookData):
+        year = b.year if b.year is not None else -1
+        ed = b.edition_number if b.edition_number is not None else -1
+        sig_match = 1 if sig_matches_swb(b) else 0
+        sig_present = 1 if has_sig(b) else 0
+        return (year, ed, sig_match, sig_present)
+
+    best = max(deduped, key=sort_key)
+    return [best] if best else None
+
+
+class Api:
+    def __init__(self, site: str, url: str, prefix: str):
+        self.site = site
+        self.url = url
+        self.prefix = prefix
+        pass

    def get(self, query_args: Iterable[str]) -> List[Record]:
        # if any query_arg ends with =, remove it
-        query_args = [arg for arg in query_args if not arg.endswith("=")]
+        if self.site == "DNB":
+            args = [arg for arg in query_args if not arg.startswith("pica.")]
+            if args == []:
+                raise ValueError("DNB queries must include at least one search term")
+            query_args = args
+        # query_args = [f"{self.prefix}{arg}" for arg in query_args]
        query = "+and+".join(query_args)
        query = query.replace(" ", "%20").replace("&", "%26")
-
+        # query_args = [arg for arg in query_args if not arg.endswith("=")]
+        # query = "+and+".join(query_args)
+        # query = query.replace(" ", "%20").replace("&", "%26")
+        # insert the query into the URL template
        url = self.url.format(query)

        log.debug(url)
        headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+            "User-Agent": f"{self.site} SRU Client, <alexander.kirchner@ph-freiburg.de>",
            "Accept": "application/xml",
            "Accept-Charset": "latin1,utf-8;q=0.7,*;q=0.3",
        }
        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            raise Exception(f"Error fetching data from SWB: {response.status_code}")
-        # #print(response.text)
        data = response.content

        # extract top-level response
@@ -456,6 +591,7 @@ class SWB:

    def getBooks(self, query_args: Iterable[str]) -> List[BookData]:
        records: List[Record] = self.get(query_args)
+        print(f"{self.site} found {len(records)} records")
        books: List[BookData] = []
        # extract title from query_args if present
        title = None
@@ -476,3 +612,11 @@ class SWB:

    def getLinkForBook(self, book: BookData) -> str:
        results = self.getBooks()
+
+
+class SWB(Api):
+    def __init__(self):
+        self.site = SWBData.NAME.value
+        self.url = SWBData.URL.value
+        self.prefix = SWBData.ARGSCHEMA.value
+        super().__init__(self.site, self.url, self.prefix)
--- a/src/logic/init.py
+++ b/src/logic/init.py
@@ -1,6 +1,35 @@
-from .dataclass import ApparatData, BookData, Prof, Apparat, ELSA
+__all__ = [
+    "custom_sort",
+    "sort_semesters_list",
+    "APP_NRS",
+    "PROF_TITLES",
+    "SEMAP_MEDIA_ACCOUNTS",
+    "csv_to_list",
+    "ELSA",
+    "Apparat",
+    "ApparatData",
+    "BookData",
+    "Prof",
+    "Semester",
+    "SemapDocument",
+    "elsa_word_to_csv",
+    "pdf_to_semap",
+    "word_docx_to_csv",
+    "word_to_semap",
+    "ZoteroController",
+    "eml_to_semap",
+]
 from .c_sort import custom_sort, sort_semesters_list
 from .constants import APP_NRS, PROF_TITLES, SEMAP_MEDIA_ACCOUNTS
 from .csvparser import csv_to_list
-from .wordparser import elsa_word_to_csv, word_docx_to_csv, word_to_semap, SemapDocument
+from .dataclass import ELSA, Apparat, ApparatData, BookData, Prof
+from .semester import Semester
+from .wordparser import (
+    SemapDocument,
+    elsa_word_to_csv,
+    pdf_to_semap,
+    word_docx_to_csv,
+    word_to_semap,
+)
+from .xmlparser import eml_to_semap
 from .zotero import ZoteroController
--- a/src/logic/c_sort.py
+++ b/src/logic/c_sort.py
@@ -83,4 +83,4 @@ if __name__ == "__main__":
        "SoSe 25",
    ]

-    print(sort_semesters_list(unsorted))
+    # print(sort_semesters_list(unsorted))
--- a/src/logic/csvparser.py
+++ b/src/logic/csvparser.py
@@ -1,4 +1,5 @@
 import csv
+
 from charset_normalizer import detect


@@ -19,4 +20,4 @@ def csv_to_list(path: str) -> list[str]:
 if __name__ == "__main__":
    text = csv_to_list("C:/Users/aky547/Desktop/semap/71.csv")
    # remove linebreaks
-    # print(text)
+    # #print(text)
--- a/src/logic/dataclass.py
+++ b/src/logic/dataclass.py
@@ -3,6 +3,11 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Optional, Union

+import regex
+
+from src.logic.openai import name_tester, run_shortener, semester_converter
+from src.logic.semester import Semester
+

@dataclass
 class Prof:
@@ -67,21 +72,63 @@ class BookData:
    language: Union[str, list[str], None] = field(default_factory=list)
    publisher: str | None = None
    place: str | None = None
-    year: str | None = None
+    year: int | None = None
    pages: str | None = None
-    library_location: int | None = None
+    library_location: str | None = None
    in_apparat: bool | None = False
    adis_idn: str | None = None
+    old_book: Any | None = None
+    media_type: str | None = None  #
+    in_library: bool | None = None  # whether the book is in the library or not
+
+    def __post_init__(self):
+        self.library_location = (
+            str(self.library_location) if self.library_location else None
+        )
+        if isinstance(self.language, list) and self.language:
+            self.language = [lang.strip() for lang in self.language if lang.strip()]
+            self.language = ",".join(self.language)
+        self.year = regex.sub(r"[^\d]", "", str(self.year)) if self.year else None
+        self.in_library = True if self.signature else False

    def from_dict(self, data: dict) -> "BookData":
        for key, value in data.items():
            setattr(self, key, value)
        return self

+    def merge(self, other: "BookData") -> "BookData":
+        for key, value in other.__dict__.items():
+            # merge lists, if the attribute is a list, extend it
+            if isinstance(value, list):
+                current_value = getattr(self, key)
+                if current_value is None:
+                    current_value = []
+                elif not isinstance(current_value, list):
+                    current_value = [current_value]
+                # extend the list with the new values, but only if they are not already in the list
+                for v in value:
+                    if v not in current_value:
+                        current_value.append(v)
+                setattr(self, key, current_value)
+            if value is not None and (
+                getattr(self, key) is None or getattr(self, key) == ""
+            ):
+                setattr(self, key, value)
+        # in language, drop all entries that are longer than 4 characters
+        if isinstance(self.language, list):
+            self.language = [lang for lang in self.language if len(lang) <= 4]
+        return self
+
    @property
    def to_dict(self) -> str:
        """Convert the dataclass to a dictionary."""
-        return json.dumps(self.__dict__, ensure_ascii=False)
+        data_dict = {
+            key: value for key, value in self.__dict__.items() if value is not None
+        }
+        # remove old_book from data_dict
+        if "old_book" in data_dict:
+            del data_dict["old_book"]
+        return json.dumps(data_dict, ensure_ascii=False)

    def from_dataclass(self, dataclass: Optional[Any]) -> None:
        if dataclass is None:
@@ -89,8 +136,15 @@ class BookData:
        for key, value in dataclass.__dict__.items():
            setattr(self, key, value)

+    def get_book_type(self) -> str:
+        if "Online" in self.pages:
+            return "eBook"
+        else:
+            return "Druckausgabe"
+
    def from_string(self, data: str) -> "BookData":
        ndata = json.loads(data)
+
        return BookData(**ndata)

    def from_LehmannsSearchResult(self, result: Any) -> "BookData":
@@ -111,6 +165,15 @@ class BookData:
        # self.pages = str(result.pages) if result.pages else None
        return self

+    @property
+    def edition_number(self) -> Optional[int]:
+        if self.edition is None:
+            return 0
+        match = regex.search(r"(\d+)", self.edition)
+        if match:
+            return int(match.group(1))
+        return 0
+

@dataclass
 class MailData:
@@ -222,3 +285,124 @@ class ELSA:
 class ApparatData:
    prof: Prof = field(default_factory=Prof)
    apparat: Apparat = field(default_factory=Apparat)
+
+
+@dataclass
+class XMLMailSubmission:
+    name: Optional[str] = None
+    lastname: Optional[str] = None
+    title: Optional[str] = None
+    telno: Optional[int] = None
+    email: Optional[str] = None
+    app_name: Optional[str] = None
+    subject: Optional[str] = None
+    semester: Optional[Semester] = None
+    books: Optional[list[BookData]] = None
+
+
+@dataclass
+class Book:
+    author: str = None
+    year: str = None
+    edition: str = None
+    title: str = None
+    location: str = None
+    publisher: str = None
+    signature: str = None
+    internal_notes: str = None
+
+    @property
+    def has_signature(self) -> bool:
+        return self.signature is not None and self.signature != ""
+
+    @property
+    def is_empty(self) -> bool:
+        return all(
+            [
+                self.author == "",
+                self.year == "",
+                self.edition == "",
+                self.title == "",
+                self.location == "",
+                self.publisher == "",
+                self.signature == "",
+                self.internal_notes == "",
+            ]
+        )
+
+    def from_dict(self, data: dict[str, Any]):
+        for key, value in data.items():
+            value = value.strip()
+            if value == "\u2002\u2002\u2002\u2002\u2002":
+                value = ""
+
+            if key == "Autorenname(n):Nachname, Vorname":
+                self.author = value
+            elif key == "Jahr/Auflage":
+                self.year = value.split("/")[0] if "/" in value else value
+                self.edition = value.split("/")[1] if "/" in value else ""
+            elif key == "Titel":
+                self.title = value
+            elif key == "Ort und Verlag":
+                self.location = value.split(",")[0] if "," in value else value
+                self.publisher = value.split(",")[1] if "," in value else ""
+            elif key == "Standnummer":
+                self.signature = value.strip()
+            elif key == "Interne Vermerke":
+                self.internal_notes = value
+
+
+@dataclass
+class SemapDocument:
+    subject: str = None
+    phoneNumber: int = None
+    mail: str = None
+    title: str = None
+    title_suggestions: list[str] = None
+    semester: Union[str, Semester] = None
+    books: list[Book] = None
+    eternal: bool = False
+    personName: str = None
+    personTitle: str = None
+    title_length = 0
+    title_max_length = 0
+
+    def __post_init__(self):
+        self.title_suggestions = []
+
+    @property
+    def nameSetter(self):
+        data = name_tester(self.personTitle)
+        name = f"{data['last_name']}, {data['first_name']}"
+        if data["title"] is not None:
+            title = data["title"]
+            self.personTitle = title
+        self.personName = name
+        self.title_length = len(self.title) + 3 + len(self.personName.split(",")[0])
+        if self.title_length > 40:
+            name_len = len(self.personName.split(",")[0])
+            self.title_max_length = 38 - name_len
+            suggestions = run_shortener(self.title, self.title_max_length)
+            for suggestion in suggestions:
+                self.title_suggestions.append(suggestion["shortened_string"])
+        else:
+            self.title_suggestions = []
+        pass
+
+    @property
+    def renameSemester(self) -> None:
+        if self.semester:
+            if ", Dauer" in self.semester:
+                self.semester = self.semester.split(",")[0]
+                self.eternal = True
+                self.semester = Semester().from_string(self.semester)
+            else:
+                self.semester = Semester().from_string(
+                    semester_converter(self.semester)
+                )
+
+    @property
+    def signatures(self) -> list[str]:
+        if self.books is not None:
+            return [book.signature for book in self.books if book.has_signature]
+        return []
--- a/src/logic/lehmannsapi.py
+++ b/src/logic/lehmannsapi.py
@@ -1,13 +1,15 @@
 from __future__ import annotations

 import re
-from dataclasses import dataclass, asdict, field
-from typing import Optional, List, Iterable
-from urllib.parse import urljoin, quote_plus
+from dataclasses import asdict, dataclass, field
+from typing import Iterable, List, Optional
+from urllib.parse import quote_plus, urljoin

 import httpx
 from bs4 import BeautifulSoup

+from src.logic.dataclass import BookData
+
 BASE = "https://www.lehmanns.de"
 SEARCH_URL = "https://www.lehmanns.de/search/quick?mediatype_id=&q="

@@ -33,9 +35,11 @@ class LehmannsSearchResult:
    image: Optional[str] = None

    # From detail page:
-    pages: Optional[str] = None              # "<N> Seiten"
-    buyable: bool = True                     # set in enrich_pages (detail page)
-    unavailable_hint: Optional[str] = None   # e.g. "Titel ist leider vergriffen; keine Neuauflage"
+    pages: Optional[str] = None  # "<N> Seiten"
+    buyable: bool = True  # set in enrich_pages (detail page)
+    unavailable_hint: Optional[str] = (
+        None  # e.g. "Titel ist leider vergriffen; keine Neuauflage"
+    )

    def to_dict(self) -> dict:
        return asdict(self)
@@ -73,31 +77,45 @@ class LehmannsClient:
        # spaces -> '+'
        return SEARCH_URL + quote_plus(title)

-    def search_by_title(self, title: str, limit: Optional[int] = None, strict: bool = False) -> List[LehmannsSearchResult]:
+    def search_by_title(
+        self,
+        title: str,
+        limit: Optional[int] = None,
+        strict: bool = False,
+        only_latest: bool = True,
+    ) -> List[BookData]:
        """
        Parse the listing page only (no availability check here).
        Use enrich_pages(...) afterwards to fetch detail pages, add 'pages',
        and drop unbuyable items.
        """
-        url = self.build_search_url(title)
+        url = self.build_search_url(title=title)
        html = self._get(url)
        if not html:
            return []
        results = self._parse_results(html)
        self.enrich_pages(results)
+
+        results = [BookData().from_LehmannsSearchResult(r) for r in results]
        if strict:
            # filter results to only those with exact title match (case-insensitive)
            title_lower = title.lower()
            results = [r for r in results if r.title and r.title.lower() == title_lower]
-            results = [r for r in results if r.buyable]
+            # results = [r for r in results if r.buyable]
            return results
        if limit is not None:
-            results = results[:max(0, limit)]
+            results = results[: max(0, limit)]
+        if only_latest and len(results) > 1:
+            # keep only the latest edition (highest edition number)
+            results.sort(key=lambda r: (r.edition_number or 0), reverse=True)
+            results = [results[0]]
        return results

    # ------------------- Detail enrichment & filtering -------------------

-    def enrich_pages(self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True) -> List[LehmannsSearchResult]:
+    def enrich_pages(
+        self, results: Iterable[LehmannsSearchResult], drop_unbuyable: bool = True
+    ) -> List[LehmannsSearchResult]:
        """
        Fetch each result.url, extract:
          - pages: from <span class="book-meta meta-seiten" itemprop="numberOfPages">...</span>
@@ -135,11 +153,15 @@ class LehmannsClient:
                # Availability via li.availability-3
                avail_li = soup.select_one("li.availability-3")
                if avail_li:
-                    avail_text = " ".join(avail_li.get_text(" ", strip=True).split()).lower()
+                    avail_text = " ".join(
+                        avail_li.get_text(" ", strip=True).split()
+                    ).lower()
                    if "titel ist leider vergriffen" in avail_text:
                        r.buyable = False
                        if "keine neuauflage" in avail_text:
-                            r.unavailable_hint = "Titel ist leider vergriffen; keine Neuauflage"
+                            r.unavailable_hint = (
+                                "Titel ist leider vergriffen; keine Neuauflage"
+                            )
                        else:
                            r.unavailable_hint = "Titel ist leider vergriffen"

@@ -161,7 +183,9 @@ class LehmannsClient:
        try:
            r = self.client.get(url)
            r.encoding = "utf-8"
-            if r.status_code == 200 and "text/html" in (r.headers.get("content-type") or ""):
+            if r.status_code == 200 and "text/html" in (
+                r.headers.get("content-type") or ""
+            ):
                return r.text
        except httpx.HTTPError:
            pass
@@ -176,12 +200,18 @@ class LehmannsClient:
            if not a:
                continue
            url = urljoin(BASE, a["href"].strip())
-            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(strip=True)
+            base_title = (block.select_one(".title [itemprop='name']") or a).get_text(
+                strip=True
+            )

            # Alternative headline => extend title
            alt_tag = block.select_one(".description[itemprop='alternativeHeadline']")
            alternative_headline = alt_tag.get_text(strip=True) if alt_tag else None
-            title = f"{base_title} : {alternative_headline}" if alternative_headline else base_title
+            title = (
+                f"{base_title} : {alternative_headline}"
+                if alternative_headline
+                else base_title
+            )
            description = alternative_headline

            # Authors from .author
@@ -227,7 +257,9 @@ class LehmannsClient:

            # Publisher
            publisher = None
-            pub = block.select_one(".publisherprop [itemprop='name']") or block.select_one(".publisher [itemprop='name']")
+            pub = block.select_one(
+                ".publisherprop [itemprop='name']"
+            ) or block.select_one(".publisher [itemprop='name']")
            if pub:
                publisher = pub.get_text(strip=True)

--- a/src/logic/pdfparser.py
+++ b/src/logic/pdfparser.py
@@ -21,4 +21,4 @@ if __name__ == "__main__":
    text = pdf_to_csv("54_pdf.pdf")
    # remove linebreaks
    text = text.replace("\n", "")
-    print(text)
+    # print(text)
--- a/src/logic/wordparser.py
+++ b/src/logic/wordparser.py
@@ -1,16 +1,15 @@
 import sys
 import zipfile
-from dataclasses import dataclass
-from typing import Any, Union
+from typing import Any

+import fitz  # PyMuPDF
 import loguru
 import pandas as pd
 from bs4 import BeautifulSoup
 from docx import Document

 from src import LOG_DIR
-from src.backend.semester import Semester
-from src.logic.openai import name_tester, run_shortener, semester_converter
+from src.logic.dataclass import Book, SemapDocument

 log = loguru.logger
 log.remove()
@@ -18,116 +17,6 @@ log.add(sys.stdout, level="INFO")
 log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")


-letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
-
-
-@dataclass
-class Book:
-    author: str = None
-    year: str = None
-    edition: str = None
-    title: str = None
-    location: str = None
-    publisher: str = None
-    signature: str = None
-    internal_notes: str = None
-
-    @property
-    def has_signature(self) -> bool:
-        return self.signature is not None and self.signature != ""
-
-    @property
-    def is_empty(self) -> bool:
-        return all(
-            [
-                self.author == "",
-                self.year == "",
-                self.edition == "",
-                self.title == "",
-                self.location == "",
-                self.publisher == "",
-                self.signature == "",
-                self.internal_notes == "",
-            ]
-        )
-
-    def from_dict(self, data: dict[str, Any]):
-        for key, value in data.items():
-            value = value.strip()
-            if value == "\u2002\u2002\u2002\u2002\u2002":
-                value = ""
-
-            if key == "Autorenname(n):Nachname, Vorname":
-                self.author = value
-            elif key == "Jahr/Auflage":
-                self.year = value.split("/")[0] if "/" in value else value
-                self.edition = value.split("/")[1] if "/" in value else ""
-            elif key == "Titel":
-                self.title = value
-            elif key == "Ort und Verlag":
-                self.location = value.split(",")[0] if "," in value else value
-                self.publisher = value.split(",")[1] if "," in value else ""
-            elif key == "Standnummer":
-                self.signature = value.strip()
-            elif key == "Interne Vermerke":
-                self.internal_notes = value
-
-
-@dataclass
-class SemapDocument:
-    subject: str = None
-    phoneNumber: int = None
-    mail: str = None
-    title: str = None
-    title_suggestions: list[str] = None
-    semester: Union[str, Semester] = None
-    books: list[Book] = None
-    eternal: bool = False
-    personName: str = None
-    personTitle: str = None
-    title_length = 0
-    title_max_length = 0
-
-    def __post_init__(self):
-        self.title_suggestions = []
-
-    @property
-    def nameSetter(self):
-        data = name_tester(self.personTitle)
-        name = f"{data['last_name']}, {data['first_name']}"
-        if data["title"] is not None:
-            title = data["title"]
-            self.personTitle = title
-        self.personName = name
-        self.title_length = len(self.title) + 3 + len(self.personName.split(",")[0])
-        if self.title_length > 40:
-            log.warning("Title is too long")
-            name_len = len(self.personName.split(",")[0])
-            self.title_max_length = 38 - name_len
-            suggestions = run_shortener(self.title, self.title_max_length)
-            for suggestion in suggestions:
-                self.title_suggestions.append(suggestion["shortened_string"])
-        else:
-            self.title_suggestions = []
-        pass
-
-    @property
-    def renameSemester(self) -> None:
-        if ", Dauer" in self.semester:
-            self.semester = self.semester.split(",")[0]
-            self.eternal = True
-            self.semester = Semester().from_string(self.semester)
-        else:
-            log.warning("Semester {} is not valid", self.semester)
-            self.semester = Semester().from_string(semester_converter(self.semester))
-
-    @property
-    def signatures(self) -> list[str]:
-        if self.books is not None:
-            return [book.signature for book in self.books if book.has_signature]
-        return []
-
-
 def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
    doc = Document(path)
    tables = doc.tables
@@ -272,7 +161,7 @@ def word_to_semap(word_path: str, ai: bool = True) -> SemapDocument:
    apparatdata = df[0]
    apparatdata = apparatdata.to_dict()
    keys = list(apparatdata.keys())
-    print(apparatdata, keys)
+    # print(apparatdata, keys)

    appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys) - 1, 2)}
    semap.phoneNumber = appdata["Telefon:"]
@@ -309,6 +198,195 @@ def word_to_semap(word_path: str, ai: bool = True) -> SemapDocument:
    return semap


+def pdf_to_semap(pdf_path: str, ai: bool = True) -> SemapDocument:
+    """Parse a Semesterapparat PDF form and return it as a SemapDocument.
+
+    Only PyMuPDF is used, no external programs. Field values that wrap over
+    several lines (e.g. hyphenated mail addresses) and multi-line table cells
+    are re-joined. The book table is read from every page; its column headers
+    are looked up on the first page only.
+
+    Args:
+        pdf_path: Path of the PDF file to parse.
+        ai: When true, trigger the ``renameSemester`` and ``nameSetter``
+            post-processing properties of the resulting document.
+
+    Returns:
+        The populated SemapDocument; ``books`` holds only rows that are not
+        empty and carry a signature.
+    """
+    semap = SemapDocument()
+
+    # ---------- helpers ----------
+    def _join_tokens(tokens: list[str]) -> str:
+        """Join tokens, preserving hyphen/URL joins across line wraps."""
+        parts = []
+        for tok in tokens:
+            if parts and parts[-1].endswith(("-", "/", ":")):
+                parts[-1] = parts[-1] + tok  # no space after '-', '/' or ':'
+            else:
+                parts.append(tok)
+        return " ".join(parts).strip()
+
+    def _extract_row_values_multiline(
+        page, labels: list[str], y_window: float = 24
+    ) -> dict[str, str]:
+        """For a row of inline labels (e.g., Name/Fach/Telefon/Mail), grab text to the right of each label."""
+        rects = []
+        for lab in labels:
+            hits = page.search_for(lab)
+            if hits:
+                rects.append((lab, hits[0]))
+        if not rects:
+            return {}
+
+        rects.sort(key=lambda t: t[1].x0)
+        words = page.get_text("words")
+        out = {}
+        for i, (lab, r) in enumerate(rects):
+            x0 = r.x1 + 1
+            x1 = rects[i + 1][1].x0 - 1 if i + 1 < len(rects) else page.rect.width - 5
+            y0 = r.y0 - 3
+            y1 = r.y0 + y_window
+            toks = [w for w in words if x0 <= w[0] <= x1 and y0 <= w[1] <= y1]
+            toks.sort(key=lambda w: (w[1], w[0]))  # line, then x
+            out[lab] = _join_tokens([w[4] for w in toks])
+        return out
+
+    def _compute_columns_from_headers(page0):
+        """Find column headers (once) and derive column centers + header baseline."""
+        headers = [
+            ("Autorenname(n):", "Autorenname(n):Nachname, Vorname"),
+            ("Jahr/Auflage", "Jahr/Auflage"),
+            ("Titel", "Titel"),
+            ("Ort und Verlag", "Ort und Verlag"),
+            ("Standnummer", "Standnummer"),
+            ("Interne Vermerke", "Interne Vermerke"),
+        ]
+        found = []
+        for label, canon in headers:
+            rects = [
+                r for r in page0.search_for(label) if r.y0 > 200
+            ]  # skip top-of-form duplicates
+            if rects:
+                found.append((canon, rects[0]))
+        found.sort(key=lambda t: t[1].x0)
+        cols = [(canon, r.x0, r.x1, (r.x0 + r.x1) / 2.0) for canon, r in found]
+        header_y = min(r.y0 for _, r in found) if found else 0
+        return cols, header_y
+
+    def _extract_table_rows_from_page(
+        page, cols, header_y, y_top_margin=5, y_bottom_margin=40, y_tol=26.0
+    ):
+        """
+        Group words into logical rows (tolerant to wrapped lines), then map each word
+        to the nearest column by x-center and join tokens per column.
+        """
+        if not cols:
+            # no header located: nothing to map words to (min() below would raise)
+            return []
+
+        words = [
+            w
+            for w in page.get_text("words")
+            if w[1] > header_y + y_top_margin
+            and w[3] < page.rect.height - y_bottom_margin
+        ]
+
+        # group into row bands by y (tolerance big enough to capture wrapped lines, but below next row gap)
+        rows = []
+        for w in sorted(words, key=lambda w: w[1]):
+            y = w[1]
+            for row in rows:
+                if abs(row["y_mean"] - y) <= y_tol:
+                    row["ys"].append(y)
+                    row["y_mean"] = sum(row["ys"]) / len(row["ys"])
+                    row["words"].append(w)
+                    break
+            else:
+                rows.append({"y_mean": y, "ys": [y], "words": [w]})
+
+        # map to columns + join
+        joined_rows = []
+        for row in rows:
+            rowdict = {canon: "" for canon, *_ in cols}
+            words_by_col = {canon: [] for canon, *_ in cols}
+            for w in sorted(row["words"], key=lambda w: (w[1], w[0])):
+                xmid = (w[0] + w[2]) / 2.0
+                canon = min(cols, key=lambda c: abs(xmid - c[3]))[0]
+                words_by_col[canon].append(w[4])
+            for canon, toks in words_by_col.items():
+                rowdict[canon] = _join_tokens(toks)
+            if any(v for v in rowdict.values()):
+                joined_rows.append(rowdict)
+        return joined_rows
+
+    # The context manager closes the PDF even when parsing raises.
+    with fitz.open(pdf_path) as doc:
+        # ---------- top-of-form fields ----------
+        p0 = doc[0]
+        row1 = _extract_row_values_multiline(
+            p0,
+            ["Ihr Name und Titel:", "Ihr Fach:", "Telefon:", "Mailadresse:"],
+            y_window=22,
+        )
+        row2 = _extract_row_values_multiline(
+            p0, ["Veranstaltung:", "Semester:"], y_window=20
+        )
+
+        # ---------- table extraction (all pages) ----------
+        # NOTE(review): header_y comes from the first page but filters every page;
+        # confirm that rows on follow-up pages never start above that position.
+        cols, header_y = _compute_columns_from_headers(p0)
+        all_rows: list[dict[str, str]] = []
+        for page in doc:
+            all_rows.extend(_extract_table_rows_from_page(page, cols, header_y))
+
+    name_title = row1.get("Ihr Name und Titel:", "") or ""
+    semap.subject = row1.get("Ihr Fach:", None)
+    semap.phoneNumber = row1.get("Telefon:", None)  # keep as-is (string like "682-308")
+    semap.mail = row1.get("Mailadresse:", None)
+    semap.personName = ",".join(name_title.split(",")[:-1]) if name_title else None
+    semap.personTitle = (
+        ",".join(name_title.split(",")[-1:]).strip() if name_title else None
+    )
+
+    semap.title = row2.get("Veranstaltung:", None)
+    semap.semester = row2.get("Semester:", None)
+
+    # drop the sub-header line "Nachname, Vorname" etc.
+    filtered = []
+    for r in all_rows:
+        if r.get("Autorenname(n):Nachname, Vorname", "").strip() in (
+            "",
+            "Nachname, Vorname",
+        ):
+            # skip if it's just the sub-header line
+            if all(not r[c] for c in r if c != "Autorenname(n):Nachname, Vorname"):
+                continue
+        filtered.append(r)
+
+    # build Book objects; empty rows and rows without a signature are dropped
+    booklist: list[Book] = []
+    for row in filtered:
+        b = Book()
+        b.from_dict(row)
+        if b.is_empty:
+            continue
+        if not b.has_signature:
+            continue
+        booklist.append(b)
+
+    semap.books = booklist
+
+    # the properties are read only for their side effects; the values are discarded
+    if ai:
+        _ = semap.renameSemester
+        _ = semap.nameSetter
+
+    return semap
+
+
 if __name__ == "__main__":
-    else_df = word_to_semap("C:/Users/aky547/Desktop/semap/db/temp/tmpzsz_hgdr.docx")
-    print(else_df)
+    else_df = pdf_to_semap("C:/Users/aky547/Dokumente/testsemap.pdf")
+    # print(else_df)
--- a/src/logic/xmlparser.py
+++ b/src/logic/xmlparser.py
@@ -0,0 +1,103 @@
+import xml.etree.ElementTree as ET
+
+from src.logic.dataclass import Apparat, BookData, SemapDocument, XMLMailSubmission
+from src.logic.semester import Semester
+
+
+def parse_xml_submission(xml_string: str) -> XMLMailSubmission:
+    """Parse the XML of a mail submission into an XMLMailSubmission.
+
+    Args:
+        xml_string: XML document with a ``static`` section (sender and
+            apparatus data) and an optional ``books`` section.
+
+    Returns:
+        The submission, with one BookData per child element of ``books``.
+
+    Raises:
+        ValueError: If the ``static`` section is missing, the semester is not
+            of the form "<term> <year>", or ``telno``/the semester year is
+            not an integer.
+        xml.etree.ElementTree.ParseError: If ``xml_string`` is malformed.
+    """
+    # NOTE(security): ElementTree is not hardened against maliciously
+    # constructed XML, and mail content is external input.
+    root = ET.fromstring(xml_string)
+    static_data = root.find("static")
+    if static_data is None:
+        raise ValueError("XML submission has no <static> section")
+    static_info = {child.tag: child.text for child in static_data}
+
+    books = root.find("books")
+    books_info = []
+    # A missing <books> section is treated as an empty book list.
+    for book_elem in books if books is not None else ():
+        book_details = {detail.tag: detail.text for detail in book_elem}
+        # "year" may carry the edition as "<year>/<edition>". An empty element
+        # has text None, which must not be searched for "/".
+        raw_year = book_details.get("year")
+        if raw_year and "/" in raw_year:
+            year_parts = raw_year.split("/")
+            year, edition = year_parts[0], year_parts[1]
+        else:
+            year, edition = raw_year, None
+        books_info.append(
+            BookData(
+                author=book_details.get("authorname"),
+                year=year,
+                edition=edition,
+                title=book_details.get("title"),
+                signature=book_details.get("signature"),
+            )
+        )
+
+    semester_parts = (static_info.get("semester") or "").split()
+    if len(semester_parts) < 2:
+        raise ValueError("semester must have the form '<term> <year>'")
+    sem_term, sem_year = semester_parts[0], semester_parts[1]
+    telno = static_info.get("telno")
+
+    submission = XMLMailSubmission()
+    submission.name = static_info.get("name")
+    submission.lastname = static_info.get("lastname")
+    submission.title = static_info.get("title")
+    # An absent phone number stays None instead of crashing int().
+    submission.telno = int(telno) if telno else None
+    submission.email = static_info.get("mail")
+    submission.app_name = static_info.get("apparatsname")
+    submission.subject = static_info.get("subject")
+    submission.semester = Semester(semester=sem_term, year=int(sem_year))
+    submission.books = books_info
+    return submission
+
+
+def eml_parser(path: str) -> XMLMailSubmission:
+    """Read an .eml file and parse its body as an XML submission.
+
+    Everything after the first blank line is treated as the XML body.
+    NOTE(review): this assumes a single-part, unencoded body -- confirm
+    against the mails that are actually received.
+
+    Raises:
+        ValueError: If no blank line separates the headers from the body.
+    """
+    with open(path, "r", encoding="utf-8") as file:
+        _headers, separator, xml_content = file.read().partition("\n\n")
+    if not separator:
+        raise ValueError(f"No header/body separator found in {path}")
+    return parse_xml_submission(xml_content)
+
+
+def eml_to_semap(path: str) -> SemapDocument:
+    """Convert an .eml submission file into a SemapDocument.
+
+    Apparatus name, subject, semester and books are carried over; the
+    sender data of the submission is not passed on.
+    """
+    submission = eml_parser(path)
+    return SemapDocument(
+        # prof=Prof(name=submission.name, lastname=submission.lastname, email=submission.email),
+        apparat=Apparat(name=submission.app_name, subject=submission.subject),
+        semester=submission.semester,
+        books=submission.books,
+    )
--- a/src/logic/zotero.py
+++ b/src/logic/zotero.py
@@ -1,7 +1,9 @@
-from pyzotero import zotero
 from dataclasses import dataclass
-from src.logic.webrequest import WebRequest, BibTextTransformer
+
+from pyzotero import zotero
+
 from src import settings
+from src.logic.webrequest import BibTextTransformer, WebRequest


@dataclass
@@ -187,7 +189,7 @@ class ZoteroController:
        book = bib.return_data()
        return book

-    # # print(zot.item_template("bookSection"))
+    # # #print(zot.item_template("bookSection"))
    def createBook(self, isbn):
        book = self.__get_data(isbn)

@@ -210,7 +212,7 @@ class ZoteroController:
    def createItem(self, item):
        resp = self.zot.create_items([item])
        if "successful" in resp.keys():
-            # print(resp["successful"]["0"]["key"])
+            # #print(resp["successful"]["0"]["key"])
            return resp["successful"]["0"]["key"]
        else:
            return None
@@ -220,7 +222,7 @@ class ZoteroController:
        for item in items:
            if item["key"] == key:
                self.zot.delete_item(item)
-                # print(item)
+                # #print(item)
                break

    def createHGSection(self, book: Book, data: dict):
@@ -241,7 +243,7 @@ class ZoteroController:
        ]
        chapter.creators += authors

-        # print(chapter.to_dict())
+        # #print(chapter.to_dict())
        return self.createItem(chapter.to_dict())
        pass

@@ -257,7 +259,7 @@ class ZoteroController:
        # chapter.creators

    def createJournalArticle(self, journal, article):
-        # print(type(article))
+        # #print(type(article))
        journalarticle = JournalArticle()
        journalarticle.assign(journal)
        journalarticle.itemType = "journalArticle"
@@ -273,7 +275,7 @@ class ZoteroController:
        journalarticle.issue = article["issue"]
        journalarticle.url = article["isbn"]

-        # print(journalarticle.to_dict())
+        # #print(journalarticle.to_dict())

        return self.createItem(journalarticle.to_dict())

@@ -319,16 +321,16 @@ if __name__ == "__main__":
    # if isinstance(publishers, str):
    #     publishers = [publishers]
    # for publisher in publishers:
-    #     # print(publisher)
+    #     # #print(publisher)
    #     creator = Creator().from_string(publisher)
    #     creator.creatorType = "editor"
    #     authors.append(creator.__dict__)

    # chapter.creators = authors
    # chapter.publisher = book.publisher
-    # # print(chapter.to_dict())
+    # # #print(chapter.to_dict())
    # createBookSection(chapter.to_dict())
    # get_citation("9ZXH8DDE")
-    # # # print()
-    # # print(get_books())
-    # # print(zot.item_creator_types("bookSection"))
+    # # # #print()
+    # # #print(get_books())
+    # # #print(zot.item_creator_types("bookSection"))