add SemapDocument and Book dataclasses, improve word document parsing
This commit is contained in:
@@ -1,12 +1,93 @@
|
||||
import pandas as pd
|
||||
from docx import Document
|
||||
from dataclasses import dataclass
|
||||
|
||||
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||
|
||||
|
||||
def word_docx_to_csv(path) -> pd.DataFrame:
|
||||
@dataclass
|
||||
class Book:
|
||||
author: str = None
|
||||
year: str = None
|
||||
edition: str = None
|
||||
title: str = None
|
||||
location: str = None
|
||||
publisher: str = None
|
||||
signature: str = None
|
||||
internal_notes: str = None
|
||||
|
||||
@property
|
||||
def has_signature(self) -> bool:
|
||||
return self.signature is not None and self.signature != ""
|
||||
|
||||
@property
|
||||
def is_empty(self) -> bool:
|
||||
return all(
|
||||
[
|
||||
self.author == "",
|
||||
self.year == "",
|
||||
self.edition == "",
|
||||
self.title == "",
|
||||
self.location == "",
|
||||
self.publisher == "",
|
||||
self.signature == "",
|
||||
self.internal_notes == "",
|
||||
]
|
||||
)
|
||||
|
||||
def from_dict(self, data: dict):
|
||||
for key, value in data.items():
|
||||
if value == "\u2002\u2002\u2002\u2002\u2002":
|
||||
value = ""
|
||||
|
||||
if key == "Autorenname(n):Nachname, Vorname":
|
||||
self.author = value
|
||||
elif key == "Jahr/Auflage":
|
||||
self.year = value.split("/")[0] if "/" in value else value
|
||||
self.edition = value.split("/")[1] if "/" in value else ""
|
||||
elif key == "Titel":
|
||||
self.title = value
|
||||
elif key == "Ort und Verlag":
|
||||
self.location = value.split(",")[0] if "," in value else value
|
||||
self.publisher = value.split(",")[1] if "," in value else ""
|
||||
elif key == "Standnummer":
|
||||
self.signature = value
|
||||
elif key == "Interne Vermerke":
|
||||
self.internal_notes = value
|
||||
|
||||
|
||||
@dataclass
|
||||
class SemapDocument:
|
||||
subject: str = None
|
||||
phoneNumber: int = None
|
||||
mail: str = None
|
||||
title: str = None
|
||||
semester: str = None
|
||||
books: list[Book] = None
|
||||
|
||||
@property
|
||||
def renameSemester(self) -> None:
|
||||
if self.semester is not None:
|
||||
if "sommersemester" in self.semester.lower():
|
||||
year = self.semester.split(" ")[-1]
|
||||
self.semester = f"SoSe {year}"
|
||||
elif "wintersemester" in self.semester.lower():
|
||||
year = self.semester.split(" ")[-1]
|
||||
self.semester = f"WiSe {year}"
|
||||
|
||||
@property
|
||||
def signatures(self) -> list[str]:
|
||||
if self.books is not None:
|
||||
return [book.signature for book in self.books if book.has_signature]
|
||||
return []
|
||||
|
||||
|
||||
def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
||||
doc = Document(path)
|
||||
tables = doc.tables
|
||||
print("Tables: ", len(tables))
|
||||
|
||||
# print content of all tables
|
||||
|
||||
m_data = []
|
||||
for table in tables:
|
||||
@@ -24,8 +105,9 @@ def word_docx_to_csv(path) -> pd.DataFrame:
|
||||
|
||||
m_data.append(df)
|
||||
|
||||
df = m_data[2]
|
||||
return df
|
||||
# for df[0, 1]: merge i and i+1 as key, value
|
||||
|
||||
return m_data
|
||||
|
||||
|
||||
def makeDict():
|
||||
@@ -122,6 +204,46 @@ def elsa_word_to_csv(path):
|
||||
return tuple_to_dict(data, doctype), doctype
|
||||
|
||||
|
||||
def word_to_semap(word_path: str) -> SemapDocument:
|
||||
semap = SemapDocument()
|
||||
df = word_docx_to_csv(word_path)
|
||||
apparatdata = df[0]
|
||||
apparatdata = apparatdata.to_dict()
|
||||
keys = list(apparatdata.keys())
|
||||
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys), 2)}
|
||||
print(appdata.keys())
|
||||
semap.phoneNumber = appdata["Telefon:"]
|
||||
semap.subject = appdata["Ihr Fach:"]
|
||||
semap.mail = appdata["Mailadresse:"]
|
||||
apparatdata = df[1]
|
||||
apparatdata = apparatdata.to_dict()
|
||||
keys = list(apparatdata.keys())
|
||||
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys), 2)}
|
||||
semap.title = appdata["Veranstaltung:"]
|
||||
semap.semester = appdata["Semester:"]
|
||||
semap.renameSemester
|
||||
books = df[2]
|
||||
booklist = []
|
||||
for i in range(len(books)):
|
||||
if books.iloc[i].isnull().all():
|
||||
continue
|
||||
data = books.iloc[i].to_dict()
|
||||
book = Book()
|
||||
book.from_dict(data)
|
||||
if book.is_empty:
|
||||
continue
|
||||
elif not book.has_signature:
|
||||
continue
|
||||
else:
|
||||
booklist.append(book)
|
||||
|
||||
semap.books = booklist
|
||||
|
||||
return semap
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
else_df = elsa_word_to_csv("C:/Users/aky547/Desktop/Antrag ELSA Schweitzer.docx")
|
||||
# print(else_df)
|
||||
else_df = word_to_semap(
|
||||
"C:/Users/aky547/Desktop/SA 80 titelmeldung_SoSe2025 Burth.docx"
|
||||
)
|
||||
print(else_df)
|
||||
|
||||
Reference in New Issue
Block a user