"""Helpers for extracting table data from Word (.docx) forms.

Two kinds of documents are handled:

* ``word_to_semap`` reads a form with three tables (contact data, course
  data, book list) into a :class:`SemapDocument`.
* ``elsa_word_to_csv`` reads the first table of a form whose last non-empty
  paragraph names the document type and converts each row to a dict.

Importing this module reconfigures the loguru logger (see below).
"""

import math
import sys
from dataclasses import dataclass
from typing import Iterable, Optional, Sequence

import pandas as pd
from docx import Document
from loguru import logger as log

# ``log`` and ``logger`` are the same loguru logger object; both names are
# kept because either may be imported from this module.
logger = log
# Drop loguru's default handler, then register the sinks used here.
logger.remove()
logger.add("logs/wordparser.log", rotation="1 week", enqueue=True)
log.add(
    "logs/application.log",
    rotation="1 day",
    compression="zip",
    enqueue=True,
)
# logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")
logger.add(sys.stdout)

letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

# Column order of a table row for every supported document type.
# Position in the tuple == index into the row.
_FIELD_LAYOUTS = {
    "Monografien": (
        "work_author",
        "year",
        "edition",
        "work_title",
        "location",
        "publisher",
        "signature",
        "pages",
    ),
    "Herausgeberwerke": (
        "section_author",
        "year",
        "edition",
        "chapter_title",
        "work_author",
        "work_title",
        "location",
        "publisher",
        "pages",
        "signature",
    ),
    "Zeitschriftenaufsätze": (
        "section_author",
        "year",
        "issue",
        "chapter_title",
        "work_title",
        "location",
        "publisher",
        "pages",
        "signature",
    ),
}

# An entirely blank table row for every supported document type; such rows
# are dropped by ``elsa_word_to_csv``.
_BLANK_ROWS = {
    "Monografien": ("",) * 9,
    "Herausgeberwerke": ("",) * 11,
    "Zeitschriftenaufsätze": ("",) * 10,
}


def _clean_cell(value) -> str:
    """Return *value* as a stripped string; ``None``/NaN become ``""``.

    ``str.strip`` also removes EN SPACE (U+2002), so a cell that holds only
    placeholder en spaces is normalised to the empty string.
    """
    if isinstance(value, str):
        return value.strip()
    if value is None or (isinstance(value, float) and math.isnan(value)):
        return ""
    return str(value).strip()


def _split_pair(value: str, separator: str) -> tuple[str, str]:
    """Split *value* at the first *separator* into two stripped parts.

    Without a separator the whole value is the first part and the second
    part is ``""``.
    """
    first, _, second = value.partition(separator)
    return first.strip(), second.strip()


def _table_to_dataframe(table, remove: tuple[str, ...] = ("\n",)) -> pd.DataFrame:
    """Convert one docx table to a DataFrame, using its first row as header.

    Args:
        table: a table object exposing ``rows`` -> ``cells`` -> ``text``.
        remove: substrings deleted from every cell text.

    Returns:
        The rows after the first one, with the first row as column labels.
        A table without rows yields an empty DataFrame instead of raising.
    """
    rows = []
    for row in table.rows:
        cells = []
        for cell in row.cells:
            text = cell.text
            for token in remove:
                text = text.replace(token, "")
            cells.append(text)
        rows.append(cells)

    frame = pd.DataFrame(rows)
    if len(frame.index) == 0:
        return frame
    frame.columns = frame.iloc[0]
    return frame.iloc[1:]


def _header_pairs(frame: pd.DataFrame) -> dict:
    """Pair up the column labels of *frame* as ``{label: value}``.

    The labels are de-duplicated (first occurrence wins) and then read as
    alternating key, value, key, value, ...  An unpaired trailing label is
    ignored.
    """
    labels = list(dict.fromkeys(frame.columns))
    return {labels[i]: labels[i + 1] for i in range(0, len(labels) - 1, 2)}


@dataclass
class Book:
    """One entry of the book table of a form."""

    author: Optional[str] = None
    year: Optional[str] = None
    edition: Optional[str] = None
    title: Optional[str] = None
    location: Optional[str] = None
    publisher: Optional[str] = None
    signature: Optional[str] = None
    internal_notes: Optional[str] = None

    @property
    def has_signature(self) -> bool:
        """True when ``signature`` is neither ``None`` nor ``""``."""
        return self.signature not in (None, "")

    @property
    def is_empty(self) -> bool:
        """True when every field is unset (``None``) or an empty string."""
        return not any(
            (
                self.author,
                self.year,
                self.edition,
                self.title,
                self.location,
                self.publisher,
                self.signature,
                self.internal_notes,
            )
        )

    def from_dict(self, data: dict) -> None:
        """Fill the fields from a table row given as ``{column label: text}``.

        Values are stripped of surrounding whitespace; ``None``/NaN and
        blank-only values become ``""``.  ``"Jahr/Auflage"`` is split at the
        first ``/`` into year and edition, ``"Ort und Verlag"`` at the first
        ``,`` into location and publisher.  Unknown labels are ignored.
        """
        for key, raw in data.items():
            value = _clean_cell(raw)
            if key == "Autorenname(n):Nachname, Vorname":
                self.author = value
            elif key == "Jahr/Auflage":
                self.year, self.edition = _split_pair(value, "/")
            elif key == "Titel":
                self.title = value
            elif key == "Ort und Verlag":
                self.location, self.publisher = _split_pair(value, ",")
            elif key == "Standnummer":
                self.signature = value
            elif key == "Interne Vermerke":
                self.internal_notes = value


@dataclass
class SemapDocument:
    """Contact data, course data and book list read by ``word_to_semap``."""

    subject: Optional[str] = None
    # Filled with the cell text of the form, hence a string.
    phoneNumber: Optional[str] = None
    mail: Optional[str] = None
    title: Optional[str] = None
    semester: Optional[str] = None
    books: Optional[list[Book]] = None

    @property
    def renameSemester(self) -> None:
        """Shorten ``semester`` in place to ``"SoSe <x>"`` / ``"WiSe <x>"``.

        ``<x>`` is the last whitespace-separated token of the current value.
        This is a property with a side effect and no return value; it stays
        a property because it is triggered by plain attribute access.
        """
        if self.semester is None:
            return
        lowered = self.semester.lower()
        for keyword, prefix in (
            ("sommersemester", "SoSe"),
            ("wintersemester", "WiSe"),
        ):
            if keyword in lowered:
                year = self.semester.split()[-1]
                self.semester = f"{prefix} {year}"
                return

    @property
    def signatures(self) -> list[str]:
        """Signatures of all books that have one (empty list if no books)."""
        if self.books is None:
            return []
        return [book.signature for book in self.books if book.has_signature]


def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
    """Read every table of the Word document at *path*.

    Returns:
        One DataFrame per table, in document order.  Line breaks are removed
        from the cell texts and the first row of each table becomes the
        column labels.
    """
    return [_table_to_dataframe(table) for table in Document(path).tables]


def makeDict():
    """Return a fresh record dict with every known field set to ``None``."""
    return {
        "work_author": None,
        "section_author": None,
        "year": None,
        "edition": None,
        "work_title": None,
        "chapter_title": None,
        "location": None,
        "publisher": None,
        "signature": None,
        "issue": None,
        "pages": None,
        "isbn": None,
        "type": None,
    }


def tuple_to_dict(tlist: Iterable[Sequence], type: str) -> list[dict]:
    """Convert table rows to record dicts according to the document type.

    Args:
        tlist: rows, each indexable by column position.
        type: ``"Monografien"``, ``"Herausgeberwerke"`` or
            ``"Zeitschriftenaufsätze"``.  For any other value every row
            yields a record whose fields are all ``None``.

    Returns:
        One dict (see ``makeDict``) per row.

    Raises:
        IndexError: if a row has fewer columns than the type's layout.
    """
    layout = _FIELD_LAYOUTS.get(type)
    records = []
    for line in tlist:
        record = makeDict()
        if layout is not None:
            record["type"] = type
            for position, field_name in enumerate(layout):
                record[field_name] = line[position]
        records.append(record)
    return records


def elsa_word_to_csv(path):
    """Read the first table of the Word document at *path* as record dicts.

    The document type is the text of the last non-empty paragraph.  Line
    breaks and EN SPACE characters are removed from the cell texts, and rows
    that are entirely blank for that type are dropped.

    Returns:
        ``(records, doctype)`` where ``records`` comes from ``tuple_to_dict``.

    Raises:
        IndexError: if the document has no non-empty paragraph or no table.
        KeyError: if the table has rows and the document type is unknown.
    """
    doc = Document(path)
    paragraphs = [para.text for para in doc.paragraphs if para.text != ""]
    if not paragraphs:
        raise IndexError(
            "document has no non-empty paragraph to read the type from"
        )
    doctype = paragraphs[-1]

    # Only the first table is used, so only that one is parsed.
    frame = _table_to_dataframe(doc.tables[0], remove=("\n", "\u2002"))
    # The lookup stays inside the comprehension so that an unknown type only
    # raises when there actually are rows to compare.
    rows = [
        row
        for row in frame.itertuples(index=False, name=None)
        if row != _BLANK_ROWS[doctype]
    ]
    return tuple_to_dict(rows, doctype), doctype


def word_to_semap(word_path: str) -> SemapDocument:
    """Parse the form at *word_path* into a :class:`SemapDocument`.

    The first two tables are read as alternating label/value cells of their
    first row; the third table is the book list.  Books without a signature
    (which includes completely empty rows) are skipped.

    Raises:
        IndexError: if the document has fewer than three tables.
        KeyError: if one of the expected labels is missing.
    """
    logger.info("Parsing Word Document {}", word_path)
    semap = SemapDocument()
    tables = word_docx_to_csv(word_path)

    contact = _header_pairs(tables[0])
    semap.phoneNumber = contact["Telefon:"]
    semap.subject = contact["Ihr Fach:"]
    semap.mail = contact["Mailadresse:"]

    course = _header_pairs(tables[1])
    semap.title = course["Veranstaltung:"]
    semap.semester = course["Semester:"]
    semap.renameSemester  # property access rewrites ``semester`` in place

    booklist = []
    for _, row in tables[2].iterrows():
        if row.isnull().all():
            continue
        book = Book()
        book.from_dict(row.to_dict())
        # An empty book has no signature either, so one check covers both.
        if not book.has_signature:
            continue
        booklist.append(book)

    logger.info("Found {} books", len(booklist))
    semap.books = booklist
    return semap


if __name__ == "__main__":
    # Optional first command-line argument overrides the default document.
    default_path = "C:/Users/aky547/Desktop/SA 80 titelmeldung_SoSe2025 Burth.docx"
    semap_document = word_to_semap(
        sys.argv[1] if len(sys.argv) > 1 else default_path
    )