"""Parse Word (.docx) forms into ``SemapDocument`` objects and ELSA entry dicts."""

import sys
import zipfile
from dataclasses import dataclass, fields
from typing import Any, Optional, Union

import loguru
import pandas as pd
from bs4 import BeautifulSoup
from docx import Document

from src import LOG_DIR
from src.backend import Semester
from src.logic.openai import name_tester, run_shortener, semester_converter

log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")

letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

# Column order of the first table for each supported ELSA document type.
# The position of a field name is the index of the table column it is read from.
_ELSA_FIELD_ORDER: dict[str, tuple[str, ...]] = {
    "Monografien": (
        "work_author",
        "year",
        "edition",
        "work_title",
        "location",
        "publisher",
        "signature",
        "pages",
    ),
    "Herausgeberwerke": (
        "section_author",
        "year",
        "edition",
        "chapter_title",
        "work_author",
        "work_title",
        "location",
        "publisher",
        "pages",
        "signature",
    ),
    "Zeitschriftenaufsätze": (
        "section_author",
        "year",
        "issue",
        "chapter_title",
        "work_title",
        "location",
        "publisher",
        "pages",
        "signature",
    ),
}


def _clean_cell(value: Any) -> str:
    """Return a table cell as a stripped string; ``None``/NaN become ``""``."""
    if isinstance(value, str):
        # str.strip() also removes U+2002 (EN SPACE), so a cell holding only
        # EN SPACE placeholders collapses to "".
        return value.strip()
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value).strip()


def _split_pair(value: str, sep: str) -> tuple[str, str]:
    """Split ``value`` at ``sep`` into its first two parts, stripped.

    Without ``sep`` in ``value`` the second part is ``""``; parts after the
    second one are ignored.
    """
    parts = value.split(sep)
    second = parts[1].strip() if len(parts) > 1 else ""
    return parts[0].strip(), second


def _rows_to_frame(rows: list[list[Any]]) -> pd.DataFrame:
    """Build a DataFrame from table rows, using the first row as the header."""
    frame = pd.DataFrame(rows)
    frame.columns = frame.iloc[0]
    return frame.iloc[1:]


def _label_value_pairs(frame: pd.DataFrame) -> dict[Any, Any]:
    """Pair up a frame's header cells as ``{label: value}``.

    The header is read as alternating label and value cells. A trailing label
    without a value is dropped. The column index is used directly (instead of
    ``DataFrame.to_dict()``) because ``to_dict`` omits duplicate column names,
    which shifts every following pair as soon as two cells hold the same text.
    """
    cells = list(frame.columns)
    return dict(zip(cells[::2], cells[1::2]))


@dataclass
class Book:
    """A single book entry read from one row of the book table."""

    author: Optional[str] = None
    year: Optional[str] = None
    edition: Optional[str] = None
    title: Optional[str] = None
    location: Optional[str] = None
    publisher: Optional[str] = None
    signature: Optional[str] = None
    internal_notes: Optional[str] = None

    @property
    def has_signature(self) -> bool:
        """Whether ``signature`` is set to a non-empty string."""
        return self.signature is not None and self.signature != ""

    @property
    def is_empty(self) -> bool:
        """Whether every field is unset (``None``) or the empty string."""
        return all(getattr(self, field.name) in (None, "") for field in fields(self))

    def from_dict(self, data: dict[str, Any]) -> None:
        """Populate the book from a ``{column header: cell text}`` mapping.

        Unknown headers are ignored. Non-string cells are tolerated
        (``None``/NaN are treated as empty).
        """
        for key, raw in data.items():
            value = _clean_cell(raw)
            if key == "Autorenname(n):Nachname, Vorname":
                self.author = value
            elif key == "Jahr/Auflage":
                self.year, self.edition = _split_pair(value, "/")
            elif key == "Titel":
                self.title = value
            elif key == "Ort und Verlag":
                self.location, self.publisher = _split_pair(value, ",")
            elif key == "Standnummer":
                self.signature = value
            elif key == "Interne Vermerke":
                self.internal_notes = value


@dataclass
class SemapDocument:
    """Data collected from a parsed Word form (see ``word_to_semap``)."""

    subject: Optional[str] = None
    phoneNumber: Optional[Union[int, str]] = None
    mail: Optional[str] = None
    title: Optional[str] = None
    title_suggestions: Optional[list[str]] = None
    semester: Optional[Union[str, Semester]] = None
    books: Optional[list[Book]] = None
    eternal: bool = False
    personName: Optional[str] = None
    personTitle: Optional[str] = None
    # Un-annotated on purpose: these stay plain class attributes and are not
    # dataclass fields / __init__ parameters.
    title_length = 0
    title_max_length = 0

    def __post_init__(self):
        # Keep a list passed by the caller; only replace the None default.
        if self.title_suggestions is None:
            self.title_suggestions = []

    @property
    def nameSetter(self):
        """Normalise the person's name/title and collect shortened titles.

        Accessed for its side effects: sets ``personName``, possibly
        ``personTitle``, ``title_length`` and, when the combined length exceeds
        40, ``title_max_length`` and ``title_suggestions``. Returns ``None``.
        """
        data = name_tester(self.personTitle)
        if data["title"] is not None:
            self.personTitle = data["title"]
        self.personName = f"{data['last_name']}, {data['first_name']}"

        name_len = len(self.personName.split(",")[0])
        self.title_length = len(self.title) + 3 + name_len
        if self.title_length > 40:
            log.warning("Title is too long")
            # NOTE(review): limit is 38 here while the check above implies
            # 40 - 3 = 37 — confirm which one is intended.
            self.title_max_length = 38 - name_len
            suggestions = run_shortener(self.title, self.title_max_length)
            self.title_suggestions.extend(
                suggestion["shortened_string"] for suggestion in suggestions
            )
        else:
            self.title_suggestions = []

    @property
    def renameSemester(self) -> None:
        """Convert ``semester`` from a string into a ``Semester`` object.

        Accessed for its side effects. A value containing ``", Dauer"`` also
        sets ``eternal``; any other value is passed through
        ``semester_converter`` first (and logged as a warning).
        """
        if ", Dauer" in self.semester:
            self.semester = self.semester.split(",")[0]
            self.eternal = True
            self.semester = Semester().from_string(self.semester)
        else:
            log.warning("Semester {} is not valid", self.semester)
            self.semester = Semester().from_string(semester_converter(self.semester))

    @property
    def signatures(self) -> list[str]:
        """Signatures of all books that have one; empty if ``books`` is unset."""
        if self.books is not None:
            return [book.signature for book in self.books if book.has_signature]
        return []


def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
    """Read every table of the document at ``path`` into a DataFrame.

    Line breaks inside cells are removed. The first row of each table becomes
    the column header. Directly after a cell reading ``"Ihr Fach:"`` the value
    returned by ``get_fach`` is inserted as an extra cell.
    """
    doc = Document(path)
    frames = []
    for table in doc.tables:
        rows = []
        for row in table.rows:
            row_data: list[Any] = []
            for cell in row.cells:
                text = cell.text.replace("\n", "")
                row_data.append(text)
                if text == "Ihr Fach:":
                    row_data.append(get_fach(path))
            rows.append(row_data)
        frames.append(_rows_to_frame(rows))
    return frames


def get_fach(path: str) -> Optional[str]:
    """Return the text of the paragraph with ``w14:paraId`` ``12456A32``.

    The paragraph is looked up in ``word/document.xml`` inside the .docx
    archive. The first ``w:t`` text found in one of its runs is returned;
    ``None`` if the paragraph or its text is missing.
    """
    with zipfile.ZipFile(path) as archive:
        xml_data = archive.read("word/document.xml")
    soup = BeautifulSoup(xml_data, "xml")
    # text we need is in -> w:r -> w:t
    for para in soup.find_all("w:p"):
        if para.get("w14:paraId") != "12456A32":
            continue
        for run in para.find_all("w:r"):
            text_node = run.find("w:t")
            # Runs without (non-empty) text are skipped instead of crashing.
            if text_node is not None and text_node.contents:
                return text_node.contents[0]
    return None


def makeDict() -> dict[str, Any]:
    """Return a fresh entry dict with every known field set to ``None``."""
    return {
        "work_author": None,
        "section_author": None,
        "year": None,
        "edition": None,
        "work_title": None,
        "chapter_title": None,
        "location": None,
        "publisher": None,
        "signature": None,
        "issue": None,
        "pages": None,
        "isbn": None,
        "type": None,
    }


def tuple_to_dict(tlist: list[tuple], type: str) -> list[dict[str, Any]]:
    """Convert table rows into entry dicts for the given document type.

    Args:
        tlist: Rows of the table, one tuple of cell values per row.
        type: Document type; one of ``"Monografien"``, ``"Herausgeberwerke"``
            or ``"Zeitschriftenaufsätze"``. For any other value every field of
            the produced dicts (including ``"type"``) stays ``None``.

    Returns:
        One dict (see ``makeDict``) per row.

    Raises:
        IndexError: If a row has fewer cells than the type requires.
    """
    field_order = _ELSA_FIELD_ORDER.get(type)
    ret = []
    for line in tlist:
        data = makeDict()
        if field_order is not None:
            data["type"] = type
            for idx, field_name in enumerate(field_order):
                data[field_name] = line[idx]
        ret.append(data)
    return ret


def elsa_word_to_csv(path: str):
    """Parse an ELSA Word form into entry dicts.

    The document type is taken from the last non-empty paragraph. Rows of the
    first table that consist only of empty cells are dropped.

    Returns:
        A tuple ``(entries, doctype)`` where ``entries`` is the result of
        ``tuple_to_dict`` for the remaining rows.
    """
    doc = Document(path)
    doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
    # An all-empty row per document type; the tuple length is the number of
    # columns such a row is compared against.
    tuples = {
        "Monografien": ("", "", "", "", "", "", "", "", ""),
        "Herausgeberwerke": ("", "", "", "", "", "", "", "", "", "", ""),
        "Zeitschriftenaufsätze": ("", "", "", "", "", "", "", "", "", ""),
    }
    frames: list[pd.DataFrame] = []
    for table in doc.tables:
        rows: list[list[str]] = []
        for row in table.rows:
            row_data: list[str] = [
                cell.text.replace("\n", "").replace("\u2002", "") for cell in row.cells
            ]
            rows.append(row_data)
        frames.append(_rows_to_frame(rows))
    first_table = frames[0]
    # split df to rows
    data = [
        row
        for row in first_table.itertuples(index=False, name=None)
        if row != tuples[doctype]
    ]
    return tuple_to_dict(data, doctype), doctype


def word_to_semap(word_path: str) -> SemapDocument:
    """Parse the Word form at ``word_path`` into a ``SemapDocument``.

    Table 1 supplies phone, subject, mail and name/title; table 2 supplies
    the course title and semester; table 3 supplies the books. Only books
    with a signature are kept.

    Raises:
        KeyError: If an expected label is missing from table 1 or 2.
        IndexError: If the document has fewer than three tables.
    """
    log.info("Parsing Word Document {}", word_path)
    semap = SemapDocument()
    tables = word_docx_to_csv(word_path)

    appdata = _label_value_pairs(tables[0])
    log.debug("Label/value pairs of table 1: {}", appdata)
    semap.phoneNumber = appdata["Telefon:"]
    semap.subject = appdata["Ihr Fach:"]
    semap.mail = appdata["Mailadresse:"]
    # Everything before the last comma is the name, the rest is the title.
    name_parts = appdata["Ihr Name und Titel:"].split(",")
    semap.personName = ",".join(name_parts[:-1])
    semap.personTitle = ",".join(name_parts[-1:]).strip()

    appdata = _label_value_pairs(tables[1])
    semap.title = appdata["Veranstaltung:"]
    semap.semester = appdata["Semester:"]
    # These properties are accessed purely for their side effects.
    semap.renameSemester
    semap.nameSetter

    booklist = []
    for _, row in tables[2].iterrows():
        if row.isnull().all():
            continue
        book = Book()
        book.from_dict(row.to_dict())
        if book.is_empty or not book.has_signature:
            continue
        booklist.append(book)
    log.info("Found {} books", len(booklist))
    semap.books = booklist
    return semap


if __name__ == "__main__":
    else_df = elsa_word_to_csv(
        "C:/Users/aky547/Desktop/ELSA_Bestellung Scann Der Westen und der Rest.docx"
    )
    print(else_df)