def get_fach(path: str, para_id: str = "12456A32") -> Union[str, None]:
    """Return the text of one specific paragraph of a .docx file.

    The paragraph is located by its ``w14:paraId`` attribute inside
    ``word/document.xml``. The caller uses the result as the value that
    belongs to the "Ihr Fach:" table cell.

    Args:
        path: Path to the .docx file (a zip archive).
        para_id: Value of the ``w14:paraId`` attribute identifying the
            paragraph. The default is the id the original implementation
            looked for; presumably it belongs to the form template --
            TODO confirm it is stable across the documents being parsed.

    Returns:
        The concatenated text of all ``w:t`` nodes of the matching
        paragraph, or ``None`` when no paragraph with that id carries text.

    Raises:
        zipfile.BadZipFile: If ``path`` is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
        xml.etree.ElementTree.ParseError: If the document XML is malformed.
    """
    # Function-scope import keeps the module-level import block untouched.
    import xml.etree.ElementTree as ET

    # ElementTree addresses elements by namespace URI, not by prefix.
    w_ns = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    w14_ns = "{http://schemas.microsoft.com/office/word/2010/wordml}"

    # The context manager closes the archive even when reading fails.
    with zipfile.ZipFile(path) as archive:
        xml_data = archive.read("word/document.xml")

    root = ET.fromstring(xml_data)
    for para in root.iter(w_ns + "p"):
        if para.get(w14_ns + "paraId") != para_id:
            continue
        # Word frequently splits one visible string across several runs
        # (w:r), and some runs carry formatting only. Joining every w:t
        # avoids both truncated text and a crash on a text-less run.
        text = "".join(node.text or "" for node in para.iter(w_ns + "t"))
        if text:
            return text
    return None