rework wordparser

- add dropdown selector
- fix bug where Telefon: key got overwritten
This commit is contained in:
2025-06-03 13:15:06 +02:00
parent e29b630405
commit d02a8a271f

View File

@@ -4,7 +4,8 @@ from dataclasses import dataclass
from src.backend import Semester from src.backend import Semester
from typing import Union, Any from typing import Union, Any
from src.logic.openai import name_tester, run_shortener, semester_converter from src.logic.openai import name_tester, run_shortener, semester_converter
import zipfile
from bs4 import BeautifulSoup
import loguru import loguru
import sys import sys
@@ -134,8 +135,11 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
row_data: list[Any] = [] row_data: list[Any] = []
for cell in row.cells: for cell in row.cells:
text = cell.text text = cell.text
text = text.replace("\n", "") text = text.replace("\n", "")
row_data.append(text) row_data.append(text)
if text == "Ihr Fach:":
row_data.append(get_fach(path))
data.append(row_data) data.append(row_data)
df = pd.DataFrame(data) df = pd.DataFrame(data)
df.columns = df.iloc[0] df.columns = df.iloc[0]
@@ -143,10 +147,27 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
m_data.append(df) m_data.append(df)
return m_data return m_data
def get_fach(path: str, para_id: str = "12456A32") -> Union[str, None]:
    """Read the text of one specific paragraph directly from a .docx file.

    The document at ``path`` is opened as a zip archive, its
    ``word/document.xml`` part is parsed, and the paragraph (``w:p``) whose
    ``w14:paraId`` attribute equals ``para_id`` is searched for its first
    run (``w:r``) that contains a non-empty text node (``w:t``).

    Args:
        path: Path of the Word (.docx) document.
        para_id: Value of the ``w14:paraId`` attribute identifying the
            paragraph to read. NOTE(review): the default is presumably tied
            to one particular document template -- confirm it is stable.

    Returns:
        The first text found in the matching paragraph, or ``None`` when no
        paragraph matches or the matching paragraph holds no text.
    """
    # The context manager closes the archive even when reading the part fails.
    with zipfile.ZipFile(path) as document:
        xml_data = document.read("word/document.xml")

    soup = BeautifulSoup(xml_data, "xml")
    # text we need is in <w:p w14:paraId="..." ... > -> w:r -> w:t
    for para in soup.find_all("w:p"):
        if para.get("w14:paraId") != para_id:
            continue
        for run in para.find_all("w:r"):
            text_node = run.find("w:t")
            # A run without a w:t (or with an empty one) carries no text;
            # skip it instead of failing on a missing node.
            if text_node is None or not text_node.contents:
                continue
            return str(text_node.contents[0])
    return None
def makeDict(): def makeDict():
return { return {
"work_author": None, "work_author": None,
@@ -245,13 +266,12 @@ def word_to_semap(word_path: str) -> SemapDocument:
log.info("Parsing Word Document {}", word_path) log.info("Parsing Word Document {}", word_path)
semap = SemapDocument() semap = SemapDocument()
df = word_docx_to_csv(word_path) df = word_docx_to_csv(word_path)
print(df)
apparatdata = df[0] apparatdata = df[0]
apparatdata = apparatdata.to_dict() apparatdata = apparatdata.to_dict()
keys = list(apparatdata.keys()) keys = list(apparatdata.keys())
print(apparatdata, keys)
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys), 2)} appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys) - 1, 2)}
semap.phoneNumber = appdata["Telefon:"] semap.phoneNumber = appdata["Telefon:"]
semap.subject = appdata["Ihr Fach:"] semap.subject = appdata["Ihr Fach:"]
semap.mail = appdata["Mailadresse:"] semap.mail = appdata["Mailadresse:"]