rework wordparser
- add dropdown selector - fix bug where Telefon: key got overwritten
This commit is contained in:
@@ -4,7 +4,8 @@ from dataclasses import dataclass
|
||||
from src.backend import Semester
|
||||
from typing import Union, Any
|
||||
from src.logic.openai import name_tester, run_shortener, semester_converter
|
||||
|
||||
import zipfile
|
||||
from bs4 import BeautifulSoup
|
||||
import loguru
|
||||
import sys
|
||||
|
||||
@@ -134,8 +135,11 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
||||
row_data: list[Any] = []
|
||||
for cell in row.cells:
|
||||
text = cell.text
|
||||
|
||||
text = text.replace("\n", "")
|
||||
row_data.append(text)
|
||||
if text == "Ihr Fach:":
|
||||
row_data.append(get_fach(path))
|
||||
data.append(row_data)
|
||||
df = pd.DataFrame(data)
|
||||
df.columns = df.iloc[0]
|
||||
@@ -143,10 +147,27 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
||||
|
||||
m_data.append(df)
|
||||
|
||||
|
||||
return m_data
|
||||
|
||||
|
||||
def get_fach(path: str) -> Union[str, None]:
    """Return the subject ("Fach") text stored in a Word document.

    The .docx file at *path* is opened as a zip archive and
    ``word/document.xml`` is parsed as XML. The paragraph whose
    ``w14:paraId`` attribute equals ``12456A32`` is located, and the first
    child of the first non-empty ``w:t`` element found in its ``w:r`` runs
    is returned.

    Args:
        path: Filesystem path of the .docx file.

    Returns:
        The first content node of the matching ``w:t`` element, or ``None``
        when the paragraph is missing or none of its runs carries text.

    Raises:
        zipfile.BadZipFile: If *path* is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive even when reading fails.
    with zipfile.ZipFile(path) as document:
        xml_data = document.read("word/document.xml")

    soup = BeautifulSoup(xml_data, "xml")
    # text we need is in <w:p w14:paraId="12456A32" ... > -> w:r -> w:t
    for para in soup.find_all("w:p"):
        if para.get("w14:paraId") != "12456A32":
            continue
        for run in para.find_all("w:r"):
            text_node = run.find("w:t")
            # A run may have no w:t child (or an empty one); skip it instead
            # of failing on ``None.contents`` / an empty contents list.
            if text_node is not None and text_node.contents:
                return text_node.contents[0]
    return None
|
||||
|
||||
|
||||
def makeDict():
|
||||
return {
|
||||
"work_author": None,
|
||||
@@ -245,13 +266,12 @@ def word_to_semap(word_path: str) -> SemapDocument:
|
||||
log.info("Parsing Word Document {}", word_path)
|
||||
semap = SemapDocument()
|
||||
df = word_docx_to_csv(word_path)
|
||||
print(df)
|
||||
apparatdata = df[0]
|
||||
apparatdata = apparatdata.to_dict()
|
||||
|
||||
keys = list(apparatdata.keys())
|
||||
print(apparatdata, keys)
|
||||
|
||||
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys), 2)}
|
||||
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys) - 1, 2)}
|
||||
semap.phoneNumber = appdata["Telefon:"]
|
||||
semap.subject = appdata["Ihr Fach:"]
|
||||
semap.mail = appdata["Mailadresse:"]
|
||||
|
||||
Reference in New Issue
Block a user