rework wordparser
- add dropdown selector - fix bug where Telefon: key got overwritten
This commit is contained in:
@@ -4,7 +4,8 @@ from dataclasses import dataclass
|
|||||||
from src.backend import Semester
|
from src.backend import Semester
|
||||||
from typing import Union, Any
|
from typing import Union, Any
|
||||||
from src.logic.openai import name_tester, run_shortener, semester_converter
|
from src.logic.openai import name_tester, run_shortener, semester_converter
|
||||||
|
import zipfile
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
import loguru
|
import loguru
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
@@ -134,8 +135,11 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
|||||||
row_data: list[Any] = []
|
row_data: list[Any] = []
|
||||||
for cell in row.cells:
|
for cell in row.cells:
|
||||||
text = cell.text
|
text = cell.text
|
||||||
|
|
||||||
text = text.replace("\n", "")
|
text = text.replace("\n", "")
|
||||||
row_data.append(text)
|
row_data.append(text)
|
||||||
|
if text == "Ihr Fach:":
|
||||||
|
row_data.append(get_fach(path))
|
||||||
data.append(row_data)
|
data.append(row_data)
|
||||||
df = pd.DataFrame(data)
|
df = pd.DataFrame(data)
|
||||||
df.columns = df.iloc[0]
|
df.columns = df.iloc[0]
|
||||||
@@ -143,10 +147,27 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
|||||||
|
|
||||||
m_data.append(df)
|
m_data.append(df)
|
||||||
|
|
||||||
|
|
||||||
return m_data
|
return m_data
|
||||||
|
|
||||||
|
|
||||||
|
def get_fach(path: str, para_id: str = "12456A32") -> Union[str, None]:
    """Read the subject ("Fach") text directly from the XML of a .docx file.

    The file at ``path`` is opened as a zip archive and its
    ``word/document.xml`` member is parsed. The paragraph (``w:p``) whose
    ``w14:paraId`` attribute equals ``para_id`` is located, and the text of
    the first run (``w:r``) inside it that actually carries a non-empty
    ``w:t`` element is returned.

    Args:
        path: Filesystem path of the .docx file.
        para_id: Value of the ``w14:paraId`` attribute identifying the
            paragraph to read. The default is the id this function has
            always looked for (presumably the id used by the form
            template -- verify if the template is ever regenerated).

    Returns:
        The text of the first run with text in the matching paragraph, as a
        plain ``str``; ``None`` when no paragraph matches or the matching
        paragraph contains no text.

    Raises:
        zipfile.BadZipFile: If ``path`` is not a valid zip archive.
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    # The context manager closes the archive even when read() raises.
    with zipfile.ZipFile(path) as document:
        xml_data = document.read("word/document.xml")

    soup = BeautifulSoup(xml_data, "xml")
    # The text we need is in <w:p w14:paraId="..."> -> w:r -> w:t
    for para in soup.find_all("w:p"):
        if para.get("w14:paraId") != para_id:
            continue
        for run in para.find_all("w:r"):
            text_node = run.find("w:t")
            # Runs without a w:t child (or with an empty one) carry no
            # text; skip them instead of failing on a missing node.
            if text_node is None or not text_node.contents:
                continue
            # Return a plain str so the caller does not keep the whole
            # parse tree alive through the returned node.
            return str(text_node.contents[0])
    return None
|
||||||
|
|
||||||
|
|
||||||
def makeDict():
|
def makeDict():
|
||||||
return {
|
return {
|
||||||
"work_author": None,
|
"work_author": None,
|
||||||
@@ -245,13 +266,12 @@ def word_to_semap(word_path: str) -> SemapDocument:
|
|||||||
log.info("Parsing Word Document {}", word_path)
|
log.info("Parsing Word Document {}", word_path)
|
||||||
semap = SemapDocument()
|
semap = SemapDocument()
|
||||||
df = word_docx_to_csv(word_path)
|
df = word_docx_to_csv(word_path)
|
||||||
print(df)
|
|
||||||
apparatdata = df[0]
|
apparatdata = df[0]
|
||||||
apparatdata = apparatdata.to_dict()
|
apparatdata = apparatdata.to_dict()
|
||||||
|
|
||||||
keys = list(apparatdata.keys())
|
keys = list(apparatdata.keys())
|
||||||
|
print(apparatdata, keys)
|
||||||
|
|
||||||
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys), 2)}
|
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys) - 1, 2)}
|
||||||
semap.phoneNumber = appdata["Telefon:"]
|
semap.phoneNumber = appdata["Telefon:"]
|
||||||
semap.subject = appdata["Ihr Fach:"]
|
semap.subject = appdata["Ihr Fach:"]
|
||||||
semap.mail = appdata["Mailadresse:"]
|
semap.mail = appdata["Mailadresse:"]
|
||||||
|
|||||||
Reference in New Issue
Block a user