update wordparser

This commit is contained in:
2025-05-26 13:22:47 +02:00
parent ac7c7ad60c
commit b1d523f574

View File

@@ -3,13 +3,14 @@ from docx import Document
from dataclasses import dataclass
from src.backend import Semester
from typing import Union, Any
from src.logic.openai import name_tester, run_shortener, semester_converter
import loguru
import sys
# Bind loguru's shared logger object to a short module-level name.
log = loguru.logger
# Remove the handlers registered so far, so only the sinks added below are active.
log.remove()
# NOTE(review): two stdout sinks are listed here. This listing looks like a
# flattened diff, so one of the two lines may be the removed version — confirm
# that only one remains; with both active, messages at INFO and above would be
# written to stdout twice.
log.add(sys.stdout)
log.add(sys.stdout, level="INFO")
# File sink: rotate the log file at 1 MB and keep rotated files for 10 days.
log.add("logs/application.log", rotation="1 MB", retention="10 days")
@@ -75,12 +76,37 @@ class SemapDocument:
# Contact details; every attribute here starts out unset (None).
phoneNumber: int = None
mail: str = None
# Course title; word_to_semap fills it from the "Veranstaltung:" entry.
title: str = None
# Shortened alternatives for `title`; nameSetter fills this when the title is too long.
title_suggestions: list[str] = None
# Holds the text of the "Semester:" entry at first; renameSemester replaces it
# with the result of Semester().from_string(...).
semester: Union[str, Semester] = None
# List of Book entries assigned by word_to_semap.
books: list[Book] = None
# renameSemester sets this to True (presumably for ", Dauer" semesters — confirm).
eternal: bool = False
# nameSetter stores "<last_name>, <first_name>" here.
personName: str = None
# Input passed to name_tester by nameSetter; overwritten with the extracted
# title when name_tester returns one.
personTitle: str = None
# Both values below are computed by nameSetter.
# NOTE(review): they carry no type annotation, so if this class is a
# @dataclass (decorator not visible here) they are plain class attributes
# rather than init fields — confirm this is intended.
title_length = 0
title_max_length = 0
def __post_init__(self):
    """Give each instance its own ``title_suggestions`` list.

    The attribute defaults to ``None``; that default is replaced by a fresh
    empty list so later code can append to it without sharing state between
    instances. A list supplied by the caller is kept instead of being
    silently discarded.
    """
    if self.title_suggestions is None:
        self.title_suggestions = []
@property
def nameSetter(self):
    """Normalise the person's name and title and check the title length.

    Although declared as a property, this is evaluated for its side effects
    (callers access it as a bare ``semap.nameSetter`` expression) and
    always yields ``None``.

    Reads ``self.personTitle`` and ``self.title``. Writes:

    * ``self.personName`` as ``"<last_name>, <first_name>"``,
    * ``self.personTitle`` when ``name_tester`` returns a title,
    * ``self.title_length``,
    * ``self.title_max_length`` when the title is too long,
    * ``self.title_suggestions`` (empty unless the title is too long).
    """
    data = name_tester(self.personTitle)
    self.personName = f"{data['last_name']}, {data['first_name']}"
    # Leave personTitle untouched when no title could be extracted.
    if data["title"] is not None:
        self.personTitle = data["title"]

    # Length of the text before the first comma of personName.
    surname_len = len(self.personName.split(",")[0])
    self.title_length = len(self.title) + 3 + surname_len

    # Build the list from scratch on every evaluation so that accessing the
    # property more than once cannot accumulate duplicate suggestions.
    suggestions = []
    if self.title_length > 40:
        log.warning("Title is too long")
        # NOTE(review): a title of 38 - surname_len characters still gives a
        # combined length of 41, which exceeds the limit of 40 checked
        # above — confirm which of the two constants is intended.
        self.title_max_length = 38 - surname_len
        shortened = run_shortener(self.title, self.title_max_length)
        suggestions = [entry["shortened_string"] for entry in shortened]
    self.title_suggestions = suggestions
@property
def renameSemester(self) -> None:
if ", Dauer" in self.semester:
@@ -88,8 +114,8 @@ class SemapDocument:
self.eternal = True
self.semester = Semester().from_string(self.semester)
else:
logger.warning("Semester {} is not valid", self.semester)
self.semester = None
log.warning("Semester {} is not valid", self.semester)
self.semester = Semester().from_string(semester_converter(self.semester))
@property
def signatures(self) -> list[str]:
@@ -105,7 +131,7 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
for table in tables:
data = []
for row in table.rows:
row_data = []
row_data: list[Any] = []
for cell in row.cells:
text = cell.text
text = text.replace("\n", "")
@@ -117,7 +143,6 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
m_data.append(df)
# for df[0, 1]: merge i and i+1 as key, value
return m_data
@@ -220,6 +245,7 @@ def word_to_semap(word_path: str) -> SemapDocument:
log.info("Parsing Word Document {}", word_path)
semap = SemapDocument()
df = word_docx_to_csv(word_path)
print(df)
apparatdata = df[0]
apparatdata = apparatdata.to_dict()
@@ -238,6 +264,8 @@ def word_to_semap(word_path: str) -> SemapDocument:
semap.title = appdata["Veranstaltung:"]
semap.semester = appdata["Semester:"]
semap.renameSemester
semap.nameSetter
books = df[2]
booklist = []
for i in range(len(books)):
@@ -254,7 +282,6 @@ def word_to_semap(word_path: str) -> SemapDocument:
booklist.append(book)
log.info("Found {} books", len(booklist))
semap.books = booklist
return semap