update wordparser

This commit is contained in:
2025-05-26 13:22:47 +02:00
parent ac7c7ad60c
commit b1d523f574

View File

@@ -3,13 +3,14 @@ from docx import Document
from dataclasses import dataclass from dataclasses import dataclass
from src.backend import Semester from src.backend import Semester
from typing import Union, Any from typing import Union, Any
from src.logic.openai import name_tester, run_shortener, semester_converter
import loguru import loguru
import sys import sys
log = loguru.logger log = loguru.logger
log.remove() log.remove()
log.add(sys.stdout) log.add(sys.stdout, level="INFO")
log.add("logs/application.log", rotation="1 MB", retention="10 days") log.add("logs/application.log", rotation="1 MB", retention="10 days")
@@ -75,12 +76,37 @@ class SemapDocument:
phoneNumber: int = None phoneNumber: int = None
mail: str = None mail: str = None
title: str = None title: str = None
title_suggestions: list[str] = None
semester: Union[str, Semester] = None semester: Union[str, Semester] = None
books: list[Book] = None books: list[Book] = None
eternal: bool = False eternal: bool = False
personName: str = None personName: str = None
personTitle: str = None personTitle: str = None
title_length = 0
title_max_length = 0
def __post_init__(self):
    """Make sure ``title_suggestions`` is a list after construction.

    The field's declared default is ``None``; it is replaced with a fresh
    empty list here so later code can append to it without a ``None`` check.
    """
    # Only fill in the default: a list handed in by the caller is kept
    # instead of being silently thrown away.
    if self.title_suggestions is None:
        self.title_suggestions = []
@property
def nameSetter(self):
    """Normalise the person fields and collect shortened-title suggestions.

    Accessing this property has side effects only; it returns ``None``:

    * ``personTitle`` is passed to ``name_tester``; ``personName`` is set to
      ``"<last_name>, <first_name>"`` from the result, and ``personTitle``
      is replaced by the returned ``"title"`` entry when that is not ``None``.
    * ``title_length`` is set to ``len(title) + 3 + len(surname)``.
    * If that length exceeds 40, a warning is logged, ``title_max_length``
      is set to ``38 - len(surname)`` and ``title_suggestions`` is filled
      with the ``"shortened_string"`` of every item returned by
      ``run_shortener``. Otherwise ``title_suggestions`` is emptied.
    """
    # NOTE(review): the raw ``personTitle`` value is what gets parsed for the
    # name parts - presumably it holds the full "title + name" text from the
    # document at this point; confirm against the caller.
    data = name_tester(self.personTitle)
    if data["title"] is not None:
        self.personTitle = data["title"]
    self.personName = f"{data['last_name']}, {data['first_name']}"

    # The surname is everything before the comma in "Last, First".
    surname_len = len(self.personName.split(",")[0])
    # ``title`` defaults to None on the dataclass; treat that as empty
    # instead of failing in len().
    course_title = self.title or ""
    # NOTE(review): the 3 presumably accounts for a separator between title
    # and surname, and 40 / 38 for a display limit - confirm.
    self.title_length = len(course_title) + 3 + surname_len

    if self.title_length <= 40:
        self.title_suggestions = []
        return

    log.warning("Title is too long")
    self.title_max_length = 38 - surname_len
    # Rebuild the list instead of appending so that reading the property
    # more than once does not accumulate duplicate suggestions.
    self.title_suggestions = [
        suggestion["shortened_string"]
        for suggestion in run_shortener(course_title, self.title_max_length)
    ]
@property @property
def renameSemester(self) -> None: def renameSemester(self) -> None:
if ", Dauer" in self.semester: if ", Dauer" in self.semester:
@@ -88,8 +114,8 @@ class SemapDocument:
self.eternal = True self.eternal = True
self.semester = Semester().from_string(self.semester) self.semester = Semester().from_string(self.semester)
else: else:
logger.warning("Semester {} is not valid", self.semester) log.warning("Semester {} is not valid", self.semester)
self.semester = None self.semester = Semester().from_string(semester_converter(self.semester))
@property @property
def signatures(self) -> list[str]: def signatures(self) -> list[str]:
@@ -105,7 +131,7 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
for table in tables: for table in tables:
data = [] data = []
for row in table.rows: for row in table.rows:
row_data = [] row_data: list[Any] = []
for cell in row.cells: for cell in row.cells:
text = cell.text text = cell.text
text = text.replace("\n", "") text = text.replace("\n", "")
@@ -117,7 +143,6 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
m_data.append(df) m_data.append(df)
# for df[0, 1]: merge i and i+1 as key, value
return m_data return m_data
@@ -220,6 +245,7 @@ def word_to_semap(word_path: str) -> SemapDocument:
log.info("Parsing Word Document {}", word_path) log.info("Parsing Word Document {}", word_path)
semap = SemapDocument() semap = SemapDocument()
df = word_docx_to_csv(word_path) df = word_docx_to_csv(word_path)
print(df)
apparatdata = df[0] apparatdata = df[0]
apparatdata = apparatdata.to_dict() apparatdata = apparatdata.to_dict()
@@ -238,6 +264,8 @@ def word_to_semap(word_path: str) -> SemapDocument:
semap.title = appdata["Veranstaltung:"] semap.title = appdata["Veranstaltung:"]
semap.semester = appdata["Semester:"] semap.semester = appdata["Semester:"]
semap.renameSemester semap.renameSemester
semap.nameSetter
books = df[2] books = df[2]
booklist = [] booklist = []
for i in range(len(books)): for i in range(len(books)):
@@ -254,7 +282,6 @@ def word_to_semap(word_path: str) -> SemapDocument:
booklist.append(book) booklist.append(book)
log.info("Found {} books", len(booklist)) log.info("Found {} books", len(booklist))
semap.books = booklist semap.books = booklist
return semap return semap