From b1d523f574817f792a0c0099146afb0319e676a7 Mon Sep 17 00:00:00 2001 From: WorldTeacher Date: Mon, 26 May 2025 13:22:47 +0200 Subject: [PATCH] update wordparser --- src/logic/wordparser.py | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/src/logic/wordparser.py b/src/logic/wordparser.py index c0ad35a..77391a1 100644 --- a/src/logic/wordparser.py +++ b/src/logic/wordparser.py @@ -3,13 +3,14 @@ from docx import Document from dataclasses import dataclass from src.backend import Semester from typing import Union, Any +from src.logic.openai import name_tester, run_shortener, semester_converter import loguru import sys log = loguru.logger log.remove() -log.add(sys.stdout) +log.add(sys.stdout, level="INFO") log.add("logs/application.log", rotation="1 MB", retention="10 days") @@ -75,12 +76,37 @@ class SemapDocument: phoneNumber: int = None mail: str = None title: str = None + title_suggestions: list[str] = None semester: Union[str, Semester] = None books: list[Book] = None eternal: bool = False personName: str = None personTitle: str = None + title_length = 0 + title_max_length = 0 + def __post_init__(self): + self.title_suggestions = [] + + @property + def nameSetter(self): + data = name_tester(self.personTitle) + name = f"{data['last_name']}, {data['first_name']}" + if data["title"] is not None: + title = data["title"] + self.personTitle = title + self.personName = name + self.title_length = len(self.title) + 3 + len(self.personName.split(",")[0]) + if self.title_length > 40: + log.warning("Title is too long") + name_len = len(self.personName.split(",")[0]) + self.title_max_length = 38 - name_len + suggestions = run_shortener(self.title, self.title_max_length) + for suggestion in suggestions: + self.title_suggestions.append(suggestion["shortened_string"]) + else: + self.title_suggestions = [] + pass @property def renameSemester(self) -> None: if ", Dauer" in self.semester: @@ -88,8 +114,8 @@ class SemapDocument: self.eternal = True self.semester = Semester().from_string(self.semester) else: - logger.warning("Semester {} is not valid", self.semester) - self.semester = None + log.warning("Semester {} is not valid", self.semester) + self.semester = Semester().from_string(semester_converter(self.semester)) @property def signatures(self) -> list[str]: @@ -105,7 +131,7 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]: for table in tables: data = [] for row in table.rows: - row_data = [] + row_data: list[Any] = [] for cell in row.cells: text = cell.text text = text.replace("\n", "") @@ -117,7 +143,6 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]: m_data.append(df) - # for df[0, 1]: merge i and i+1 as key, value return m_data @@ -220,6 +245,7 @@ def word_to_semap(word_path: str) -> SemapDocument: log.info("Parsing Word Document {}", word_path) semap = SemapDocument() df = word_docx_to_csv(word_path) + print(df) apparatdata = df[0] apparatdata = apparatdata.to_dict() @@ -238,6 +264,8 @@ def word_to_semap(word_path: str) -> SemapDocument: semap.title = appdata["Veranstaltung:"] semap.semester = appdata["Semester:"] semap.renameSemester + semap.nameSetter + books = df[2] booklist = [] for i in range(len(books)): @@ -254,7 +282,6 @@ def word_to_semap(word_path: str) -> SemapDocument: booklist.append(book) log.info("Found {} books", len(booklist)) semap.books = booklist - return semap