update wordparser
This commit is contained in:
@@ -3,13 +3,14 @@ from docx import Document
|
||||
from dataclasses import dataclass
|
||||
from src.backend import Semester
|
||||
from typing import Union, Any
|
||||
from src.logic.openai import name_tester, run_shortener, semester_converter
|
||||
|
||||
import loguru
|
||||
import sys
|
||||
|
||||
log = loguru.logger
|
||||
log.remove()
|
||||
log.add(sys.stdout)
|
||||
log.add(sys.stdout, level="INFO")
|
||||
log.add("logs/application.log", rotation="1 MB", retention="10 days")
|
||||
|
||||
|
||||
@@ -75,12 +76,37 @@ class SemapDocument:
|
||||
phoneNumber: int = None
|
||||
mail: str = None
|
||||
title: str = None
|
||||
title_suggestions: list[str] = None
|
||||
semester: Union[str, Semester] = None
|
||||
books: list[Book] = None
|
||||
eternal: bool = False
|
||||
personName: str = None
|
||||
personTitle: str = None
|
||||
title_length = 0
|
||||
title_max_length = 0
|
||||
|
||||
def __post_init__(self):
    """Make sure ``title_suggestions`` is a list before the instance is used.

    When ``title_suggestions`` is ``None`` it is replaced with a fresh empty
    list. A list supplied by the caller is kept instead of being silently
    discarded.
    """
    if self.title_suggestions is None:
        self.title_suggestions = []
|
||||
|
||||
@property
def nameSetter(self):
    """Normalise the person's name and prepare shortened title suggestions.

    Although declared as a property, this is evaluated for its side effects
    and always yields ``None``. On each access it:

    * passes ``self.personTitle`` to ``name_tester`` and stores
      ``"<last_name>, <first_name>"`` in ``self.personName``;
    * overwrites ``self.personTitle`` with the detected title, but only when
      one was found (``data["title"]`` is not ``None``);
    * stores ``len(self.title) + 3 + <last-name length>`` in
      ``self.title_length``;
    * if that length exceeds 40, logs a warning, sets
      ``self.title_max_length`` to ``38 - <last-name length>`` and fills
      ``self.title_suggestions`` with the ``"shortened_string"`` value of
      every item returned by ``run_shortener``; otherwise the suggestion
      list is emptied.

    ``self.title_suggestions`` is rebuilt from scratch every time, so
    accessing the property more than once does not accumulate duplicates.
    """
    # NOTE(review): the "+ 3" presumably accounts for a separator between
    # title and last name, and 38 vs. 40 for some extra margin - confirm.
    total_limit = 40
    shortening_budget = 38

    data = name_tester(self.personTitle)
    full_name = f"{data['last_name']}, {data['first_name']}"

    detected_title = data["title"]
    if detected_title is not None:
        self.personTitle = detected_title
    self.personName = full_name

    # The part before the first comma is the last name; measure it once.
    last_name_len = len(self.personName.split(",")[0])
    self.title_length = len(self.title) + 3 + last_name_len

    if self.title_length > total_limit:
        log.warning("Title is too long")
        self.title_max_length = shortening_budget - last_name_len
        suggestions = run_shortener(self.title, self.title_max_length)
        # Replace rather than append so repeated access stays idempotent.
        self.title_suggestions = [
            suggestion["shortened_string"] for suggestion in suggestions
        ]
    else:
        self.title_suggestions = []
|
||||
@property
|
||||
def renameSemester(self) -> None:
|
||||
if ", Dauer" in self.semester:
|
||||
@@ -88,8 +114,8 @@ class SemapDocument:
|
||||
self.eternal = True
|
||||
self.semester = Semester().from_string(self.semester)
|
||||
else:
|
||||
logger.warning("Semester {} is not valid", self.semester)
|
||||
self.semester = None
|
||||
log.warning("Semester {} is not valid", self.semester)
|
||||
self.semester = Semester().from_string(semester_converter(self.semester))
|
||||
|
||||
@property
|
||||
def signatures(self) -> list[str]:
|
||||
@@ -105,7 +131,7 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
||||
for table in tables:
|
||||
data = []
|
||||
for row in table.rows:
|
||||
row_data = []
|
||||
row_data: list[Any] = []
|
||||
for cell in row.cells:
|
||||
text = cell.text
|
||||
text = text.replace("\n", "")
|
||||
@@ -117,7 +143,6 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
||||
|
||||
m_data.append(df)
|
||||
|
||||
# for df[0, 1]: merge i and i+1 as key, value
|
||||
|
||||
return m_data
|
||||
|
||||
@@ -220,6 +245,7 @@ def word_to_semap(word_path: str) -> SemapDocument:
|
||||
log.info("Parsing Word Document {}", word_path)
|
||||
semap = SemapDocument()
|
||||
df = word_docx_to_csv(word_path)
|
||||
print(df)
|
||||
apparatdata = df[0]
|
||||
apparatdata = apparatdata.to_dict()
|
||||
|
||||
@@ -238,6 +264,8 @@ def word_to_semap(word_path: str) -> SemapDocument:
|
||||
semap.title = appdata["Veranstaltung:"]
|
||||
semap.semester = appdata["Semester:"]
|
||||
semap.renameSemester
|
||||
semap.nameSetter
|
||||
|
||||
books = df[2]
|
||||
booklist = []
|
||||
for i in range(len(books)):
|
||||
@@ -254,7 +282,6 @@ def word_to_semap(word_path: str) -> SemapDocument:
|
||||
booklist.append(book)
|
||||
log.info("Found {} books", len(booklist))
|
||||
semap.books = booklist
|
||||
|
||||
return semap
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user