update wordparser
This commit is contained in:
@@ -3,13 +3,14 @@ from docx import Document
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from src.backend import Semester
|
from src.backend import Semester
|
||||||
from typing import Union, Any
|
from typing import Union, Any
|
||||||
|
from src.logic.openai import name_tester, run_shortener, semester_converter
|
||||||
|
|
||||||
import loguru
|
import loguru
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
log = loguru.logger
|
log = loguru.logger
|
||||||
log.remove()
|
log.remove()
|
||||||
log.add(sys.stdout)
|
log.add(sys.stdout, level="INFO")
|
||||||
log.add("logs/application.log", rotation="1 MB", retention="10 days")
|
log.add("logs/application.log", rotation="1 MB", retention="10 days")
|
||||||
|
|
||||||
|
|
||||||
@@ -75,12 +76,37 @@ class SemapDocument:
|
|||||||
phoneNumber: int = None
|
phoneNumber: int = None
|
||||||
mail: str = None
|
mail: str = None
|
||||||
title: str = None
|
title: str = None
|
||||||
|
title_suggestions: list[str] = None
|
||||||
semester: Union[str, Semester] = None
|
semester: Union[str, Semester] = None
|
||||||
books: list[Book] = None
|
books: list[Book] = None
|
||||||
eternal: bool = False
|
eternal: bool = False
|
||||||
personName: str = None
|
personName: str = None
|
||||||
personTitle: str = None
|
personTitle: str = None
|
||||||
|
title_length = 0
|
||||||
|
title_max_length = 0
|
||||||
|
|
||||||
|
def __post_init__(self):
    """Ensure ``title_suggestions`` is a list once the instance is built.

    The field is declared with a default of ``None``. When it is still
    ``None`` here it is replaced with a fresh empty list, so every instance
    owns its own list. A value supplied by the caller is left untouched
    instead of being silently discarded.
    """
    if self.title_suggestions is None:
        self.title_suggestions = []
|
||||||
|
|
||||||
|
@property
def nameSetter(self):
    """Normalise the person fields and check whether the title is too long.

    Although declared as a property, this is evaluated purely for its side
    effects (callers write ``semap.nameSetter``); it always yields ``None``.

    The current ``personTitle`` is passed to ``name_tester``, whose result is
    indexed with the keys ``"last_name"``, ``"first_name"`` and ``"title"``.

    Side effects:
        personName: set to ``"<last_name>, <first_name>"``.
        personTitle: replaced by the parsed ``"title"`` value when that value
            is not ``None``; otherwise left as it was.
        title_length: ``len(self.title) + 3`` plus the length of the part of
            ``personName`` before the first comma.
        title_max_length: set to ``38`` minus that name length, but only when
            ``title_length`` exceeds 40.
        title_suggestions: rebuilt on every access. It holds the
            ``"shortened_string"`` of each item returned by ``run_shortener``
            when the title is too long, and is empty otherwise.
    """
    data = name_tester(self.personTitle)
    self.personName = f"{data['last_name']}, {data['first_name']}"

    # Only overwrite the title when one was actually parsed; this avoids
    # referencing an unbound local when data["title"] is None.
    parsed_title = data["title"]
    if parsed_title is not None:
        self.personTitle = parsed_title

    name_len = len(self.personName.split(",")[0])
    # NOTE(review): the constant 3 is presumably the width of a separator
    # placed between title and name -- confirm.
    self.title_length = len(self.title) + 3 + name_len

    if self.title_length <= 40:
        self.title_suggestions = []
        return

    log.warning("Title is too long")
    # NOTE(review): a title that uses the full 38 - name_len budget still
    # gives title_length == 41, which is above the 40 limit checked above.
    # Values kept as in the original; confirm the intended limit.
    self.title_max_length = 38 - name_len
    # Assign a fresh list instead of appending, so reading the property more
    # than once does not accumulate duplicate suggestions.
    self.title_suggestions = [
        suggestion["shortened_string"]
        for suggestion in run_shortener(self.title, self.title_max_length)
    ]
|
||||||
@property
|
@property
|
||||||
def renameSemester(self) -> None:
|
def renameSemester(self) -> None:
|
||||||
if ", Dauer" in self.semester:
|
if ", Dauer" in self.semester:
|
||||||
@@ -88,8 +114,8 @@ class SemapDocument:
|
|||||||
self.eternal = True
|
self.eternal = True
|
||||||
self.semester = Semester().from_string(self.semester)
|
self.semester = Semester().from_string(self.semester)
|
||||||
else:
|
else:
|
||||||
logger.warning("Semester {} is not valid", self.semester)
|
log.warning("Semester {} is not valid", self.semester)
|
||||||
self.semester = None
|
self.semester = Semester().from_string(semester_converter(self.semester))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def signatures(self) -> list[str]:
|
def signatures(self) -> list[str]:
|
||||||
@@ -105,7 +131,7 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
|||||||
for table in tables:
|
for table in tables:
|
||||||
data = []
|
data = []
|
||||||
for row in table.rows:
|
for row in table.rows:
|
||||||
row_data = []
|
row_data: list[Any] = []
|
||||||
for cell in row.cells:
|
for cell in row.cells:
|
||||||
text = cell.text
|
text = cell.text
|
||||||
text = text.replace("\n", "")
|
text = text.replace("\n", "")
|
||||||
@@ -117,7 +143,6 @@ def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
|
|||||||
|
|
||||||
m_data.append(df)
|
m_data.append(df)
|
||||||
|
|
||||||
# for df[0, 1]: merge i and i+1 as key, value
|
|
||||||
|
|
||||||
return m_data
|
return m_data
|
||||||
|
|
||||||
@@ -220,6 +245,7 @@ def word_to_semap(word_path: str) -> SemapDocument:
|
|||||||
log.info("Parsing Word Document {}", word_path)
|
log.info("Parsing Word Document {}", word_path)
|
||||||
semap = SemapDocument()
|
semap = SemapDocument()
|
||||||
df = word_docx_to_csv(word_path)
|
df = word_docx_to_csv(word_path)
|
||||||
|
print(df)
|
||||||
apparatdata = df[0]
|
apparatdata = df[0]
|
||||||
apparatdata = apparatdata.to_dict()
|
apparatdata = apparatdata.to_dict()
|
||||||
|
|
||||||
@@ -238,6 +264,8 @@ def word_to_semap(word_path: str) -> SemapDocument:
|
|||||||
semap.title = appdata["Veranstaltung:"]
|
semap.title = appdata["Veranstaltung:"]
|
||||||
semap.semester = appdata["Semester:"]
|
semap.semester = appdata["Semester:"]
|
||||||
semap.renameSemester
|
semap.renameSemester
|
||||||
|
semap.nameSetter
|
||||||
|
|
||||||
books = df[2]
|
books = df[2]
|
||||||
booklist = []
|
booklist = []
|
||||||
for i in range(len(books)):
|
for i in range(len(books)):
|
||||||
@@ -254,7 +282,6 @@ def word_to_semap(word_path: str) -> SemapDocument:
|
|||||||
booklist.append(book)
|
booklist.append(book)
|
||||||
log.info("Found {} books", len(booklist))
|
log.info("Found {} books", len(booklist))
|
||||||
semap.books = booklist
|
semap.books = booklist
|
||||||
|
|
||||||
return semap
|
return semap
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user