Files
SemesterapparatsManager/src/logic/wordparser.py

316 lines
9.9 KiB
Python

import sys
import zipfile
from dataclasses import dataclass
from typing import Any, Union
import loguru
import pandas as pd
from bs4 import BeautifulSoup
from docx import Document
from src import LOG_DIR
from src.backend import Semester
from src.logic.openai import name_tester, run_shortener, semester_converter
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(f"{LOG_DIR}/application.log", rotation="1 MB", retention="10 days")
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
@dataclass
class Book:
author: str = None
year: str = None
edition: str = None
title: str = None
location: str = None
publisher: str = None
signature: str = None
internal_notes: str = None
@property
def has_signature(self) -> bool:
return self.signature is not None and self.signature != ""
@property
def is_empty(self) -> bool:
return all(
[
self.author == "",
self.year == "",
self.edition == "",
self.title == "",
self.location == "",
self.publisher == "",
self.signature == "",
self.internal_notes == "",
]
)
def from_dict(self, data: dict[str, Any]):
for key, value in data.items():
value = value.strip()
if value == "\u2002\u2002\u2002\u2002\u2002":
value = ""
if key == "Autorenname(n):Nachname, Vorname":
self.author = value
elif key == "Jahr/Auflage":
self.year = value.split("/")[0] if "/" in value else value
self.edition = value.split("/")[1] if "/" in value else ""
elif key == "Titel":
self.title = value
elif key == "Ort und Verlag":
self.location = value.split(",")[0] if "," in value else value
self.publisher = value.split(",")[1] if "," in value else ""
elif key == "Standnummer":
self.signature = value.strip()
elif key == "Interne Vermerke":
self.internal_notes = value
@dataclass
class SemapDocument:
subject: str = None
phoneNumber: int = None
mail: str = None
title: str = None
title_suggestions: list[str] = None
semester: Union[str, Semester] = None
books: list[Book] = None
eternal: bool = False
personName: str = None
personTitle: str = None
title_length = 0
title_max_length = 0
def __post_init__(self):
self.title_suggestions = []
@property
def nameSetter(self):
data = name_tester(self.personTitle)
name = f"{data['last_name']}, {data['first_name']}"
if data["title"] is not None:
title = data["title"]
self.personTitle = title
self.personName = name
self.title_length = len(self.title) + 3 + len(self.personName.split(",")[0])
if self.title_length > 40:
log.warning("Title is too long")
name_len = len(self.personName.split(",")[0])
self.title_max_length = 38 - name_len
suggestions = run_shortener(self.title, self.title_max_length)
for suggestion in suggestions:
self.title_suggestions.append(suggestion["shortened_string"])
else:
self.title_suggestions = []
pass
@property
def renameSemester(self) -> None:
if ", Dauer" in self.semester:
self.semester = self.semester.split(",")[0]
self.eternal = True
self.semester = Semester().from_string(self.semester)
else:
log.warning("Semester {} is not valid", self.semester)
self.semester = Semester().from_string(semester_converter(self.semester))
@property
def signatures(self) -> list[str]:
if self.books is not None:
return [book.signature for book in self.books if book.has_signature]
return []
def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
doc = Document(path)
tables = doc.tables
m_data = []
for table in tables:
data = []
for row in table.rows:
row_data: list[Any] = []
for cell in row.cells:
text = cell.text
text = text.replace("\n", "")
row_data.append(text)
if text == "Ihr Fach:":
row_data.append(get_fach(path))
data.append(row_data)
df = pd.DataFrame(data)
df.columns = df.iloc[0]
df = df.iloc[1:]
m_data.append(df)
return m_data
def get_fach(path: str) -> str:
document = zipfile.ZipFile(path)
xml_data = document.read("word/document.xml")
document.close()
soup = BeautifulSoup(xml_data, "xml")
# text we need is in <w:p w14:paraId="12456A32" ... > -> w:r -> w:t
paragraphs = soup.find_all("w:p")
names = []
for para in paragraphs:
para_id = para.get("w14:paraId")
if para_id == "12456A32":
# get the data in the w:t
for run in para.find_all("w:r"):
data = run.find("w:t")
return data.contents[0]
def makeDict():
return {
"work_author": None,
"section_author": None,
"year": None,
"edition": None,
"work_title": None,
"chapter_title": None,
"location": None,
"publisher": None,
"signature": None,
"issue": None,
"pages": None,
"isbn": None,
"type": None,
}
def tuple_to_dict(tlist: tuple, type: str) -> dict:
ret = []
for line in tlist:
data = makeDict()
if type == "Monografien":
data["type"] = type
data["work_author"] = line[0]
data["year"] = line[1]
data["edition"] = line[2]
data["work_title"] = line[3]
data["location"] = line[4]
data["publisher"] = line[5]
data["signature"] = line[6]
data["pages"] = line[7]
elif type == "Herausgeberwerke":
data["type"] = type
data["section_author"] = line[0]
data["year"] = line[1]
data["edition"] = line[2]
data["chapter_title"] = line[3]
data["work_author"] = line[4]
data["work_title"] = line[5]
data["location"] = line[6]
data["publisher"] = line[7]
data["signature"] = line[9]
data["pages"] = line[8]
elif type == "Zeitschriftenaufsätze":
data["type"] = type
data["section_author"] = line[0]
data["year"] = line[1]
data["issue"] = line[2]
data["chapter_title"] = line[3]
data["work_title"] = line[4]
data["location"] = line[5]
data["publisher"] = line[6]
data["signature"] = line[8]
data["pages"] = line[7]
ret.append(data)
return ret
def elsa_word_to_csv(path: str):
doc = Document(path)
# # print all lines in doc
doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
tuples = {
"Monografien": ("", "", "", "", "", "", "", "", ""),
"Herausgeberwerke": ("", "", "", "", "", "", "", "", "", "", ""),
"Zeitschriftenaufsätze": ("", "", "", "", "", "", "", "", "", ""),
}
tables = doc.tables
m_data: list[pd.DataFrame] = []
for table in tables:
data: list[list[str]] = []
for row in table.rows:
row_data: list[str] = []
for cell in row.cells:
text = cell.text
text = text.replace("\n", "")
text = text.replace("\u2002", "")
row_data.append(text)
data.append(row_data)
df = pd.DataFrame(data)
df.columns = df.iloc[0]
df = df.iloc[1:]
m_data.append(df)
df = m_data[0]
# split df to rows
data = [
row for row in df.itertuples(index=False, name=None) if row != tuples[doctype]
]
# log.debug(data)
return tuple_to_dict(data, doctype), doctype
def word_to_semap(word_path: str) -> SemapDocument:
log.info("Parsing Word Document {}", word_path)
semap = SemapDocument()
df = word_docx_to_csv(word_path)
apparatdata = df[0]
apparatdata = apparatdata.to_dict()
keys = list(apparatdata.keys())
print(apparatdata, keys)
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys) - 1, 2)}
semap.phoneNumber = appdata["Telefon:"]
semap.subject = appdata["Ihr Fach:"]
semap.mail = appdata["Mailadresse:"]
semap.personName = ",".join(appdata["Ihr Name und Titel:"].split(",")[:-1])
semap.personTitle = ",".join(appdata["Ihr Name und Titel:"].split(",")[-1:]).strip()
apparatdata = df[1]
apparatdata = apparatdata.to_dict()
keys = list(apparatdata.keys())
appdata = {keys[i]: keys[i + 1] for i in range(0, len(keys), 2)}
semap.title = appdata["Veranstaltung:"]
semap.semester = appdata["Semester:"]
semap.renameSemester
semap.nameSetter
books = df[2]
booklist = []
for i in range(len(books)):
if books.iloc[i].isnull().all():
continue
data = books.iloc[i].to_dict()
book = Book()
book.from_dict(data)
if book.is_empty:
continue
elif not book.has_signature:
continue
else:
booklist.append(book)
log.info("Found {} books", len(booklist))
semap.books = booklist
return semap
if __name__ == "__main__":
else_df = elsa_word_to_csv(
"C:/Users/aky547/Desktop/ELSA_Bestellung Scann Der Westen und der Rest.docx"
)
print(else_df)