Files
SemesterapparatsManager/src/logic/wordparser.py
2025-04-28 10:18:07 +02:00

261 lines
7.8 KiB
Python

import pandas as pd
from docx import Document
from dataclasses import dataclass
import sys
from loguru import logger as log
# Module-wide logging setup. ``logger`` and ``log`` are two names for the same
# loguru logger object, so every sink added below receives every message.
logger = log
# Remove the handlers registered so far so that only the sinks below are active.
logger.remove()
# Parser-specific log file, rotated weekly; enqueue=True passes records through
# a queue before they reach the sink.
logger.add("logs/wordparser.log", rotation="1 week", enqueue=True)
# Application log file, rotated daily; rotated files are zip-compressed.
log.add(
    # NOTE(review): the f-prefix is redundant, the string has no placeholders.
    f"logs/application.log",
    rotation="1 day",
    compression="zip",
    enqueue=True,
)
# Disabled alternative sink, kept for reference:
# logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")
# Also write log output to stdout.
logger.add(sys.stdout)
# Upper- and lower-case ASCII letters; not referenced in the visible part of
# this module.
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
@dataclass
class Book:
    """One book entry as read from a row of the Word form's book table.

    Every field holds the raw text taken from the document. ``None`` means
    the field was never populated, ``""`` means the cell was empty.
    """

    author: str = None
    year: str = None
    edition: str = None
    title: str = None
    location: str = None
    publisher: str = None
    signature: str = None
    internal_notes: str = None

    @property
    def has_signature(self) -> bool:
        """Return True when a non-empty signature is set."""
        return self.signature is not None and self.signature != ""

    @property
    def is_empty(self) -> bool:
        """Return True when no field carries any content.

        Both ``None`` and ``""`` count as empty. Comparing against ``""``
        alone would report a freshly created ``Book`` - or one built from a
        table that lacks one of the columns - as non-empty.
        """
        return all(
            value is None or value == ""
            for value in (
                self.author,
                self.year,
                self.edition,
                self.title,
                self.location,
                self.publisher,
                self.signature,
                self.internal_notes,
            )
        )

    def from_dict(self, data: dict) -> None:
        """Populate the fields from a mapping of column header to cell text.

        Recognised keys are the German column headers of the form; unknown
        keys are ignored. ``Jahr/Auflage`` is split at ``/`` into year and
        edition, ``Ort und Verlag`` at ``,`` into location and publisher;
        without the separator the whole text goes into the first field and
        the second becomes ``""``.

        Args:
            data: Mapping of column header to cell text.
        """
        for key, value in data.items():
            # A cell holding exactly five EN SPACE characters is treated as
            # empty (presumably Word's placeholder for an unfilled form
            # field). A missing cell (None) is normalised the same way so
            # the split logic below cannot fail on it.
            if value is None or value == "\u2002\u2002\u2002\u2002\u2002":
                value = ""
            if key == "Autorenname(n):Nachname, Vorname":
                self.author = value
            elif key == "Jahr/Auflage":
                parts = value.split("/")
                self.year = parts[0]
                self.edition = parts[1] if len(parts) > 1 else ""
            elif key == "Titel":
                self.title = value
            elif key == "Ort und Verlag":
                parts = value.split(",")
                self.location = parts[0]
                self.publisher = parts[1] if len(parts) > 1 else ""
            elif key == "Standnummer":
                self.signature = value
            elif key == "Interne Vermerke":
                self.internal_notes = value
@dataclass
class SemapDocument:
    """Header data and book list extracted from one Word document."""

    subject: str = None
    phoneNumber: int = None
    mail: str = None
    title: str = None
    semester: str = None
    books: list[Book] = None

    @property
    def renameSemester(self) -> None:
        """Shorten the semester name in place; always evaluates to ``None``.

        A semester containing "sommersemester" (case-insensitive) becomes
        ``SoSe <last word>``, one containing "wintersemester" becomes
        ``WiSe <last word>``. Anything else, including ``None``, is left
        unchanged. Note that merely accessing this property mutates the
        object.
        """
        if self.semester is None:
            return
        lowered = self.semester.lower()
        # The last space-separated token is carried over as the year part.
        year = self.semester.split(" ")[-1]
        if "sommersemester" in lowered:
            self.semester = f"SoSe {year}"
        elif "wintersemester" in lowered:
            self.semester = f"WiSe {year}"

    @property
    def signatures(self) -> list[str]:
        """Return the signatures of all books that have one, in book order."""
        if self.books is None:
            return []
        found = []
        for book in self.books:
            if book.has_signature:
                found.append(book.signature)
        return found
def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
    """Convert every table of a Word document into a DataFrame.

    Line breaks inside a cell are removed. The first row of each table
    becomes the column header of its frame and is dropped from the data
    rows.

    Args:
        path: Location of the ``.docx`` file.

    Returns:
        One DataFrame per table, in document order.
    """
    frames = []
    for table in Document(path).tables:
        rows = [
            [cell.text.replace("\n", "") for cell in row.cells]
            for row in table.rows
        ]
        frame = pd.DataFrame(rows)
        # Promote the first table row to the header, then keep only the rest.
        frame.columns = frame.iloc[0]
        frames.append(frame.iloc[1:])
    return frames
def makeDict():
    """Return a new record dict with every known field set to ``None``."""
    field_names = (
        "work_author",
        "section_author",
        "year",
        "edition",
        "work_title",
        "chapter_title",
        "location",
        "publisher",
        "signature",
        "issue",
        "pages",
        "isbn",
        "type",
    )
    # dict.fromkeys keeps the order above and defaults every value to None.
    return dict.fromkeys(field_names)
# Column position of each record field, keyed by document type. The two
# multi-author layouts store "pages" one column before "signature".
_COLUMN_LAYOUTS = {
    "Monografien": {
        "work_author": 0,
        "year": 1,
        "edition": 2,
        "work_title": 3,
        "location": 4,
        "publisher": 5,
        "signature": 6,
        "pages": 7,
    },
    "Herausgeberwerke": {
        "section_author": 0,
        "year": 1,
        "edition": 2,
        "chapter_title": 3,
        "work_author": 4,
        "work_title": 5,
        "location": 6,
        "publisher": 7,
        "pages": 8,
        "signature": 9,
    },
    "Zeitschriftenaufsätze": {
        "section_author": 0,
        "year": 1,
        "issue": 2,
        "chapter_title": 3,
        "work_title": 4,
        "location": 5,
        "publisher": 6,
        "pages": 7,
        "signature": 8,
    },
}


def tuple_to_dict(tlist: list[tuple], type: str) -> list[dict]:
    """Turn table rows into record dicts according to the document type.

    Args:
        tlist: Rows of the table, each an indexable sequence of cell texts.
        type: Document type; one of "Monografien", "Herausgeberwerke" or
            "Zeitschriftenaufsätze". (The name shadows the builtin but is
            kept because it is part of the public signature.)

    Returns:
        One dict per row with the keys produced by ``makeDict``. Fields the
        layout does not cover stay ``None``. For an unknown document type
        every field, including ``"type"``, stays ``None``.

    Raises:
        IndexError: If a row has fewer cells than the layout requires.
    """
    layout = _COLUMN_LAYOUTS.get(type)
    ret = []
    for line in tlist:
        data = makeDict()
        if layout is not None:
            data["type"] = type
            for field, column in layout.items():
                data[field] = line[column]
        ret.append(data)
    return ret
def elsa_word_to_csv(path) -> tuple[list[dict], str]:
    """Read the first table of a Word document as a list of record dicts.

    The text of the last non-empty paragraph is taken as the document type
    and decides how the columns are mapped (see ``tuple_to_dict``). Line
    breaks and EN SPACE characters are stripped from every cell, the first
    table row is used as header, and rows that consist only of empty cells
    are dropped.

    Args:
        path: Location of the ``.docx`` file.

    Returns:
        A ``(records, doctype)`` tuple.

    Raises:
        IndexError: If the document has no non-empty paragraph, no table,
            or the first table has no rows.
        KeyError: If a data row exists and the document type is not one of
            the three known types.
    """
    doc = Document(path)
    doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
    # What an all-empty row looks like for each document type; the lengths
    # are the column counts the comparison below expects.
    tuples = {
        "Monografien": ("", "", "", "", "", "", "", "", ""),
        "Herausgeberwerke": ("", "", "", "", "", "", "", "", "", "", ""),
        "Zeitschriftenaufsätze": ("", "", "", "", "", "", "", "", "", ""),
    }
    # Only the first table is evaluated, so later tables are not parsed and
    # cannot abort the import.
    table = doc.tables[0]
    rows = [
        [cell.text.replace("\n", "").replace("\u2002", "") for cell in row.cells]
        for row in table.rows
    ]
    df = pd.DataFrame(rows)
    # Promote the first table row to the header, then keep only the rest.
    df.columns = df.iloc[0]
    df = df.iloc[1:]
    data = [
        row for row in df.itertuples(index=False, name=None) if row != tuples[doctype]
    ]
    return tuple_to_dict(data, doctype), doctype
def _header_pairs(frame: pd.DataFrame) -> dict:
    """Pair up the column labels of *frame* as ``{label: value}``.

    The header of the frame is read as alternating label and value cells:
    labels at even positions, their values right after them.
    """
    # Take the labels from ``columns`` instead of ``to_dict()``: a dict
    # collapses duplicate labels (e.g. two unfilled value cells with the
    # same text), which would shift every following label/value pair.
    labels = list(frame.columns)
    # Stopping at len - 1 ignores a trailing label that has no value cell.
    return {labels[i]: labels[i + 1] for i in range(0, len(labels) - 1, 2)}


def word_to_semap(word_path: str) -> SemapDocument:
    """Parse a Word document into a ``SemapDocument``.

    The first table supplies phone number, subject and mail address, the
    second the course title and semester, and the third the book rows.
    Book rows without a signature are skipped.

    Args:
        word_path: Location of the ``.docx`` file.

    Returns:
        The populated ``SemapDocument``.

    Raises:
        IndexError: If the document has fewer than three tables.
        KeyError: If one of the expected labels is missing.
    """
    logger.info("Parsing Word Document {}", word_path)
    semap = SemapDocument()
    tables = word_docx_to_csv(word_path)

    contact = _header_pairs(tables[0])
    semap.phoneNumber = contact["Telefon:"]
    semap.subject = contact["Ihr Fach:"]
    semap.mail = contact["Mailadresse:"]

    course = _header_pairs(tables[1])
    semap.title = course["Veranstaltung:"]
    semap.semester = course["Semester:"]
    # renameSemester is a property with a side effect: accessing it rewrites
    # semap.semester to its short form.
    semap.renameSemester

    booklist = []
    for _, row in tables[2].iterrows():
        if row.isnull().all():
            continue
        book = Book()
        book.from_dict(row.to_dict())
        if book.is_empty or not book.has_signature:
            continue
        booklist.append(book)
    logger.info("Found {} books", len(booklist))
    semap.books = booklist
    return semap
# Manual check: parse one hard-coded local document when run as a script.
if __name__ == "__main__":
    # NOTE(review): the path is machine-specific and the result is not used
    # any further in the visible code.
    else_df = word_to_semap(
        "C:/Users/aky547/Desktop/SA 80 titelmeldung_SoSe2025 Burth.docx"
    )