316 lines
9.9 KiB
Python
316 lines
9.9 KiB
Python
import sys
|
|
import zipfile
|
|
from dataclasses import dataclass
|
|
from typing import Any, Union
|
|
|
|
import loguru
|
|
import pandas as pd
|
|
from bs4 import BeautifulSoup
|
|
from docx import Document
|
|
|
|
from src import LOG_DIR
|
|
from src.backend import Semester
|
|
from src.logic.openai import name_tester, run_shortener, semester_converter
|
|
|
|
# Module-wide logger. All previously registered handlers are removed first,
# then two sinks are attached: stdout (INFO and above) and a log file under
# LOG_DIR that is rotated at 1 MB and kept for 10 days.
log = loguru.logger
log.remove()
log.add(sys.stdout, level="INFO")
log.add(
    f"{LOG_DIR}/application.log",
    rotation="1 MB",
    retention="10 days",
)

# Upper-case followed by lower-case ASCII letters.
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
|
|
|
|
|
@dataclass
class Book:
    """Bibliographic record for one book.

    Every field defaults to ``None`` until it is filled in, normally by
    :meth:`from_dict`.
    """

    author: Union[str, None] = None
    year: Union[str, None] = None
    edition: Union[str, None] = None
    title: Union[str, None] = None
    location: Union[str, None] = None
    publisher: Union[str, None] = None
    signature: Union[str, None] = None
    internal_notes: Union[str, None] = None

    @property
    def has_signature(self) -> bool:
        """Return True when a non-empty signature is set."""
        return bool(self.signature)

    @property
    def is_empty(self) -> bool:
        """Return True when no field holds a value.

        Fields that were never populated (``None``) count as empty just like
        empty strings, so a record built from a mapping that lacks some keys
        is still recognised as blank.
        """
        return not any(
            (
                self.author,
                self.year,
                self.edition,
                self.title,
                self.location,
                self.publisher,
                self.signature,
                self.internal_notes,
            )
        )

    def from_dict(self, data: dict[str, Any]) -> None:
        """Populate the fields from a mapping of column labels to cell text.

        Recognised keys are ``"Autorenname(n):Nachname, Vorname"``,
        ``"Jahr/Auflage"``, ``"Titel"``, ``"Ort und Verlag"``,
        ``"Standnummer"`` and ``"Interne Vermerke"``; any other key is
        ignored.

        Args:
            data: Column label -> cell value. ``None`` values are treated as
                empty text; other values are converted with ``str``.
        """
        for key, raw in data.items():
            # str.strip() removes all Unicode whitespace, which includes the
            # EN SPACE (U+2002) padding of untouched form fields, so such
            # placeholder cells end up as "".
            value = "" if raw is None else str(raw).strip()

            if key == "Autorenname(n):Nachname, Vorname":
                self.author = value
            elif key == "Jahr/Auflage":
                # "year/edition"; the edition part is optional.
                parts = value.split("/")
                self.year = parts[0].strip()
                self.edition = parts[1].strip() if len(parts) > 1 else ""
            elif key == "Titel":
                self.title = value
            elif key == "Ort und Verlag":
                # "location, publisher"; the publisher part is optional.
                parts = value.split(",")
                self.location = parts[0].strip()
                self.publisher = parts[1].strip() if len(parts) > 1 else ""
            elif key == "Standnummer":
                self.signature = value
            elif key == "Interne Vermerke":
                self.internal_notes = value
|
|
|
|
|
|
@dataclass
class SemapDocument:
    """Data collected from one semester-apparatus order form.

    ``renameSemester`` and ``nameSetter`` are properties that are read purely
    for their side effects: they normalise ``semester`` and the person/title
    fields in place and return ``None``.
    """

    subject: str = None
    phoneNumber: int = None
    mail: str = None
    title: str = None
    title_suggestions: list[str] = None
    # Raw semester text until ``renameSemester`` replaces it with a Semester.
    semester: Union[str, Semester] = None
    books: list[Book] = None
    # Set to True by ``renameSemester`` when the semester text contains ", Dauer".
    eternal: bool = False
    personName: str = None
    personTitle: str = None
    # Not annotated, hence plain class attributes rather than dataclass
    # fields; ``nameSetter`` overrides them per instance.
    title_length = 0
    title_max_length = 0

    def __post_init__(self):
        """Give the instance its own suggestion list unless one was passed in."""
        if self.title_suggestions is None:
            self.title_suggestions = []

    @property
    def nameSetter(self):
        """Normalise the person fields and prepare shortened-title suggestions.

        ``name_tester`` is called with ``personTitle``; its result is read as
        a mapping with ``last_name``, ``first_name`` and ``title`` entries.
        ``personName`` becomes ``"<last_name>, <first_name>"`` and
        ``personTitle`` is replaced when a title was returned.

        ``title_length`` is the course title length plus 3 plus the length of
        the last name. When it exceeds 40, ``run_shortener`` is asked for
        titles of at most ``38 - len(last name)`` characters and
        ``title_suggestions`` is replaced with their ``shortened_string``
        values; otherwise ``title_suggestions`` is emptied.
        """
        data = name_tester(self.personTitle)
        name = f"{data['last_name']}, {data['first_name']}"
        if data["title"] is not None:
            self.personTitle = data["title"]
        self.personName = name

        last_name_len = len(self.personName.split(",")[0])
        self.title_length = len(self.title) + 3 + last_name_len
        if self.title_length > 40:
            log.warning("Title is too long")
            self.title_max_length = 38 - last_name_len
            suggestions = run_shortener(self.title, self.title_max_length)
            # Replace rather than extend, so reading the property again does
            # not accumulate duplicate suggestions.
            self.title_suggestions = [
                suggestion["shortened_string"] for suggestion in suggestions
            ]
        else:
            self.title_suggestions = []

    @property
    def renameSemester(self) -> None:
        """Convert the raw ``semester`` text into a ``Semester`` object.

        Text containing ``", Dauer"`` is cut at the first comma, marks the
        document as ``eternal`` and is parsed directly. Any other text is
        logged with a warning and passed through ``semester_converter``
        before parsing.
        """
        if ", Dauer" in self.semester:
            self.semester = self.semester.split(",")[0]
            self.eternal = True
            self.semester = Semester().from_string(self.semester)
        else:
            log.warning("Semester {} is not valid", self.semester)
            self.semester = Semester().from_string(semester_converter(self.semester))

    @property
    def signatures(self) -> list[str]:
        """Return the signatures of all books that have one."""
        if self.books is None:
            return []
        return [book.signature for book in self.books if book.has_signature]
|
|
|
|
|
|
def word_docx_to_csv(path: str) -> list[pd.DataFrame]:
    """Read every table of a Word document into a DataFrame.

    Despite the name, nothing is written to CSV. For each table the cell
    texts (with newlines removed) are collected row by row; the first row
    becomes the column labels and the remaining rows the data. Directly after
    a cell whose text is ``"Ihr Fach:"`` the value returned by ``get_fach`` is
    inserted as an additional cell.

    Args:
        path: Path of the ``.docx`` file.

    Returns:
        One DataFrame per table, in document order.
    """
    frames = []
    for table in Document(path).tables:
        rows = []
        for row in table.rows:
            cells: list[Any] = []
            for cell in row.cells:
                label = cell.text.replace("\n", "")
                cells.append(label)
                if label == "Ihr Fach:":
                    cells.append(get_fach(path))
            rows.append(cells)

        frame = pd.DataFrame(rows)
        # Promote the first row to the header and keep the rest as data.
        frame.columns = frame.iloc[0]
        frames.append(frame.iloc[1:])

    return frames
|
|
|
|
|
|
def get_fach(path: str, para_id: str = "12456A32") -> Union[str, None]:
    """Return the text of one specific paragraph of a ``.docx`` file.

    The document body (``word/document.xml``) is parsed and the paragraph
    whose ``w14:paraId`` attribute equals ``para_id`` is looked up. The
    content of the first ``w:t`` element found in that paragraph's runs is
    returned.

    Args:
        path: Path of the ``.docx`` file.
        para_id: ``w14:paraId`` of the paragraph to read. The default is the
            id this module has always looked for.

    Returns:
        The text node as found in the parsed tree, or ``None`` when there is
        no such paragraph or it contains no text.
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(path) as archive:
        xml_data = archive.read("word/document.xml")

    soup = BeautifulSoup(xml_data, "xml")
    # The text is located at <w:p w14:paraId="..."> -> w:r -> w:t
    for para in soup.find_all("w:p"):
        if para.get("w14:paraId") != para_id:
            continue
        for run in para.find_all("w:r"):
            text_node = run.find("w:t")
            # Skip runs without (non-empty) text instead of failing on them.
            if text_node is not None and text_node.contents:
                return text_node.contents[0]
    return None
|
|
|
|
|
|
def makeDict():
    """Return a new record dict with every known field set to ``None``."""
    field_names = (
        "work_author",
        "section_author",
        "year",
        "edition",
        "work_title",
        "chapter_title",
        "location",
        "publisher",
        "signature",
        "issue",
        "pages",
        "isbn",
        "type",
    )
    return dict.fromkeys(field_names)
|
|
|
|
|
|
def tuple_to_dict(tlist: list[tuple], type: str) -> list[dict]:
    """Convert table rows into record dicts according to the document type.

    Args:
        tlist: Rows of the table, each an indexable sequence of cell values.
        type: Document type. ``"Monografien"``, ``"Herausgeberwerke"`` and
            ``"Zeitschriftenaufsätze"`` are recognised.

    Returns:
        One dict (as produced by ``makeDict``) per row. For a recognised
        type the mapped fields and ``"type"`` are filled in; for any other
        type every field stays ``None``.

    Raises:
        IndexError: If a row has fewer cells than the layout of its type
            requires.
    """
    # Column index of every field, per document type.
    column_layouts = {
        "Monografien": {
            "work_author": 0,
            "year": 1,
            "edition": 2,
            "work_title": 3,
            "location": 4,
            "publisher": 5,
            "signature": 6,
            "pages": 7,
        },
        "Herausgeberwerke": {
            "section_author": 0,
            "year": 1,
            "edition": 2,
            "chapter_title": 3,
            "work_author": 4,
            "work_title": 5,
            "location": 6,
            "publisher": 7,
            "pages": 8,
            "signature": 9,
        },
        "Zeitschriftenaufsätze": {
            "section_author": 0,
            "year": 1,
            "issue": 2,
            "chapter_title": 3,
            "work_title": 4,
            "location": 5,
            "publisher": 6,
            "pages": 7,
            "signature": 8,
        },
    }
    layout = column_layouts.get(type)

    ret = []
    for line in tlist:
        data = makeDict()
        if layout is not None:
            data["type"] = type
            for field_name, index in layout.items():
                data[field_name] = line[index]
        ret.append(data)
    return ret
|
|
|
|
|
|
def elsa_word_to_csv(path: str):
    """Parse an ELSA order form into record dicts.

    The document type is the text of the last non-empty paragraph. Only the
    first table of the document is read: its first row supplies the column
    labels, and every following row that contains at least one non-empty
    cell becomes one record. Newlines and EN SPACE (U+2002) characters are
    removed from the cell texts.

    Args:
        path: Path of the ``.docx`` file.

    Returns:
        A tuple ``(records, doctype)`` where ``records`` is the result of
        ``tuple_to_dict`` for the non-blank rows.

    Raises:
        IndexError: If the document has no non-empty paragraph or no table.
        KeyError: If the document type is not one of the supported types.
    """
    doc = Document(path)
    doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
    supported_types = ("Monografien", "Herausgeberwerke", "Zeitschriftenaufsätze")
    if doctype not in supported_types:
        raise KeyError(f"Unsupported ELSA document type: {doctype!r}")

    # Only the first table carries the ordered titles.
    table = doc.tables[0]
    rows: list[list[str]] = [
        [cell.text.replace("\n", "").replace("\u2002", "") for cell in row.cells]
        for row in table.rows
    ]
    df = pd.DataFrame(rows)
    df.columns = df.iloc[0]
    df = df.iloc[1:]

    # Drop blank rows regardless of how many columns the table has.
    data = [row for row in df.itertuples(index=False, name=None) if any(row)]
    return tuple_to_dict(data, doctype), doctype
|
|
|
|
|
|
def _label_value_pairs(frame: pd.DataFrame) -> dict:
    """Pair up a table's header cells as ``{label: value, ...}``."""
    # Read the labels from the column index, not from ``to_dict()``: a dict
    # would collapse repeated cells (e.g. two empty values) and shift the
    # pairing. A trailing label without a value is ignored.
    labels = list(frame.columns)
    return {labels[i]: labels[i + 1] for i in range(0, len(labels) - 1, 2)}


def word_to_semap(word_path: str) -> SemapDocument:
    """Build a ``SemapDocument`` from a semester-apparatus order form.

    The tables returned by ``word_docx_to_csv`` are interpreted by position:
    the header cells of the first table hold the contact data, those of the
    second the course title and semester, and the rows of the third the
    ordered books. Only books with a signature are kept.

    Args:
        word_path: Path of the ``.docx`` file.

    Returns:
        The populated document.

    Raises:
        IndexError: If the document has fewer than three tables.
        KeyError: If an expected label is missing from the first two tables.
    """
    log.info("Parsing Word Document {}", word_path)
    semap = SemapDocument()
    tables = word_docx_to_csv(word_path)

    contact = _label_value_pairs(tables[0])
    log.debug("Contact table: {}", contact)
    semap.phoneNumber = contact["Telefon:"]
    semap.subject = contact["Ihr Fach:"]
    semap.mail = contact["Mailadresse:"]
    # Everything before the last comma is the name, the rest the title.
    name_parts = contact["Ihr Name und Titel:"].split(",")
    semap.personName = ",".join(name_parts[:-1])
    semap.personTitle = name_parts[-1].strip()

    course = _label_value_pairs(tables[1])
    semap.title = course["Veranstaltung:"]
    semap.semester = course["Semester:"]
    # Both are properties that are read only for their side effects.
    semap.renameSemester
    semap.nameSetter

    booklist = []
    for _, row in tables[2].iterrows():
        if row.isnull().all():
            continue
        # Null cells in an otherwise filled row are passed on as empty text.
        cells = {
            key: "" if pd.isna(value) else value
            for key, value in row.to_dict().items()
        }
        book = Book()
        book.from_dict(cells)
        # A book without a signature is skipped; this covers empty books too.
        if book.has_signature:
            booklist.append(book)
    log.info("Found {} books", len(booklist))
    semap.books = booklist
    return semap
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: parse the ELSA order form given as the first
    # command-line argument, or the developer's sample file when none is given.
    sample_path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else "C:/Users/aky547/Desktop/ELSA_Bestellung Scann Der Westen und der Rest.docx"
    )
    elsa_result = elsa_word_to_csv(sample_path)
    print(elsa_result)
|