rework elsa file parser, add function to add dicts

This commit is contained in:
WorldTeacher
2024-06-27 13:18:19 +02:00
parent 84f9d69a61
commit 115248098a

View File

@@ -1,5 +1,6 @@
import pandas as pd import pandas as pd
from docx import Document from docx import Document
import re
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
def word_docx_to_csv(path) -> pd.DataFrame: def word_docx_to_csv(path) -> pd.DataFrame:
@@ -24,12 +25,72 @@ def word_docx_to_csv(path) -> pd.DataFrame:
df = m_data[2] df = m_data[2]
return df return df
def makeDict():
    """Return a new record dict with every known field preset to ``None``.

    A fresh dict is built on each call, so callers may fill it in freely
    without affecting other records.
    """
    # Order matters only for display; it mirrors the original literal.
    field_names = (
        "work_author",
        "section_author",
        "year",
        "edition",
        "work_title",
        "chapter_title",
        "location",
        "publisher",
        "signature",
        "issue",
        "pages",
        "isbn",
        "type",
    )
    return dict.fromkeys(field_names)
# Column index of each record field within a table row, keyed by document
# type. Note that for the last two types "pages" precedes "signature" in the
# row, which is why their indices are not in field order.
_COLUMN_LAYOUTS = {
    "Monographie": (
        ("work_author", 0),
        ("year", 1),
        ("edition", 2),
        ("work_title", 3),
        ("location", 4),
        ("publisher", 5),
        ("signature", 6),
        ("pages", 7),
    ),
    "Herausgeberwerke": (
        ("section_author", 0),
        ("year", 1),
        ("edition", 2),
        ("chapter_title", 3),
        ("work_author", 4),
        ("work_title", 5),
        ("location", 6),
        ("publisher", 7),
        ("pages", 8),
        ("signature", 9),
    ),
    "Zeitschriftenaufsätze": (
        ("section_author", 0),
        ("year", 1),
        ("issue", 2),
        ("chapter_title", 3),
        ("work_title", 4),
        ("location", 5),
        ("publisher", 6),
        ("pages", 7),
        ("signature", 8),
    ),
}


def tuple_to_dict(tlist: list[tuple], type: str) -> list[dict]:
    """Convert table rows into record dicts according to the document type.

    Args:
        tlist: Rows to convert; each row is indexed by column position.
        type: Document type selecting the column layout. Recognised values
            are "Monographie", "Herausgeberwerke" and
            "Zeitschriftenaufsätze". The name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        One dict per row, in input order, built from ``makeDict()``. Fields
        not covered by the layout stay ``None``. For an unrecognised
        ``type`` every field, including ``"type"``, stays ``None``.

    Raises:
        IndexError: If a row has fewer columns than the layout requires.
    """
    layout = _COLUMN_LAYOUTS.get(type)
    ret = []
    for line in tlist:
        data = makeDict()
        if layout is not None:
            data["type"] = type
            for field, column in layout:
                data[field] = line[column]
        ret.append(data)
    return ret
def elsa_word_to_csv(path) -> list[tuple]: def elsa_word_to_csv(path):
doc = Document(path) doc = Document(path)
# print all lines in doc # print all lines in doc
doctype = [para.text for para in doc.paragraphs if para.text != ""][-1] doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
tuples = {
"Monographie": ("", "", "", "", "", "", "", "", ""),
"Herausgeberwerke": ("", "", "", "", "", "", "", "", "", "", ""),
"Zeitschriftenaufsätze": ("", "", "", "", "", "", "", "", "", ""),
}
tables = doc.tables tables = doc.tables
m_data = [] m_data = []
@@ -50,16 +111,12 @@ def elsa_word_to_csv(path) -> list[tuple]:
df = m_data[0] df = m_data[0]
# split df to rows # split df to rows
data = [ data = [
row row for row in df.itertuples(index=False, name=None) if row != tuples[doctype]
for row in df.itertuples(index=False, name=None)
if row != ("", "", "", "", "", "", "", "", "")
] ]
print(data)
return data, doctype return tuple_to_dict(data, doctype), doctype
if __name__ == "__main__": if __name__ == "__main__":
else_df = elsa_word_to_csv( else_df = elsa_word_to_csv("c:/Users/aky547/Desktop/hrsgw_test.docx")
"c:/Users/aky547/Desktop/semap/formularsemhrsg2023_Bestellung Sahrai_Hurrelmann et al.Referenzwerk Prävention.docx"
)
print(else_df) print(else_df)