From 115248098a6a3005a93af620939deb5ac878a80e Mon Sep 17 00:00:00 2001
From: WorldTeacher <41587052+WorldTeacher@users.noreply.github.com>
Date: Thu, 27 Jun 2024 13:18:19 +0200
Subject: [PATCH] rework elsa file parser, add function to add dicts

---
Review notes (text between "---" and the diffstat is not part of the commit):
- Line breaks were restored from a whitespace-collapsed copy of this patch.
  The position of blank context lines and the indentation of context lines
  are best guesses and may need adjusting before the hunks apply.
- Hunk headers and the diffstat were recounted for the revised additions;
  the blob ids on the "index" line still describe the original submission.
- tuple_to_dict: the parameter name "type" shadows the builtin; it is kept
  as submitted, consider renaming it before merge.

 src/logic/wordparser.py | 89 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 83 insertions(+), 6 deletions(-)

diff --git a/src/logic/wordparser.py b/src/logic/wordparser.py
index 99fba0c..4300a46 100644
--- a/src/logic/wordparser.py
+++ b/src/logic/wordparser.py
@@ -1,5 +1,6 @@
 import pandas as pd
 from docx import Document
+import re
 
 letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 def word_docx_to_csv(path) -> pd.DataFrame:
@@ -24,12 +25,89 @@ def word_docx_to_csv(path) -> pd.DataFrame:
     df = m_data[2]
     return df
 
+
+def makeDict():
+    """Return a fresh record with every bibliographic field set to None."""
+    return {
+        "work_author": None,
+        "section_author": None,
+        "year": None,
+        "edition": None,
+        "work_title": None,
+        "chapter_title": None,
+        "location": None,
+        "publisher": None,
+        "signature": None,
+        "issue": None,
+        "pages": None,
+        "isbn": None,
+        "type": None,
+    }
+
+
+# Column order of the table rows per document type: cell i of a row is
+# stored under the i-th field name; any further columns are ignored.
+_FIELD_ORDER = {
+    "Monographie": (
+        "work_author",
+        "year",
+        "edition",
+        "work_title",
+        "location",
+        "publisher",
+        "signature",
+        "pages",
+    ),
+    "Herausgeberwerke": (
+        "section_author",
+        "year",
+        "edition",
+        "chapter_title",
+        "work_author",
+        "work_title",
+        "location",
+        "publisher",
+        "pages",
+        "signature",
+    ),
+    "Zeitschriftenaufsätze": (
+        "section_author",
+        "year",
+        "issue",
+        "chapter_title",
+        "work_title",
+        "location",
+        "publisher",
+        "pages",
+        "signature",
+    ),
+}
+
+
+def tuple_to_dict(tlist: list[tuple], type: str) -> list[dict]:
+    """Convert table rows into one record per row, keyed by field name.
+
+    ``type`` names the document type and decides which column feeds which
+    field. Rows of an unrecognised type yield records whose fields are all
+    None; a row with fewer cells than its type expects raises IndexError.
+    """
+    fields = _FIELD_ORDER.get(type)
+    ret = []
+    for line in tlist:
+        data = makeDict()
+        if fields is not None:
+            data["type"] = type
+            for index, field in enumerate(fields):
+                data[field] = line[index]
+        ret.append(data)
+    return ret
 
 
-def elsa_word_to_csv(path) -> list[tuple]:
+def elsa_word_to_csv(path) -> tuple[list[dict], str]:
+    """Return (records, doctype) parsed from the Word file at ``path``."""
     doc = Document(path)
     # print all lines in doc
     doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
     tables = doc.tables
 
     m_data = []
@@ -50,16 +128,15 @@ def elsa_word_to_csv(path) -> list[tuple]:
     df = m_data[0]
     # split df to rows
     data = [
         row
         for row in df.itertuples(index=False, name=None)
-        if row != ("", "", "", "", "", "", "", "", "")
+        # Keep only rows with at least one non-empty cell, whatever the width.
+        if any(cell != "" for cell in row)
     ]
 
-    return data, doctype
+    return tuple_to_dict(data, doctype), doctype
 
 
 if __name__ == "__main__":
-    else_df = elsa_word_to_csv(
-        "c:/Users/aky547/Desktop/semap/formularsemhrsg2023_Bestellung Sahrai_Hurrelmann et al.Referenzwerk Prävention.docx"
-    )
+    else_df = elsa_word_to_csv("c:/Users/aky547/Desktop/hrsgw_test.docx")
     print(else_df)
\ No newline at end of file