rework elsa file parser, add function to add dicts
This commit is contained in:
@@ -1,5 +1,6 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
import re
|
||||||
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||||
|
|
||||||
def word_docx_to_csv(path) -> pd.DataFrame:
|
def word_docx_to_csv(path) -> pd.DataFrame:
|
||||||
@@ -24,12 +25,72 @@ def word_docx_to_csv(path) -> pd.DataFrame:
|
|||||||
|
|
||||||
df = m_data[2]
|
df = m_data[2]
|
||||||
return df
|
return df
|
||||||
|
def makeDict():
    """Return a new, blank bibliographic record.

    The record contains every field name used by the parser, each initialised
    to ``None``. A fresh dict is built on every call, so callers may mutate
    the result freely.
    """
    # Key order is significant for anything that relies on dict ordering
    # downstream, so it is kept exactly as listed here.
    field_names = (
        "work_author",
        "section_author",
        "year",
        "edition",
        "work_title",
        "chapter_title",
        "location",
        "publisher",
        "signature",
        "issue",
        "pages",
        "isbn",
        "type",
    )
    return dict.fromkeys(field_names)
|
||||||
|
|
||||||
|
def tuple_to_dict(tlist: list[tuple], type: str) -> list[dict]:
    """Convert parsed table rows into one record dict per row.

    Each record starts as the blank template returned by ``makeDict()``; the
    positional cells of the row are then copied to named keys according to
    the column layout of the given document type.

    Args:
        tlist: Iterable of row tuples. Each row must be long enough for the
            layout selected by ``type`` (indices up to 7, 9 or 8 are read for
            the three known types respectively).
        type: Document type, one of ``"Monographie"``, ``"Herausgeberwerke"``
            or ``"Zeitschriftenaufsätze"``. The name shadows the builtin but
            is kept so existing keyword callers keep working.

    Returns:
        A list with one dict per row, in input order. For an unrecognised
        ``type`` the rows are not read and every dict is the untouched
        template (all values ``None``, including ``"type"``).

    Raises:
        IndexError: If a row is shorter than the highest column index read
            for ``type``.
    """
    # Column layout per document type: (record key, cell index in the row).
    # Note that "pages" precedes "signature" in the last two layouts, unlike
    # in "Monographie".
    layouts = {
        "Monographie": (
            ("work_author", 0),
            ("year", 1),
            ("edition", 2),
            ("work_title", 3),
            ("location", 4),
            ("publisher", 5),
            ("signature", 6),
            ("pages", 7),
        ),
        "Herausgeberwerke": (
            ("section_author", 0),
            ("year", 1),
            ("edition", 2),
            ("chapter_title", 3),
            ("work_author", 4),
            ("work_title", 5),
            ("location", 6),
            ("publisher", 7),
            ("signature", 9),
            ("pages", 8),
        ),
        "Zeitschriftenaufsätze": (
            ("section_author", 0),
            ("year", 1),
            ("issue", 2),
            ("chapter_title", 3),
            ("work_title", 4),
            ("location", 5),
            ("publisher", 6),
            ("signature", 8),
            ("pages", 7),
        ),
    }
    layout = layouts.get(type)

    ret = []
    for line in tlist:
        data = makeDict()
        # Unknown types deliberately fall through and yield the blank
        # template, matching the previous if/elif behaviour.
        if layout is not None:
            data["type"] = type
            for key, index in layout:
                data[key] = line[index]
        ret.append(data)
    return ret
|
||||||
|
|
||||||
def elsa_word_to_csv(path) -> list[tuple]:
|
def elsa_word_to_csv(path):
|
||||||
doc = Document(path)
|
doc = Document(path)
|
||||||
# print all lines in doc
|
# print all lines in doc
|
||||||
doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
|
doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
|
||||||
|
tuples = {
|
||||||
|
"Monographie": ("", "", "", "", "", "", "", "", ""),
|
||||||
|
"Herausgeberwerke": ("", "", "", "", "", "", "", "", "", "", ""),
|
||||||
|
"Zeitschriftenaufsätze": ("", "", "", "", "", "", "", "", "", ""),
|
||||||
|
}
|
||||||
tables = doc.tables
|
tables = doc.tables
|
||||||
|
|
||||||
m_data = []
|
m_data = []
|
||||||
@@ -50,16 +111,12 @@ def elsa_word_to_csv(path) -> list[tuple]:
|
|||||||
df = m_data[0]
|
df = m_data[0]
|
||||||
# split df to rows
|
# split df to rows
|
||||||
data = [
|
data = [
|
||||||
row
|
row for row in df.itertuples(index=False, name=None) if row != tuples[doctype]
|
||||||
for row in df.itertuples(index=False, name=None)
|
|
||||||
if row != ("", "", "", "", "", "", "", "", "")
|
|
||||||
]
|
]
|
||||||
|
print(data)
|
||||||
return data, doctype
|
return tuple_to_dict(data, doctype), doctype
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
else_df = elsa_word_to_csv(
|
else_df = elsa_word_to_csv("c:/Users/aky547/Desktop/hrsgw_test.docx")
|
||||||
"c:/Users/aky547/Desktop/semap/formularsemhrsg2023_Bestellung Sahrai_Hurrelmann et al.Referenzwerk Prävention.docx"
|
|
||||||
)
|
|
||||||
print(else_df)
|
print(else_df)
|
||||||
Reference in New Issue
Block a user