Rework the ELSA file parser; add functions that convert parsed table rows into dicts
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import pandas as pd
|
||||
from docx import Document
|
||||
import re
|
||||
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||
|
||||
def word_docx_to_csv(path) -> pd.DataFrame:
|
||||
@@ -24,12 +25,72 @@ def word_docx_to_csv(path) -> pd.DataFrame:
|
||||
|
||||
df = m_data[2]
|
||||
return df
|
||||
def makeDict() -> dict:
    """Return a fresh record template with every field set to ``None``.

    A new dictionary is built on every call, so callers may fill in the
    returned record without affecting records created earlier or later.

    Returns:
        A dict with the keys ``work_author``, ``section_author``, ``year``,
        ``edition``, ``work_title``, ``chapter_title``, ``location``,
        ``publisher``, ``signature``, ``issue``, ``pages``, ``isbn`` and
        ``type``, all mapped to ``None``.
    """
    return {
        "work_author": None,
        "section_author": None,
        "year": None,
        "edition": None,
        "work_title": None,
        "chapter_title": None,
        "location": None,
        "publisher": None,
        "signature": None,
        "issue": None,
        "pages": None,
        "isbn": None,
        "type": None,
    }
|
||||
|
||||
# Column index of each record field inside a table row, keyed by the
# document type string. Fields that are not listed for a type keep the
# ``None`` default provided by ``makeDict``.
_FIELD_COLUMNS = {
    "Monographie": {
        "work_author": 0,
        "year": 1,
        "edition": 2,
        "work_title": 3,
        "location": 4,
        "publisher": 5,
        "signature": 6,
        "pages": 7,
    },
    "Herausgeberwerke": {
        "section_author": 0,
        "year": 1,
        "edition": 2,
        "chapter_title": 3,
        "work_author": 4,
        "work_title": 5,
        "location": 6,
        "publisher": 7,
        "pages": 8,
        "signature": 9,
    },
    "Zeitschriftenaufsätze": {
        "section_author": 0,
        "year": 1,
        "issue": 2,
        "chapter_title": 3,
        "work_title": 4,
        "location": 5,
        "publisher": 6,
        "pages": 7,
        "signature": 8,
    },
}


def tuple_to_dict(tlist: list[tuple], type: str) -> list[dict]:
    """Convert positional table rows into keyed record dictionaries.

    Args:
        tlist: Iterable of rows; each row must be indexable by position.
        type: Document type that selects the column layout. Recognised
            values are ``"Monographie"``, ``"Herausgeberwerke"`` and
            ``"Zeitschriftenaufsätze"``. (The name shadows the builtin but
            is kept because callers may pass it by keyword.)

    Returns:
        One dict per row, in input order, built from ``makeDict()``. For a
        recognised type the ``"type"`` key and the mapped fields are filled
        in; for any other type each record is returned with every value
        still ``None``.

    Raises:
        IndexError: If a row is shorter than the layout of ``type`` needs.
    """
    # Resolve the layout once; it is the same for every row.
    columns = _FIELD_COLUMNS.get(type)

    ret = []
    for line in tlist:
        data = makeDict()
        if columns is not None:
            data["type"] = type
            for field, index in columns.items():
                data[field] = line[index]
        ret.append(data)
    return ret
|
||||
|
||||
def elsa_word_to_csv(path) -> list[tuple]:
|
||||
def elsa_word_to_csv(path):
|
||||
doc = Document(path)
|
||||
# print all lines in doc
|
||||
doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
|
||||
tuples = {
|
||||
"Monographie": ("", "", "", "", "", "", "", "", ""),
|
||||
"Herausgeberwerke": ("", "", "", "", "", "", "", "", "", "", ""),
|
||||
"Zeitschriftenaufsätze": ("", "", "", "", "", "", "", "", "", ""),
|
||||
}
|
||||
tables = doc.tables
|
||||
|
||||
m_data = []
|
||||
@@ -50,16 +111,12 @@ def elsa_word_to_csv(path) -> list[tuple]:
|
||||
df = m_data[0]
|
||||
# split df to rows
|
||||
data = [
|
||||
row
|
||||
for row in df.itertuples(index=False, name=None)
|
||||
if row != ("", "", "", "", "", "", "", "", "")
|
||||
row for row in df.itertuples(index=False, name=None) if row != tuples[doctype]
|
||||
]
|
||||
|
||||
return data, doctype
|
||||
print(data)
|
||||
return tuple_to_dict(data, doctype), doctype
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual check: parse a local sample document and print what the parser
    # returns (the converted records together with the detected doctype).
    else_df = elsa_word_to_csv("c:/Users/aky547/Desktop/hrsgw_test.docx")
    print(else_df)
|
||||
Reference in New Issue
Block a user