add elsa word parser

This commit is contained in:
WorldTeacher
2024-05-24 10:07:44 +02:00
parent c3ef7aedce
commit f204ed2b30

View File

@@ -1,26 +1,56 @@
import pandas as pd import pandas as pd
from docx import Document from docx import Document
def word_docx_to_csv(path, table_index: int = 2) -> pd.DataFrame:
    """Return one table of a Word document as a DataFrame.

    The first row of the selected table becomes the column labels and is
    removed from the data. Line breaks inside cell text are stripped.

    Args:
        path: Location of the document; handed unchanged to ``Document``.
        table_index: Position of the table to extract among the document's
            tables. Defaults to 2 (the third table), which is the table this
            function has always returned.

    Returns:
        The body rows of the selected table, labelled by its first row.
        A table without any rows yields an empty DataFrame.

    Raises:
        IndexError: If the document has no table at ``table_index``.
    """
    tables = Document(path).tables
    try:
        table = tables[table_index]
    except IndexError as err:
        # Re-raise the same exception type with a message that says what is
        # actually wrong instead of a bare "list index out of range".
        raise IndexError(
            f"document has {len(tables)} table(s); "
            f"no table at index {table_index}"
        ) from err

    # Only the requested table is converted; the other tables are never
    # returned, so parsing them is wasted work.
    rows = [
        [cell.text.replace("\n", "") for cell in row.cells]
        for row in table.rows
    ]
    df = pd.DataFrame(rows)
    if not rows:
        # No header row to promote; ``df.iloc[0]`` would raise here.
        return df

    df.columns = df.iloc[0]
    return df.iloc[1:]
def elsa_word_to_csv(path, table_index: int = 0) -> list[tuple]:
    """Return the non-blank body rows of one table of a Word document.

    Line breaks and EN SPACE characters (U+2002) are removed from every
    cell. The first row of the table is treated as a header and skipped,
    and rows whose cells are all empty strings are dropped.

    Args:
        path: Location of the document; handed unchanged to ``Document``.
        table_index: Position of the table to extract among the document's
            tables. Defaults to 0 (the first table), which is the table this
            function has always returned.

    Returns:
        One tuple of cell values per remaining row, in document order.
        A table without any rows yields an empty list.

    Raises:
        IndexError: If the document has no table at ``table_index``.
    """
    tables = Document(path).tables
    try:
        table = tables[table_index]
    except IndexError as err:
        # Re-raise the same exception type with a message that says what is
        # actually wrong instead of a bare "list index out of range".
        raise IndexError(
            f"document has {len(tables)} table(s); "
            f"no table at index {table_index}"
        ) from err

    rows = [
        [
            cell.text.replace("\n", "").replace("\u2002", "")
            for cell in row.cells
        ]
        for row in table.rows
    ]

    # The DataFrame round trip is kept so that rows of unequal length are
    # padded exactly as before. The header row is simply sliced off:
    # ``itertuples(name=None)`` ignores column labels, so promoting it to
    # labels first had no effect on the result.
    body = pd.DataFrame(rows).iloc[1:]

    # Drop blank rows whatever the table width. For a nine-column table this
    # is the same test as comparing against a tuple of nine empty strings.
    return [
        row
        for row in body.itertuples(index=False, name=None)
        if any(cell != "" for cell in row)
    ]