add elsa word parser
This commit is contained in:
@@ -1,26 +1,56 @@
|
||||
import pandas as pd
|
||||
from docx import Document
|
||||
|
||||
|
||||
def word_docx_to_csv(path) -> pd.DataFrame:
|
||||
doc = Document(path)
|
||||
tables = doc.tables
|
||||
|
||||
m_data = []
|
||||
for table in tables:
|
||||
data = []
|
||||
for row in table.rows:
|
||||
row_data = []
|
||||
for cell in row.cells:
|
||||
text = cell.text
|
||||
text = text.replace("\n", "")
|
||||
row_data.append(text)
|
||||
data.append(row_data)
|
||||
df = pd.DataFrame(data)
|
||||
df.columns = df.iloc[0]
|
||||
df = df.iloc[1:]
|
||||
|
||||
m_data.append(df)
|
||||
|
||||
df = m_data[2]
|
||||
return df
|
||||
import pandas as pd
|
||||
from docx import Document
|
||||
|
||||
|
||||
def word_docx_to_csv(path) -> pd.DataFrame:
|
||||
doc = Document(path)
|
||||
tables = doc.tables
|
||||
|
||||
m_data = []
|
||||
for table in tables:
|
||||
data = []
|
||||
for row in table.rows:
|
||||
row_data = []
|
||||
for cell in row.cells:
|
||||
text = cell.text
|
||||
text = text.replace("\n", "")
|
||||
row_data.append(text)
|
||||
data.append(row_data)
|
||||
df = pd.DataFrame(data)
|
||||
df.columns = df.iloc[0]
|
||||
df = df.iloc[1:]
|
||||
|
||||
m_data.append(df)
|
||||
|
||||
df = m_data[2]
|
||||
return df
|
||||
|
||||
|
||||
def elsa_word_to_csv(path) -> list[tuple]:
|
||||
doc = Document(path)
|
||||
tables = doc.tables
|
||||
|
||||
m_data = []
|
||||
for table in tables:
|
||||
data = []
|
||||
for row in table.rows:
|
||||
row_data = []
|
||||
for cell in row.cells:
|
||||
text = cell.text
|
||||
text = text.replace("\n", "")
|
||||
text = text.replace("\u2002", "")
|
||||
row_data.append(text)
|
||||
data.append(row_data)
|
||||
df = pd.DataFrame(data)
|
||||
df.columns = df.iloc[0]
|
||||
df = df.iloc[1:]
|
||||
m_data.append(df)
|
||||
df = m_data[0]
|
||||
# split df to rows
|
||||
data = [
|
||||
row
|
||||
for row in df.itertuples(index=False, name=None)
|
||||
if row != ("", "", "", "", "", "", "", "", "")
|
||||
]
|
||||
|
||||
return data
|
||||
|
||||
Reference in New Issue
Block a user