"""Helpers that read tables out of Word (.docx) documents via ``docx``."""

import pandas as pd
from docx import Document


def _select_table(path, index: int):
    """Open the document at *path* and return its table at position *index*.

    Raises:
        IndexError: if the document contains fewer than ``index + 1`` tables.
    """
    tables = Document(path).tables
    if index >= len(tables):
        raise IndexError(
            f"expected at least {index + 1} table(s) in {path!r}, "
            f"found {len(tables)}"
        )
    return tables[index]


def _table_to_frame(table, strip_chars: str) -> pd.DataFrame:
    """Convert one document table into a DataFrame headed by its first row.

    Every character listed in *strip_chars* is deleted from each cell's text.
    The first table row becomes the column labels and is removed from the
    data, so the remaining rows keep their original positional index
    (starting at 1). A table without any rows yields an empty DataFrame.
    """
    # One translate pass deletes all unwanted characters at once.
    delete_map = str.maketrans("", "", strip_chars)
    rows = [
        [cell.text.translate(delete_map) for cell in row.cells]
        for row in table.rows
    ]
    if not rows:
        # No header row to promote; avoid indexing into an empty frame.
        return pd.DataFrame()

    frame = pd.DataFrame(rows)
    frame.columns = frame.iloc[0]
    return frame.iloc[1:]


def word_docx_to_csv(path) -> pd.DataFrame:
    """Return the third table of the Word document at *path* as a DataFrame.

    Newlines are removed from every cell. The table's first row is used as
    the column header.

    Raises:
        IndexError: if the document contains fewer than three tables.
    """
    return _table_to_frame(_select_table(path, 2), "\n")


def elsa_word_to_csv(path) -> list[tuple]:
    """Return the data rows of the first table of the document at *path*.

    Newlines and EN SPACE (U+2002) characters are removed from every cell.
    The table's first row is treated as a header and is not returned. Rows
    whose cells are all empty strings are skipped, whatever the column count.

    Returns:
        One plain tuple of cell strings per remaining table row.

    Raises:
        IndexError: if the document contains no tables.
    """
    frame = _table_to_frame(_select_table(path, 0), "\n\u2002")
    return [
        row
        for row in frame.itertuples(index=False, name=None)
        # Keep a row as soon as any cell holds something other than "".
        if any(cell != "" for cell in row)
    ]