add elsa word parser

2024-05-24 10:07:44 +02:00
parent c3ef7aedce
commit f204ed2b30
1 changed files with 56 additions and 26 deletions
--- a/src/logic/wordparser.py
+++ b/src/logic/wordparser.py
@@ -1,26 +1,56 @@
-import pandas as pd
+import pandas as pd
-from docx import Document
+from docx import Document
-
+
-
+
-def word_docx_to_csv(path) -> pd.DataFrame:
+def word_docx_to_csv(path) -> pd.DataFrame:
-    doc = Document(path)
+    doc = Document(path)
-    tables = doc.tables
+    tables = doc.tables
-
+
-    m_data = []
+    m_data = []
-    for table in tables:
+    for table in tables:
-        data = []
+        data = []
-        for row in table.rows:
+        for row in table.rows:
-            row_data = []
+            row_data = []
-            for cell in row.cells:
+            for cell in row.cells:
-                text = cell.text
+                text = cell.text
-                text = text.replace("\n", "")
+                text = text.replace("\n", "")
-                row_data.append(text)
+                row_data.append(text)
-            data.append(row_data)
+            data.append(row_data)
-        df = pd.DataFrame(data)
+        df = pd.DataFrame(data)
-        df.columns = df.iloc[0]
+        df.columns = df.iloc[0]
-        df = df.iloc[1:]
+        df = df.iloc[1:]
-
+
-        m_data.append(df)
+        m_data.append(df)
-
+
-    df = m_data[2]
+    df = m_data[2]
-    return df
+    return df
 def elsa_word_to_csv(path) -> list[tuple]:
    """Return the non-empty data rows of the first table in a Word document.

    The document is opened with ``docx.Document(path)``. Only the first
    table is read; its first row is treated as the header and dropped.
    Newlines and EN SPACE characters (U+2002) are removed from every cell.

    Args:
        path: Passed unchanged to ``docx.Document``.

    Returns:
        One tuple of cell texts per table row after the header, in document
        order. Rows whose cells are all empty strings are omitted. A table
        that contains only a header row (or no rows) yields an empty list.

    Raises:
        IndexError: If the document contains no tables.
    """
    tables = Document(path).tables
    if not tables:
        raise IndexError("document contains no tables")

    # Only the first table is returned, so the remaining tables are not
    # parsed at all; a malformed later table can no longer break this call.
    cleaned_rows = []
    for row in tables[0].rows:
        cleaned_rows.append(
            [
                cell.text.replace("\n", "").replace("\u2002", "")
                for cell in row.cells
            ]
        )

    # Build the frame once, after all rows are collected. pandas pads rows
    # of unequal length with None, which keeps every tuple the same width.
    # The first row is the header and is not part of the returned data.
    body = pd.DataFrame(cleaned_rows).iloc[1:]

    # Drop rows in which every cell is empty, whatever the column count.
    return [
        row
        for row in body.itertuples(index=False, name=None)
        if any(cell != "" for cell in row)
    ]