From 9b8e0621fe89ef8f0a730d9095ea2730a0eb9413 Mon Sep 17 00:00:00 2001
From: WorldTeacher <41587052+WorldTeacher@users.noreply.github.com>
Date: Wed, 26 Jun 2024 16:34:26 +0200
Subject: [PATCH] add valid letters to detect paragraphs

---
Review notes (commentary only; this area is ignored when the patch is applied):
* Line breaks, blank context lines and indentation were reconstructed from
  the hunk line counts; verify against the original commit before applying.
* `letters` is added at module level but is not referenced in any hunk shown.
* `elsa_word_to_csv` now returns `data, doctype` (a 2-tuple) while its
  annotation still reads `-> list[tuple]`; existing callers that expect a
  list need to unpack the result.
* `doctype` takes the last non-empty paragraph via `[-1]`, which raises
  IndexError for a document without any non-empty paragraph.
* The `__main__` block hard-codes an absolute path on a local desktop.

 src/logic/wordparser.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/logic/wordparser.py b/src/logic/wordparser.py
index 6f3131c..99fba0c 100644
--- a/src/logic/wordparser.py
+++ b/src/logic/wordparser.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from docx import Document
-
+letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
 
 def word_docx_to_csv(path) -> pd.DataFrame:
     doc = Document(path)
@@ -28,6 +28,8 @@ def word_docx_to_csv(path) -> pd.DataFrame:
 
 def elsa_word_to_csv(path) -> list[tuple]:
     doc = Document(path)
+    # print all lines in doc
+    doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
     tables = doc.tables
 
     m_data = []
@@ -53,4 +55,11 @@ def elsa_word_to_csv(path) -> list[tuple]:
         if row != ("", "", "", "", "", "", "", "", "")
     ]
 
-    return data
+    return data, doctype
+
+
+if __name__ == "__main__":
+    else_df = elsa_word_to_csv(
+        "c:/Users/aky547/Desktop/semap/formularsemhrsg2023_Bestellung Sahrai_Hurrelmann et al.Referenzwerk Prävention.docx"
+    )
+    print(else_df)
\ No newline at end of file