add valid letters to detect paragraphs

This commit is contained in:
WorldTeacher
2024-06-26 16:34:26 +02:00
parent ee229a41c5
commit 9b8e0621fe

View File

@@ -1,6 +1,6 @@
import pandas as pd import pandas as pd
from docx import Document from docx import Document
# ASCII letters: the uppercase alphabet followed by the lowercase alphabet.
letters = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
)
def word_docx_to_csv(path) -> pd.DataFrame: def word_docx_to_csv(path) -> pd.DataFrame:
doc = Document(path) doc = Document(path)
@@ -28,6 +28,8 @@ def word_docx_to_csv(path) -> pd.DataFrame:
def elsa_word_to_csv(path) -> list[tuple]: def elsa_word_to_csv(path) -> list[tuple]:
doc = Document(path) doc = Document(path)
# the text of the last non-empty paragraph is taken as the document type
doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
tables = doc.tables tables = doc.tables
m_data = [] m_data = []
@@ -53,4 +55,11 @@ def elsa_word_to_csv(path) -> list[tuple]:
if row != ("", "", "", "", "", "", "", "", "") if row != ("", "", "", "", "", "", "", "", "")
] ]
return data return data, doctype
if __name__ == "__main__":
    # Ad-hoc manual run: parse one local sample document and print the result.
    sample_path = "c:/Users/aky547/Desktop/semap/formularsemhrsg2023_Bestellung Sahrai_Hurrelmann et al.Referenzwerk Prävention.docx"
    parsed = elsa_word_to_csv(sample_path)
    print(parsed)