add valid letters to detect paragraphs
This commit is contained in:
@@ -1,6 +1,6 @@
|
||||
import pandas as pd
|
||||
from docx import Document
|
||||
|
||||
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||
|
||||
def word_docx_to_csv(path) -> pd.DataFrame:
|
||||
doc = Document(path)
|
||||
@@ -28,6 +28,8 @@ def word_docx_to_csv(path) -> pd.DataFrame:
|
||||
|
||||
def elsa_word_to_csv(path) -> list[tuple]:
|
||||
doc = Document(path)
|
||||
# take the text of the last non-empty paragraph as the document type
|
||||
doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
|
||||
tables = doc.tables
|
||||
|
||||
m_data = []
|
||||
@@ -53,4 +55,11 @@ def elsa_word_to_csv(path) -> list[tuple]:
|
||||
if row != ("", "", "", "", "", "", "", "", "")
|
||||
]
|
||||
|
||||
return data
|
||||
return data, doctype
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual run against one local sample .docx file.
    # elsa_word_to_csv returns the pair (data, doctype), so a tuple is printed.
    sample_path = "c:/Users/aky547/Desktop/semap/formularsemhrsg2023_Bestellung Sahrai_Hurrelmann et al.Referenzwerk Prävention.docx"
    parsed = elsa_word_to_csv(sample_path)
    print(parsed)
|
||||
Reference in New Issue
Block a user