add valid letters to detect paragraphs
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
||||||
|
|
||||||
def word_docx_to_csv(path) -> pd.DataFrame:
|
def word_docx_to_csv(path) -> pd.DataFrame:
|
||||||
doc = Document(path)
|
doc = Document(path)
|
||||||
@@ -28,6 +28,8 @@ def word_docx_to_csv(path) -> pd.DataFrame:
|
|||||||
|
|
||||||
def elsa_word_to_csv(path) -> list[tuple]:
|
def elsa_word_to_csv(path) -> list[tuple]:
|
||||||
doc = Document(path)
|
doc = Document(path)
|
||||||
|
# doctype is the text of the last non-empty paragraph in the doc
|
||||||
|
doctype = [para.text for para in doc.paragraphs if para.text != ""][-1]
|
||||||
tables = doc.tables
|
tables = doc.tables
|
||||||
|
|
||||||
m_data = []
|
m_data = []
|
||||||
@@ -53,4 +55,11 @@ def elsa_word_to_csv(path) -> list[tuple]:
|
|||||||
if row != ("", "", "", "", "", "", "", "", "")
|
if row != ("", "", "", "", "", "", "", "", "")
|
||||||
]
|
]
|
||||||
|
|
||||||
return data
|
return data, doctype
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
else_df = elsa_word_to_csv(
|
||||||
|
"c:/Users/aky547/Desktop/semap/formularsemhrsg2023_Bestellung Sahrai_Hurrelmann et al.Referenzwerk Prävention.docx"
|
||||||
|
)
|
||||||
|
print(else_df)
|
||||||
Reference in New Issue
Block a user