rest of files, not sorted

This commit is contained in:
WorldTeacher
2024-05-17 08:35:37 +02:00
parent 7a0f7ed1f1
commit d7853ab67d
82 changed files with 10724 additions and 2309 deletions

View File

@@ -12,19 +12,15 @@ def pdf_to_csv(path: str) -> pd.DataFrame:
"""
file = PDFQuery(path)
file.load()
#get the text from the pdf file
text_elems = file.extract([
('with_formatter', 'text'),
('all_text', '*')
])
extracted_text = text_elems['all_text']
# get the text from the pdf file
text_elems = file.extract([("with_formatter", "text"), ("all_text", "*")])
extracted_text = text_elems["all_text"]
return extracted_text
if __name__ == "__main__":
    # Manual smoke run: pull the text out of the sample PDF and print it
    # as one continuous line by dropping every newline character.
    raw_text = pdf_to_csv("54_pdf.pdf")
    print(raw_text.replace("\n", ""))