chore: restructured project, updated readme

This commit is contained in:
2025-10-29 09:31:40 +01:00
parent a4460ec17b
commit ee62c65ae7
70 changed files with 8518 additions and 100 deletions

23
src/parsers/pdf_parser.py Normal file
View File

@@ -0,0 +1,23 @@
# third-party dependency: pdfquery is used to read text out of PDF files
from pdfquery import PDFQuery
def pdf_to_csv(path: str) -> str:
    """
    Extract all text from a PDF file and return it as a single string.

    Despite the function name, no CSV or dataframe is produced: the result is
    whatever pdfquery's "text" formatter yields for the "*" selector.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text of the document.

    Raises:
        OSError: If the file at ``path`` cannot be opened.
    """
    # Open the file here so the handle is closed deterministically once the
    # extraction is finished, rather than leaving it open inside PDFQuery.
    with open(path, "rb") as pdf_file:
        document = PDFQuery(pdf_file)
        document.load()
        # "with_formatter" applies the text formatter to the selector that
        # follows it; "*" selects every element of the parsed document.
        extracted = document.extract(
            [("with_formatter", "text"), ("all_text", "*")]
        )
    return extracted["all_text"]
if __name__ == "__main__":
    # Ad-hoc run against a sample PDF; linebreaks are stripped in the same
    # expression so the extracted text ends up on a single line.
    text = pdf_to_csv("54_pdf.pdf").replace("\n", "")
    # print(text)