chore: restructured project, updated readme

2025-10-29 09:31:40 +01:00
parent a4460ec17b
commit ee62c65ae7
70 changed files with 8518 additions and 100 deletions
--- a/src/parsers/pdf_parser.py
+++ b/src/parsers/pdf_parser.py
@@ -0,0 +1,23 @@
+# PDF text extraction built on pdfquery; this module does not modify sys.path.
+
+from pdfquery import PDFQuery
+
+
+def pdf_to_csv(path: str) -> str:
+    """Return the text pdfquery extracts for the ``*`` selector of the PDF at *path*.
+
+    Despite the name, no CSV or DataFrame is built; the result is the extracted text.
+    """
+    # Open the file here so the handle is closed on exit, not left to PDFQuery.
+    with open(path, "rb") as pdf_file:
+        document = PDFQuery(pdf_file)
+        document.load()
+        fields = document.extract([("with_formatter", "text"), ("all_text", "*")])
+    return fields["all_text"]
+
+
+if __name__ == "__main__":
+    # Ad-hoc run against a local sample file.
+    raw_text = pdf_to_csv("54_pdf.pdf")
+    # Drop every linebreak; the flattened text is neither printed nor saved.
+    flattened = "".join(raw_text.split("\n"))