25 lines
588 B
Python
25 lines
588 B
Python
# Add the dependency path to the system path
|
|
|
|
import pandas as pd
|
|
from pdfquery import PDFQuery
|
|
|
|
|
|
def pdf_to_csv(path: str) -> str:
    """
    Extract all text from a PDF file and return it as a single string.

    Despite the function's name, no CSV output or DataFrame is produced:
    the value returned is the ``"all_text"`` entry of the PDFQuery
    extraction result, i.e. the text matched by the ``*`` selector.

    Args:
        path: Location of the PDF file to read.

    Returns:
        The extracted text. It may still contain line breaks.
    """
    pdf = PDFQuery(path)
    try:
        pdf.load()
        # ("with_formatter", "text") asks PDFQuery to return the matches of
        # the following selectors as plain text; "*" selects every element.
        extracted = pdf.extract([("with_formatter", "text"), ("all_text", "*")])
    finally:
        # When given a path, PDFQuery opens the file itself and does not
        # close it, so release the handle here even if loading fails.
        # A caller-supplied file object is left open for the caller.
        if isinstance(path, str):
            pdf.file.close()
    return extracted["all_text"]
|
|
|
|
|
|
if __name__ == "__main__":
    # Read the sample PDF and flatten its text onto one line by dropping
    # every newline character, then write the result to stdout.
    text = pdf_to_csv("54_pdf.pdf").replace("\n", "")
    print(text)
|