# parse_expert_pdf.py

  1. from parse_expert_pdf_utils import (
  2. date_extraction,
  3. address_extraction,
  4. refer_exteraction,
  5. )
  6. import tika
  7. import os
  8. from tika import parser
  9. tika.initVM()
  10. import warnings
  11. warnings.filterwarnings("ignore")
  12. def main(path):
  13. required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
  14. for idx, file in enumerate(required_files):
  15. print(idx, file)
  16. print(file)
  17. content = parser.from_file(file)["content"].strip().replace("\n", "")
  18. # content = parsed_pdf["content"].strip().replace("\n", "")
  19. # print(content)
  20. date = date_extraction(content)
  21. print(date)
  22. address = address_extraction(content)
  23. print(address)
  24. refer = refer_exteraction(content)
  25. print(refer)
  26. if __name__ == "__main__":
  27. HOME_DIR = os.path.expanduser("~")
  28. BASE_DIR = "Code/pdf_parser/pdfs"
  29. path = os.path.join(HOME_DIR, BASE_DIR)
  30. main(path)