pdf_parser.py 912 B

1234567891011121314151617181920212223242526272829303132333435
  1. import os
  2. from utils import (
  3. extract_filing_date,
  4. address_extraction,
  5. refer_exteraction,
  6. email_extraction,
  7. telephone_number_extraction,
  8. )
  9. import tika
  10. from tika import parser
  11. tika.initVM()
  12. def main(PATH):
  13. """
  14. parses the required data from the pdfs
  15. """
  16. required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
  17. for idx, file in enumerate(required_files):
  18. print(idx, file)
  19. content = parser.from_file(file)["content"].strip().replace("\n", "")
  20. print(extract_filing_date(content))
  21. print(address_extraction(content))
  22. print(refer_exteraction(content))
  23. print(email_extraction(content))
  24. print((telephone_number_extraction(content)))
  25. if __name__ == "__main__":
  26. HOME_DIR = os.path.expanduser("~")
  27. BASE_DIR = "Code/pdf_parser/pdfs"
  28. PATH = os.path.join(HOME_DIR, BASE_DIR)
  29. main(PATH)