main.py 1.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940
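"""Parse the PDF documents stored in the decision_affirmed directory.

For every PDF the filing date, address, email and telephone number are
extracted, and the results are written to a CSV file.
"""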
  3. import os
  4. from collections import defaultdict
  5. import pandas as pd
  6. import tika
  7. from tika import parser
  8. from parse_pdf_utils import (
  9. extract_filing_date,
  10. address_extraction,
  11. email_extraction,
  12. telephone_number_extraction,
  13. )
# NOTE(review): presumably a start-up hook for Tika; in recent tika-python
# releases initVM() looks like a backward-compatibility no-op -- confirm
# before relying on it or removing it.
tika.initVM()
  15. def main(PATH):
  16. """
  17. parses the required data from the pdfs
  18. """
  19. data_dict = defaultdict(list)
  20. required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
  21. for file in required_files:
  22. content = parser.from_file(file)["content"]
  23. data_dict["document_name"].append(file)
  24. data_dict["filing_date"].append(extract_filing_date(content))
  25. data_dict["address"].append(address_extraction(content))
  26. data_dict["email"].append(email_extraction(content))
  27. data_dict["telephone_number"].append(telephone_number_extraction(content))
  28. data = pd.DataFrame(data_dict)
  29. data.to_csv("required_data.csv")
  30. if __name__ == "__main__":
  31. HOME_DIR = os.path.expanduser("~")
  32. BASE_DIR = "Code/pdf_parser/server_documents/decision_affirmed/"
  33. PATH = os.path.join(HOME_DIR, BASE_DIR)
  34. main(PATH)