# pdf_parser.py (1.1 KB)

  1. import os
  2. from collections import defaultdict
  3. from utils import (
  4. extract_filing_date,
  5. address_extraction,
  6. refer_exteraction,
  7. email_extraction,
  8. telephone_number_extraction,
  9. )
  10. import pandas as pd
  11. import tika
  12. from tika import parser
# Runs once at import time, before any PDF is parsed.
# NOTE(review): presumably prepares the Tika runtime for parser.from_file();
# confirm against the installed tika version whether this call is still needed.
tika.initVM()
  14. def main(PATH):
  15. """
  16. parses the required data from the pdfs
  17. """
  18. data_dict = defaultdict(list)
  19. required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
  20. for idx, file in enumerate(required_files):
  21. content = parser.from_file(file)["content"].strip().replace("\n", "")
  22. data_dict["filing"].append(extract_filing_date(content))
  23. data_dict["address"].append(address_extraction(content))
  24. data_dict["refer"].append(refer_exteraction(content))
  25. data_dict["email"].append(email_extraction(content))
  26. data_dict["telephone_number"].append(telephone_number_extraction(content))
  27. data = pd.DataFrame(data_dict)
  28. data.to_csv("required_data.csv")
  29. if __name__ == "__main__":
  30. HOME_DIR = os.path.expanduser("~")
  31. BASE_DIR = "Code/pdf_parser/pdfs"
  32. PATH = os.path.join(HOME_DIR, BASE_DIR)
  33. main(PATH)