parse_expert_pdf.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. from parse_expert_pdf_utils import (
  2. defendent_extraction,
  3. plaintiff_extraction,
  4. defendent_extraction,
  5. expert_name_extraction,
  6. case_number_extraction,
  7. )
  8. import tika
  9. import os
  10. from tika import parser
  11. import pandas as pd
  12. from collections import defaultdict
  13. tika.initVM()
  14. import warnings
  15. warnings.filterwarnings("ignore")
  16. def main(path: str) -> pd.DataFrame:
  17. """
  18. The functions iterates through all the given files and gathers the data in the \
  19. form of a dataframe
  20. """
  21. data = defaultdict(list)
  22. required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
  23. for file in required_files:
  24. print(file)
  25. content = parser.from_file(file)["content"]
  26. data["file"].append(file)
  27. data["full_name"].append(expert_name_extraction(content))
  28. data["plaintiff"].append(plaintiff_extraction(content))
  29. data["defendant"].append(defendent_extraction(content))
  30. data["case_number"].append(case_number_extraction(content))
  31. df = pd.DataFrame(data)
  32. return df
  33. if __name__ == "__main__":
  34. HOME_DIR = os.path.expanduser("~")
  35. BASE_DIR = "Code/pdf_parser/expert_report"
  36. PATH = os.path.join(HOME_DIR, BASE_DIR)
  37. print(main(PATH))