# parse_expert_pdf.py (1.4 KB)

  1. from collections import defaultdict
  2. import os
  3. import tika
  4. from tika import parser
  5. import pandas as pd
  6. from parse_expert_pdf_utils import (
  7. plaintiff_extraction,
  8. defendent_extraction,
  9. expert_name_extraction,
  10. case_number_extraction,
  11. patent_extraction,
  12. hourly_compensation,
  13. )
  14. tika.initVM()
  15. def main(path: str) -> pd.DataFrame:
  16. """
  17. The functions iterates through all the given files and gathers the data in the \
  18. form of a dataframe
  19. """
  20. data = defaultdict(list)
  21. required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
  22. for file in required_files:
  23. print(file)
  24. content = parser.from_file(file)["content"]
  25. data["file"].append(file)
  26. data["full_name"].append(expert_name_extraction(content))
  27. data["hourly_pay"].append(hourly_compensation(content))
  28. data["plaintiff"].append(plaintiff_extraction(content))
  29. data["defendant"].append(defendent_extraction(content))
  30. data["case_number"].append(case_number_extraction(content))
  31. data["patents"].append(patent_extraction(content))
  32. data_expert = pd.DataFrame(data)
  33. data_expert.to_csv("required_data.csv")
  34. return data_expert
  35. if __name__ == "__main__":
  36. HOME_DIR = os.path.expanduser("~")
  37. BASE_DIR = "Code/pdf_parser/expert_report"
  38. PATH = os.path.join(HOME_DIR, BASE_DIR)
  39. print(main(PATH))