parse_expert_pdf.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. from collections import defaultdict
  2. import os
  3. import tika
  4. from tika import parser
  5. import pandas as pd
  6. from pandas import DataFrame
  7. from parse_expert_pdf_utils import (
  8. plaintiff_extraction,
  9. defendent_extraction,
  10. expert_name_extraction,
  11. case_number_extraction,
  12. patent_extraction,
  13. hourly_compensation,
  14. extract_email,
  15. )
# Initialise Tika once at import time, before any PDF is parsed.
# NOTE(review): in current tika-python releases initVM() appears to be a
# backward-compatibility no-op — confirm whether this call is still needed.
tika.initVM()
  17. def main(path: str) -> DataFrame:
  18. """
  19. The functions iterates through all the given files and gathers the data in the \
  20. form of a dataframe
  21. """
  22. data = defaultdict(list)
  23. required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
  24. for file in required_files:
  25. print(file)
  26. content = parser.from_file(file)["content"]
  27. data["file"].append(file)
  28. data["email"].append(content)
  29. data["full_name"].append(expert_name_extraction(content))
  30. data["hourly_pay"].append(hourly_compensation(content))
  31. data["plaintiff"].append(plaintiff_extraction(content))
  32. data["defendant"].append(defendent_extraction(content))
  33. data["case_number"].append(case_number_extraction(content))
  34. data["patents"].append(patent_extraction(content))
  35. data_expert = pd.DataFrame(data)
  36. data_expert.to_csv("required_data.csv")
  37. return data_expert
  38. if __name__ == "__main__":
  39. HOME_DIR = os.path.expanduser("~")
  40. BASE_DIR = "Code/pdf_parser/expert_report"
  41. PATH = os.path.join(HOME_DIR, BASE_DIR)
  42. print(main(PATH))