main.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. """This module is used to parse PDF for expert reports.
  2. """
  3. from collections import defaultdict
  4. import os
  5. import tika
  6. from tika import parser
  7. import pandas as pd
  8. from pandas import DataFrame
  9. from utils import (
  10. plaintiff_extraction,
  11. defendent_extraction,
  12. case_number_extraction,
  13. patent_extraction,
  14. email_extraction,
  15. address_extraction,
  16. )
# Call the tika package's VM initialisation hook once, at import time, so it
# has run before main() issues any parser.from_file() request.
tika.initVM()
  18. def main(path: str) -> DataFrame:
  19. """
  20. The functions iterates through all the given files and gathers the data in the \
  21. form of a dataframe
  22. """
  23. data = defaultdict(list)
  24. required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
  25. for file in required_files:
  26. print(file)
  27. content = parser.from_file(file)["content"]
  28. data["file"].append(file)
  29. data["email"].append(email_extraction(content))
  30. data["plaintiff"].append(plaintiff_extraction(content))
  31. data["defendant"].append(defendent_extraction(content))
  32. data["case_number"].append(case_number_extraction(content))
  33. data["patents"].append(patent_extraction(content))
  34. data["address"].append(address_extraction(content))
  35. data_expert = pd.DataFrame(data)
  36. data_expert.to_csv("required_data.csv")
  37. return data_expert
  38. if __name__ == "__main__":
  39. HOME_DIR = os.path.expanduser("~")
  40. BASE_DIR = "Code/pdf_parser/server_documents/preliminary_response"
  41. PATH = os.path.join(HOME_DIR, BASE_DIR)
  42. print(main(PATH))