parse_pdf.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546
  1. from collections import defaultdict
  2. from IPython.display import display
  3. import os
  4. import tika
  5. import pandas as pd
  6. tika.initVM()
  7. from tika import parser
  8. from parse_pdf_utils import (
  9. extract_acronyms,
  10. extract_case_number,
  11. extract_defendent,
  12. extract_filing_date,
  13. extract_firm_name,
  14. extract_patent_number,
  15. extract_plaintiff,
  16. )
  17. def extract_all(PATH):
  18. """
  19. Returns all the required data from the pdfs in a dataframe format.
  20. """
  21. data_dict = defaultdict(list)
  22. required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
  23. for idx, file in enumerate(required_files):
  24. print(idx, file)
  25. parse_file = parser.from_file(file)["content"].strip().replace("\n", "")
  26. # data_dict["case_number_list"].append(extract_case_number(content=parse_file))
  27. # data_dict["filing_date_list"].append(extract_filing_date(content=parse_file))
  28. data_dict["plaintiff_list"].append(extract_plaintiff(content=parse_file))
  29. # data_dict["defendent_list"].append(extract_defendent(content=parse_file))
  30. # data_dict["acronyms_list"].append(extract_acronyms(content=parse_file))
  31. # data_dict["firm_name_list"].append(extract_firm_name(content=parse_file))
  32. # data_dict["patent_list"].append(extract_patent_number(content=parse_file))
  33. data = pd.DataFrame(data_dict)
  34. # data.to_csv("required_data.csv", index=False)
  35. return data
  36. if __name__ == "__main__":
  37. BASE_DIR = "Code/pdf_parser/complaints"
  38. HOME_DIR = os.path.expanduser("~")
  39. PATH = os.path.join(HOME_DIR, BASE_DIR)
  40. display(extract_all(PATH))