parse_expert_pdf.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. from parse_expert_pdf_utils import (
  2. case_number_extraction,
  3. expert_name_extraction,
  4. plaintiff_extraction,
  5. defendent_extraction,
  6. patent_extraction,
  7. on_behalf_of_extraction,
  8. acronym_extraction,
  9. )
  10. import tika
  11. from tika import parser
  12. tika.initVM()
  13. import os
  14. def main(PATH):
  15. required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
  16. for file in required_files:
  17. content = parser.from_file(file)["content"].strip().replace("\n", "")
  18. # case_number = case_number_extraction(content)
  19. # print(case_number)
  20. # expert_name = expert_name_extraction(content)
  21. # print(expert_name)
  22. # plaintiff = plaintiff_extraction(content)
  23. # print(plaintiff)
  24. # defendent = defendent_extraction(content)
  25. # print(defendent)
  26. # patent = patent_extraction(content)
  27. # print(patent)
  28. # on_behalf_of = on_behalf_of_extraction(content)
  29. # print(on_behalf_of)
  30. acronym = acronym_extraction(content)
  31. print(acronym)
  32. if __name__ == "__main__":
  33. HOME_DIR = os.path.expanduser("~")
  34. BASE_DIR = "Code/pdf_parser/expert_report"
  35. PATH = os.path.join(HOME_DIR, BASE_DIR)
  36. main(PATH)