# parse_expert_pdf.py

  1. from parse_expert_pdf_utils import (
  2. case_number_extraction,
  3. expert_name_extraction,
  4. plaintiff_extraction,
  5. defendent_extraction,
  6. patent_extraction,
  7. on_behalf_of_extraction,
  8. acronym_extraction,
  9. )
  10. import tika
  11. from tika import parser
  12. import re
  13. import warnings
  14. warnings.filterwarnings('ignore')
  15. def main():
  16. tika.initVM()
  17. path = "/home/omkardesai/Code/pdf_parser/pdfs/expert_report1.pdf"
  18. parsed_pdf = parser.from_file(path)
  19. print(parsed_pdf.keys())
  20. content = parsed_pdf['content'].strip().replace('\n', '')
  21. # print(content)
  22. # case_number = case_number_extraction(content)
  23. # print(case_number)
  24. # expert_name = expert_name_extraction(content)
  25. # print(expert_name)
  26. # plaintiff = plaintiff_extraction(content)
  27. # print(plaintiff)
  28. # defendent = defendent_extraction(content)
  29. # print(defendent)
  30. # patent = patent_extraction(content)
  31. # print(patent)
  32. # on_behalf_of = on_behalf_of_extraction(content)
  33. # print(on_behalf_of)
  34. acronym = acronym_extraction(content)
  35. print(acronym)
  36. if __name__ == "__main__":
  37. main()