# parse_expert_pdf.py (1.4 KB)
  1. from parse_expert_pdf_utils import (
  2. case_number_extraction,
  3. expert_name_extraction,
  4. plaintiff_extraction,
  5. defendent_extraction,
  6. patent_extraction,
  7. on_behalf_of_extraction,
  8. acronym_extraction,
  9. hourly_compensation,
  10. date_extraction,
  11. address_extraction,
  12. )
  13. import tika
  14. from tika import parser
  15. import re
  16. import warnings
  17. warnings.filterwarnings('ignore')
  18. def main():
  19. tika.initVM()
  20. path = "/home/ftech/Code/xc/pdf_parser/pdfs/2018008353_Mail_Decision.pdf" #2018008353_Mail_Decision.pdf"
  21. parsed_pdf = parser.from_file(path)
  22. print(parsed_pdf.keys())
  23. content = parsed_pdf['content'].strip().replace('\n', '')
  24. # print(content)
  25. # case_number = case_number_extraction(content)
  26. # print(case_number)
  27. # expert_name = expert_name_extraction(content)
  28. # print(expert_name)
  29. # plaintiff = plaintiff_extraction(content)
  30. # print(plaintiff)
  31. # defendent = defendent_extraction(content)
  32. # print(defendent)
  33. # patent = patent_extraction(content)
  34. # print(patent)
  35. # on_behalf_of = on_behalf_of_extraction(content)
  36. # print(on_behalf_of)
  37. # acronym = acronym_extraction(content)
  38. # print(acronym)
  39. # pay = hourly_compensation(content)
  40. # print(pay)
  41. date = date_extraction(content)
  42. print(date)
  43. address = address_extraction(content)
  44. print(address)
  45. if __name__ == "__main__":
  46. main()