# parse_expert_pdf.py (1.1 KB)
  1. from parse_expert_pdf_utils import (
  2. case_number_extraction,
  3. expert_name_extraction,
  4. plaintiff_extraction,
  5. defendent_extraction,
  6. patent_extraction,
  7. on_behalf_of_extraction,
  8. acronym_extraction,
  9. hourly_compensation,
  10. date_extraction,
  11. address_extraction,
  12. refer_exteraction,
  13. )
  14. import tika
  15. import os
  16. from tika import parser
  17. tika.initVM()
  18. import warnings
  19. warnings.filterwarnings("ignore")
  20. def main(path):
  21. required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
  22. for idx, file in enumerate(required_files):
  23. print(idx, file)
  24. parsed_pdf = parser.from_file(file)
  25. # print(parsed_pdf.keys())
  26. content = parsed_pdf["content"].strip().replace("\n", "")
  27. # print(content)
  28. date = date_extraction(content)
  29. print(date)
  30. address = address_extraction(content)
  31. print(address)
  32. refer = refer_exteraction(content)
  33. print(refer)
  34. if __name__ == "__main__":
  35. HOME_DIR = os.path.expanduser("~")
  36. BASE_DIR = "/home/ftech/Code/pdf_parser/pdfs/"
  37. path = os.path.join(HOME_DIR, BASE_DIR)
  38. main(path)