# test.py
"""Print the text that Apache Tika extracts from a PDF, flattened to one line.

Usage:
    python test.py [PDF_PATH]

When PDF_PATH is omitted, the script falls back to ``DEFAULT_PDF_PATH``.
"""
import re
import sys

import tika
from tika import parser

# Kept at import time, exactly as in the original script.
tika.initVM()

# Document parsed when no path is supplied on the command line.
DEFAULT_PDF_PATH = (
    "/home/ftech/Desktop/harsh_parikh_codes/PDF_Scrapper/Complaint/document.pdf"
)


def _extract_text(pdf_path):
    """Return the text Tika extracts from *pdf_path* with newlines removed.

    Args:
        pdf_path: Path of the document handed to ``parser.from_file``.

    Returns:
        The extracted text, stripped of leading/trailing whitespace and with
        every newline character deleted. An empty string is returned when
        Tika yields no text content.
    """
    parsed = parser.from_file(pdf_path)
    # The "content" entry can be None when no text could be extracted
    # (e.g. an image-only PDF); fall back to "" instead of crashing on
    # ``None.strip()``.
    content = parsed.get("content") or ""
    # NOTE(review): newlines are deleted rather than replaced by a space, so
    # words on adjacent lines end up fused together -- confirm this is what
    # the regex experiments below expect.
    return content.strip().replace("\n", "")


if __name__ == "__main__":
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
    data = _extract_text(pdf_path)
    print(data)
    # Disabled regex experiments for pulling party names out of the text:
    # pattern = r"OF\s\w+(\s.*?\,).*?Plaintiff"
    # print(re.search(pattern, data).groups()[0])
    # pattern = r"Plaintiff[s]?.*v[s]?\.(.*)Defendant\."
    # re.search(pattern, data).group()