"""Extract the text of a PDF with Apache Tika and print it as a single line.

Usage:
    python <this script> [PDF_PATH]

When PDF_PATH is omitted, DEFAULT_PDF_PATH is parsed.
"""

import re
import sys

import tika
from tika import parser

tika.initVM()

# Document parsed when no path is supplied on the command line.
DEFAULT_PDF_PATH = (
    "/home/ftech/Desktop/harsh_parikh_codes/PDF_Scrapper/Complaint/document.pdf"
)


def extract_text(pdf_path):
    """Return the text Tika extracts from *pdf_path*, flattened to one line.

    Leading/trailing whitespace is stripped and every newline is removed
    (not replaced by a space), so text on adjacent lines is joined directly.

    Args:
        pdf_path: Path of the document handed to ``parser.from_file``.

    Returns:
        The extracted text as a single-line string.

    Raises:
        ValueError: If the parse result carries no text content.
    """
    parsed = parser.from_file(pdf_path)
    # The "content" entry may be missing or None when Tika finds no
    # extractable text (e.g. image-only PDFs); fail with a clear message
    # instead of an AttributeError on None.strip().
    content = parsed.get("content")
    if content is None:
        raise ValueError(f"Tika extracted no text from {pdf_path!r}")
    return content.strip().replace("\n", "")


def main(argv=None):
    """Print the flattened text of the PDF named in *argv* (or the default).

    Args:
        argv: Command-line arguments excluding the program name; defaults
            to ``sys.argv[1:]``.
    """
    args = sys.argv[1:] if argv is None else argv
    pdf_path = args[0] if args else DEFAULT_PDF_PATH
    try:
        data = extract_text(pdf_path)
    except ValueError as err:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(str(err))
    print(data)

    # NOTE: draft patterns kept from the original script for pulling party
    # names out of the flattened text; not yet enabled.
    # pattern = r"OF\s\w+(\s.*?\,).*?Plaintiff"
    # print(re.search(pattern, data).groups()[0])
    # pattern = r"Plaintiff[s]?.*v[s]?\.(.*)Defendant\."
    # re.search(pattern, data).group()


if __name__ == "__main__":
    main()