1234567891011121314151617181920 |
"""Extract the text of a PDF with Apache Tika and print it as a single line.

Usage: run with no arguments to parse ``DEFAULT_PDF_PATH``, or pass the path
of another document as the first command-line argument.
"""
import re
import sys

import tika
from tika import parser

tika.initVM()

# Document parsed when no path is supplied on the command line.
DEFAULT_PDF_PATH = (
    "/home/ftech/Desktop/harsh_parikh_codes/PDF_Scrapper/Complaint/document.pdf"
)


def extract_text(pdf_path):
    """Return the text Tika extracts from *pdf_path*, flattened to one line.

    Leading/trailing whitespace is stripped and every newline is removed
    (not replaced by a space), so text on adjacent lines is joined directly.

    Args:
        pdf_path: Path of the document handed to ``parser.from_file``.

    Returns:
        The flattened text, or ``""`` when the parse result has no
        ``"content"`` entry or its value is ``None``.
    """
    parsed = parser.from_file(pdf_path)
    # Guard before calling str methods: the content may be absent or None
    # (presumably when the document has no extractable text, e.g. a scan).
    content = parsed.get("content")
    if content is None:
        return ""
    return content.strip().replace("\n", "")


if __name__ == "__main__":
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_PDF_PATH
    data = extract_text(pdf_path)
    if not data:
        print(f"warning: no text extracted from {pdf_path}", file=sys.stderr)
    print(data)
    # Regex experiments kept for reference (currently disabled):
    # pattern = r"OF\s\w+(\s.*?\,).*?Plaintiff"
    # print(re.search(pattern, data).groups()[0])
    # pattern = r"Plaintiff[s]?.*v[s]?\.(.*)Defendant\."
    # re.search(pattern, data).group()
|