1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950 |
- from parse_expert_pdf_utils import (
- case_number_extraction,
- expert_name_extraction,
- plaintiff_extraction,
- defendent_extraction,
- patent_extraction,
- on_behalf_of_extraction,
- acronym_extraction,
- hourly_compensation,
- date_extraction,
- address_extraction,
- )
- import tika
- from tika import parser
- import re
- import warnings
- warnings.filterwarnings('ignore')
def main(
    path: str = "/home/ftech/Code/xc/pdf_parser/pdfs/2018008353_Mail_Decision.pdf",
) -> None:
    """Parse one PDF with Tika and print the date and address found in its text.

    Args:
        path: Location of the PDF to parse. Defaults to the sample document
            this script was originally hard-wired to, so calling ``main()``
            with no arguments behaves as before.
    """
    import sys  # local import: only needed for the diagnostic below

    tika.initVM()
    parsed_pdf = parser.from_file(path)
    print(parsed_pdf.keys())

    raw_content = parsed_pdf['content']
    if raw_content is None:
        # Tika reports no content when the document has no extractable text
        # (e.g. an image-only scan); stop here instead of failing on
        # ``None.strip()`` with an opaque AttributeError.
        print(f"No text content could be extracted from {path}", file=sys.stderr)
        return

    # The extractors receive the document as one string with the newlines
    # removed.
    content = raw_content.strip().replace('\n', '')

    # Only the date and address extractors are exercised here; the other
    # imported *_extraction helpers can be called with ``content`` the same way.
    date = date_extraction(content)
    print(date)

    address = address_extraction(content)
    print(address)
# Script entry point: run the extraction demo only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|