123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- from parse_expert_pdf_utils import (
- case_number_extraction,
- expert_name_extraction,
- plaintiff_extraction,
- defendent_extraction,
- patent_extraction,
- on_behalf_of_extraction,
- acronym_extraction,
- hourly_compensation,
- date_extraction,
- address_extraction,
- refer_exteraction,
- )
# Standard library.
import os
import warnings

# Third party: Apache Tika bindings used to pull text out of the PDFs.
import tika
from tika import parser

# Initialise Tika once at import time, before any document is parsed.
tika.initVM()

# Silence every warning category for the remainder of the run.
warnings.filterwarnings("ignore")
def main(path):
    """Run the date, address and reference extractors over the PDFs in *path*.

    Every PDF located directly inside ``path`` is parsed with Tika, its text
    is flattened onto a single line, and the results of ``date_extraction``,
    ``address_extraction`` and ``refer_exteraction`` are printed to stdout.

    Args:
        path: Directory to scan for PDF files. Sub-directories are not
            searched.

    Returns:
        None. All results are printed.
    """
    # Match on the extension only (case-insensitively); a substring test would
    # also select names such as "report.pdf.bak" and would miss "REPORT.PDF".
    required_files = [
        name for name in os.listdir(path) if name.lower().endswith(".pdf")
    ]

    for idx, file in enumerate(required_files):
        print(idx, file)

        # os.listdir() yields bare names, so resolve each one against ``path``;
        # otherwise the lookup happens in the current working directory.
        parsed_pdf = parser.from_file(os.path.join(path, file))

        # Tika may report no content at all (e.g. image-only PDFs); skip such
        # documents instead of failing on ``None.strip()``.
        raw_content = parsed_pdf.get("content")
        if raw_content is None:
            print("no text content extracted, skipping")
            continue

        # Collapse the document onto one line before running the extractors.
        content = raw_content.strip().replace("\n", "")

        date = date_extraction(content)
        print(date)

        address = address_extraction(content)
        print(address)

        refer = refer_exteraction(content)
        print(refer)
if __name__ == "__main__":
    # Directory holding the input PDFs, expressed relative to the current
    # user's home directory. It is kept relative on purpose: os.path.join()
    # drops every earlier component as soon as it meets an absolute one, so
    # an absolute BASE_DIR would silently ignore HOME_DIR.
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "Code/pdf_parser/pdfs/"
    path = os.path.join(HOME_DIR, BASE_DIR)
    main(path)
|