"""Extract dates and addresses from every PDF in a directory using Tika."""

import os
import warnings

import tika
from tika import parser

from parse_expert_pdf_utils import (
    case_number_extraction,
    expert_name_extraction,
    plaintiff_extraction,
    defendent_extraction,
    patent_extraction,
    on_behalf_of_extraction,
    acronym_extraction,
    hourly_compensation,
    date_extraction,
    address_extraction,
)

# Module-level setup, kept in the original order: start the Tika VM first,
# then silence all warnings for the rest of the run.
tika.initVM()
warnings.filterwarnings("ignore")


def main(path):
    """Parse each PDF found directly inside *path* and print extracted fields.

    For every directory entry whose name ends in ``.pdf`` (case-insensitive),
    the file is parsed with Tika, its text content is stripped and has all
    newlines removed, and the results of ``date_extraction`` and
    ``address_extraction`` on that text are printed.

    Args:
        path: Directory to scan. Only its immediate entries are considered.
    """
    # Match on the suffix only, so names such as "report.pdf.bak" are skipped.
    required_files = [
        name for name in os.listdir(path) if name.lower().endswith(".pdf")
    ]

    for idx, file in enumerate(required_files):
        print(idx, file)

        # os.listdir() yields bare names; join with the directory so parsing
        # works regardless of the current working directory.
        parsed_pdf = parser.from_file(os.path.join(path, file))

        raw_content = parsed_pdf.get("content")
        if raw_content is None:
            # No text came back for this file (presumably an image-only or
            # unreadable PDF); skip it instead of crashing on None.strip().
            print(f"No text content extracted from {file}; skipping")
            continue

        # Newlines are removed outright (not replaced by spaces), as before.
        content = raw_content.strip().replace("\n", "")

        date = date_extraction(content)
        print(date)

        address = address_extraction(content)
        print(address)


if __name__ == "__main__":
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "/home/ftech/Code/pdf_parser/pdfs/"
    # NOTE(review): BASE_DIR is absolute, so os.path.join() discards HOME_DIR
    # and the result is BASE_DIR itself. Kept as-is to preserve the effective
    # path; make BASE_DIR relative if a per-user location is intended.
    path = os.path.join(HOME_DIR, BASE_DIR)
    main(path)