import os

import tika
from tika import parser

from utils import (
    extract_filing_date,
    address_extraction,
    refer_exteraction,
    email_extraction,
    telephone_number_extraction,
)

# Kept at import time exactly as in the original script.
# NOTE(review): presumably boots the Tika VM/server before parsing -- confirm.
tika.initVM()


def main(PATH):
    """Parse every PDF in ``PATH`` and print the fields extracted from it.

    For each PDF the text is obtained with ``tika.parser.from_file`` and then
    passed to each extraction helper imported from ``utils``; every helper's
    result is printed on its own line.

    Args:
        PATH: Directory containing the PDF files to process.

    Raises:
        OSError: Propagated from ``os.listdir`` if ``PATH`` cannot be listed.
    """
    # Match on the file suffix (case-insensitively) instead of a substring
    # search, so names such as "report.pdf.bak" are not treated as PDFs.
    # Sorting makes the printed index stable between runs.
    required_files = sorted(
        name for name in os.listdir(PATH) if name.lower().endswith(".pdf")
    )

    for idx, file in enumerate(required_files):
        print(idx, file)

        # os.listdir returns bare names; join with PATH so parsing works
        # regardless of the current working directory.
        parsed = parser.from_file(os.path.join(PATH, file))

        # Tika reports content as None when no text could be extracted
        # (e.g. image-only PDFs); skip those instead of crashing on .strip().
        raw_content = (parsed or {}).get("content")
        if not raw_content:
            print(f"{file}: no extractable text, skipping")
            continue

        # Newlines are removed (not replaced by spaces), as in the original.
        content = raw_content.strip().replace("\n", "")

        print(extract_filing_date(content))
        print(address_extraction(content))
        print(refer_exteraction(content))
        print(email_extraction(content))
        print(telephone_number_extraction(content))


if __name__ == "__main__":
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "Code/pdf_parser/pdfs"
    PATH = os.path.join(HOME_DIR, BASE_DIR)
    main(PATH)