"""Collect filing metadata from a directory of PDFs into ``required_data.csv``."""

import os
from collections import defaultdict

import pandas as pd
import tika
from tika import parser

from utils import (
    extract_filing_date,
    address_extraction,
    refer_exteraction,
    email_extraction,
    telephone_number_extraction,
)

# Initialise Tika once at import time, before any parser call is made.
tika.initVM()

# Output columns, in the order they are written to the CSV.
_COLUMNS = (
    "document_name",
    "filing_date",
    "address",
    "refer",
    "email",
    "telephone_number",
)


def main(PATH):
    """
    Parse the required data from the PDFs in ``PATH`` and write it to a CSV.

    Every file in ``PATH`` whose extension is ``.pdf`` (case-insensitive) is
    run through Tika, and the extracted text is handed to the field
    extractors imported from ``utils``. One row per document is written to
    ``required_data.csv`` in the current working directory.

    Args:
        PATH: Directory containing the PDF files to process.

    Raises:
        OSError: If ``PATH`` cannot be listed (propagated from ``os.listdir``).
    """
    data_dict = defaultdict(list)

    # Match on the real extension so names that merely contain ".pdf"
    # (e.g. "report.pdf.bak") are skipped; sort for a reproducible row order.
    required_files = sorted(
        name
        for name in os.listdir(PATH)
        if os.path.splitext(name)[1].lower() == ".pdf"
    )

    for file_name in required_files:
        # os.listdir yields bare names; join with PATH so parsing works
        # regardless of the current working directory.
        parsed = parser.from_file(os.path.join(PATH, file_name))

        # NOTE(review): content may be None when no text could be extracted
        # (e.g. image-only PDFs); normalise to "" so the extractors always
        # receive a string -- confirm the utils helpers expect that.
        content = parsed["content"] or ""

        data_dict["document_name"].append(file_name)
        data_dict["filing_date"].append(extract_filing_date(content))
        data_dict["address"].append(address_extraction(content))
        data_dict["refer"].append(refer_exteraction(content))
        data_dict["email"].append(email_extraction(content))
        data_dict["telephone_number"].append(telephone_number_extraction(content))

    # Passing the columns explicitly keeps the CSV header present even when
    # no PDF was found and data_dict is therefore empty.
    data = pd.DataFrame(data_dict, columns=list(_COLUMNS))
    data.to_csv("required_data.csv")


if __name__ == "__main__":
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "Code/pdf_parser/pdfs"
    PATH = os.path.join(HOME_DIR, BASE_DIR)
    main(PATH)