import os
from collections import defaultdict

import pandas as pd
import tika
from tika import parser

from utils import (
    extract_filing_date,
    address_extraction,
    refer_exteraction,
    email_extraction,
    telephone_number_extraction,
)

tika.initVM()


def main(PATH):
    """Extract selected fields from every PDF in a directory into a CSV.

    Each PDF found directly inside ``PATH`` is parsed with Tika, its text is
    flattened by removing newline characters, and the extraction helpers
    imported from ``utils`` are applied to that text.  One row per PDF is
    collected and the table is written to ``required_data.csv`` in the
    current working directory.

    Args:
        PATH: Directory containing the PDF files to process.
    """
    data_dict = defaultdict(list)

    # os.listdir() yields names in arbitrary order; sort so the CSV row order
    # is reproducible.  Match on the suffix (case-insensitively) rather than
    # on ".pdf" appearing anywhere in the name.
    pdf_names = sorted(
        name for name in os.listdir(PATH) if name.lower().endswith(".pdf")
    )

    for name in pdf_names:
        # os.listdir() returns bare names, so the directory must be joined
        # back on; otherwise the file is looked up relative to the CWD.
        parsed = parser.from_file(os.path.join(PATH, name))

        # Tika reports ``content`` as None when no text could be extracted;
        # fall back to an empty string so the row is still recorded.
        # NOTE(review): assumes the utils extractors tolerate "" - confirm.
        content = (parsed["content"] or "").strip().replace("\n", "")

        data_dict["filing"].append(extract_filing_date(content))
        data_dict["address"].append(address_extraction(content))
        data_dict["refer"].append(refer_exteraction(content))
        data_dict["email"].append(email_extraction(content))
        data_dict["telephone_number"].append(
            telephone_number_extraction(content)
        )

    data = pd.DataFrame(data_dict)
    data.to_csv("required_data.csv")


if __name__ == "__main__":
    # Default input location: ~/Code/pdf_parser/pdfs
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "Code/pdf_parser/pdfs"
    PATH = os.path.join(HOME_DIR, BASE_DIR)
    main(PATH)