"""Parse the PDF files in a directory and tabulate fields extracted from each."""

from collections import defaultdict
import os

from IPython.display import display
import tika
import pandas as pd
from pandas import DataFrame

# Kept ahead of the parser import, as in the original statement order.
tika.initVM()
from tika import parser

from parse_resume_utils import (
    extract_email,
    extract_phone,
    extract_zipcode,
    extract_case_numbers,
    extract_litigation_experience,
    extract_patents_issued,
    extract_name,
)


def main(PATH: str) -> DataFrame:
    """Build a table of extracted fields, one row per PDF file in ``PATH``.

    Every file in ``PATH`` whose name ends with ``.pdf`` (case-insensitive)
    is parsed with Tika, and its text content is passed to each
    ``extract_*`` helper. The resulting table is also written to
    ``required_data.csv`` in the current working directory.

    Args:
        PATH: Directory containing the PDF files to parse.

    Returns:
        A DataFrame with the columns ``name``, ``email``, ``phone``,
        ``zipcode``, ``cases``, ``litigation_experience`` and
        ``patents_issued``. If no PDF files are found, the DataFrame is empty.
    """
    data_dict = defaultdict(list)

    # Match on the extension only, so names that merely contain ".pdf"
    # (e.g. "notes.pdf.bak") are not picked up.
    required_files = [
        file for file in os.listdir(PATH) if file.lower().endswith(".pdf")
    ]

    for idx, file in enumerate(required_files):
        print(idx, file)

        # os.listdir yields bare names; join with PATH so the file is found
        # regardless of the current working directory.
        file_path = os.path.join(PATH, file)

        # NOTE(review): the parsed "content" may be None when no text could be
        # extracted -- confirm the extract_* helpers tolerate that.
        parse_content = parser.from_file(file_path)["content"]

        data_dict["name"].append(extract_name(parse_content))
        data_dict["email"].append(extract_email(parse_content))
        data_dict["phone"].append(extract_phone(parse_content))
        data_dict["zipcode"].append(extract_zipcode(parse_content))
        data_dict["cases"].append(extract_case_numbers(parse_content))
        data_dict["litigation_experience"].append(
            extract_litigation_experience(parse_content)
        )
        data_dict["patents_issued"].append(extract_patents_issued(parse_content))

    data = pd.DataFrame(data_dict)
    data.to_csv("required_data.csv")
    return data


if __name__ == "__main__":
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "Code/pdf_parser/expert_resume"
    PATH = os.path.join(HOME_DIR, BASE_DIR)
    display(main(PATH))