"""Extract structured fields from expert-report PDFs and save them as a CSV."""

from collections import defaultdict
import os

import pandas as pd
from pandas import DataFrame
import tika
from tika import parser

from parse_expert_pdf_utils import (
    plaintiff_extraction,
    defendent_extraction,
    expert_name_extraction,
    case_number_extraction,
    patent_extraction,
    hourly_compensation,
    extract_email,
)

tika.initVM()


def main(path: str) -> DataFrame:
    """Parse every PDF directly inside ``path`` and collect the extracted fields.

    Each PDF is converted to text with tika, then passed to the extraction
    helpers from ``parse_expert_pdf_utils``. One row per file is accumulated,
    written to ``required_data.csv`` in the current working directory, and
    returned.

    Args:
        path: Directory containing the PDF files (not searched recursively).

    Returns:
        A DataFrame with the columns ``file``, ``email``, ``full_name``,
        ``hourly_pay``, ``plaintiff``, ``defendant``, ``case_number`` and
        ``patents``. It is empty (no columns) when the directory holds no PDFs.
    """
    data = defaultdict(list)

    # Match on the file extension (case-insensitively) rather than on ".pdf"
    # appearing anywhere in the name, and sort so the row order is
    # reproducible: os.listdir() returns entries in arbitrary order.
    required_files = sorted(
        name for name in os.listdir(path) if name.lower().endswith(".pdf")
    )

    for file in required_files:
        print(file)

        # os.listdir() yields bare names, so the directory has to be joined
        # back on; otherwise parsing only works when cwd happens to be `path`.
        parsed = parser.from_file(os.path.join(path, file))

        # NOTE(review): tika appears to report None as the content of
        # documents with no extractable text (e.g. scanned images) - confirm.
        # Normalise to "" so the helpers always receive a string.
        content = parsed["content"] or ""

        data["file"].append(file)
        # Store the extracted e-mail, not the whole document text.
        data["email"].append(extract_email(content))
        data["full_name"].append(expert_name_extraction(content))
        data["hourly_pay"].append(hourly_compensation(content))
        data["plaintiff"].append(plaintiff_extraction(content))
        data["defendant"].append(defendent_extraction(content))
        data["case_number"].append(case_number_extraction(content))
        data["patents"].append(patent_extraction(content))

    data_expert = pd.DataFrame(data)
    data_expert.to_csv("required_data.csv")
    return data_expert


if __name__ == "__main__":
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "Code/pdf_parser/expert_report"
    PATH = os.path.join(HOME_DIR, BASE_DIR)
    print(main(PATH))