import os
import warnings
from collections import defaultdict

import pandas as pd
import tika
from tika import parser

from parse_expert_pdf_utils import (
    case_number_extraction,
    defendent_extraction,
    expert_name_extraction,
    plaintiff_extraction,
)

# Module-level setup kept from the original script: start the Tika JVM once
# and silence all warnings for the rest of the process.
tika.initVM()
warnings.filterwarnings("ignore")


def main(path: str) -> pd.DataFrame:
    """Extract case metadata from every PDF in *path* into a DataFrame.

    Each file in ``path`` whose name ends with ``.pdf`` is parsed with Tika
    and its text content is passed to the extraction helpers imported from
    ``parse_expert_pdf_utils``.

    Args:
        path: Directory containing the PDF reports to process.

    Returns:
        A DataFrame with one row per PDF and the columns ``file``,
        ``full_name``, ``plaintiff``, ``defendant`` and ``case_number``.
    """
    data = defaultdict(list)
    # Match on the extension only; a substring search would also pick up
    # names such as "report.pdf.bak".
    required_files = [file for file in os.listdir(path) if file.endswith(".pdf")]
    for file in required_files:
        print(file)
        # os.listdir() yields bare names, so anchor them to ``path``;
        # otherwise Tika would resolve them against the current directory.
        # NOTE(review): "content" may be None when Tika extracts no text --
        # confirm the extraction helpers tolerate that.
        content = parser.from_file(os.path.join(path, file))["content"]
        data["file"].append(file)
        data["full_name"].append(expert_name_extraction(content))
        data["plaintiff"].append(plaintiff_extraction(content))
        data["defendant"].append(defendent_extraction(content))
        data["case_number"].append(case_number_extraction(content))
    df = pd.DataFrame(data)
    return df


if __name__ == "__main__":
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "Code/pdf_parser/expert_report"
    PATH = os.path.join(HOME_DIR, BASE_DIR)
    print(main(PATH))