"""This module is used to parse PDF for expert reports. """ from collections import defaultdict import os import tika from tika import parser import pandas as pd from pandas import DataFrame from parse_expert_pdf_utils import ( plaintiff_extraction, defendent_extraction, expert_name_extraction, case_number_extraction, patent_extraction, hourly_compensation, extract_email, ) tika.initVM() def main(path: str) -> DataFrame: """ The functions iterates through all the given files and gathers the data in the \ form of a dataframe """ data = defaultdict(list) required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1] for file in required_files: print(file) content = parser.from_file(file)["content"] data["file"].append(file) data["email"].append(extract_email(content)) data["full_name"].append(expert_name_extraction(content)) data["hourly_pay"].append(hourly_compensation(content)) data["plaintiff"].append(plaintiff_extraction(content)) data["defendant"].append(defendent_extraction(content)) data["case_number"].append(case_number_extraction(content)) data["patents"].append(patent_extraction(content)) data_expert = pd.DataFrame(data) data_expert.to_csv("required_data.csv") return data_expert if __name__ == "__main__": HOME_DIR = os.path.expanduser("~") BASE_DIR = "Code/pdf_parser/expert_report" PATH = os.path.join(HOME_DIR, BASE_DIR) print(main(PATH))