"""Extract case details from complaint PDFs into a DataFrame and a CSV file."""

from collections import defaultdict
import os

from IPython.display import display
import tika
import pandas as pd

# The original script starts the Tika VM before importing ``tika.parser``;
# that ordering is preserved here.
tika.initVM()
from tika import parser  # noqa: E402

from parse_pdf_utils import (  # noqa: E402
    extract_acronyms,
    extract_case_number,
    extract_defendent,
    extract_filing_date,
    extract_firm_name,
    extract_patent_number,
    extract_plaintiff,
    extract_attorney_name,
)

# (output column, extractor) pairs. The order fixes both the order in which the
# extractors are called and the column order of the resulting DataFrame.
_EXTRACTORS = (
    ("case_number_list", extract_case_number),
    ("filing_date_list", extract_filing_date),
    ("plaintiff_list", extract_plaintiff),
    ("defendent_list", extract_defendent),
    ("acronyms_list", extract_acronyms),
    ("firm_name_list", extract_firm_name),
    ("attorney", extract_attorney_name),
    ("patent_list", extract_patent_number),
)


def extract_all(PATH):
    """
    Returns all the required data from the pdfs in a dataframe format.

    Every file in ``PATH`` whose name ends with ``.pdf`` is parsed with Tika,
    its text is stripped and has newlines removed, and each extractor from
    ``parse_pdf_utils`` is applied to that text. One row is produced per file.

    Args:
        PATH: Directory containing the PDF files.

    Returns:
        A ``pandas.DataFrame`` with one column per extractor. The same table
        is also written to ``required_data.csv`` in the current working
        directory (without the index).
    """
    data_dict = defaultdict(list)

    # ``os.listdir`` returns names in arbitrary order; sorting keeps the row
    # order of the output reproducible between runs.
    required_files = sorted(
        file for file in os.listdir(PATH) if file.endswith(".pdf")
    )

    for idx, file in enumerate(required_files):
        print(idx, file)

        # ``os.listdir`` yields bare names, so the directory must be joined
        # back on; otherwise parsing only works when the CWD is ``PATH``.
        parsed = parser.from_file(os.path.join(PATH, file))

        # Tika may report no content (``None``) when it extracts no text;
        # fall back to an empty string instead of crashing on ``.strip()``.
        # NOTE(review): confirm the extractors tolerate empty content.
        raw_content = parsed.get("content") or ""
        parse_file = raw_content.strip().replace("\n", "")

        for column, extractor in _EXTRACTORS:
            data_dict[column].append(extractor(content=parse_file))

    data = pd.DataFrame(data_dict)
    data.to_csv("required_data.csv", index=False)
    return data


if __name__ == "__main__":
    BASE_DIR = "Code/pdf_parser/complaints"
    HOME_DIR = os.path.expanduser("~")
    PATH = os.path.join(HOME_DIR, BASE_DIR)
    display(extract_all(PATH))