123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- from collections import defaultdict
- import os
- from IPython.display import display
- import tika
- import pandas as pd
- tika.initVM()
- from tika import parser
- from parse_pdf_utils import (
- extract_acronyms,
- extract_case_number,
- extract_defendent,
- extract_filing_date,
- extract_firm_name,
- extract_patent_number,
- extract_plaintiff,
- extract_attorney_name,
- )
def _extract_fields(content):
    """
    Runs every field extractor over the text of a single document.

    Returns a dict mapping each output column name to the value extracted
    from ``content``. The key order is the column order of the dataframe
    built by ``extract_all``.
    """
    return {
        "case_number_list": extract_case_number(content=content),
        "filing_date_list": extract_filing_date(content=content),
        "plaintiff_list": extract_plaintiff(content=content),
        "defendent_list": extract_defendent(content=content),
        "acronyms_list": extract_acronyms(content=content),
        "firm_name_list": extract_firm_name(content=content),
        "attorney": extract_attorney_name(content=content),
        "patent_list": extract_patent_number(content=content),
    }


def extract_all(PATH):
    """
    Returns all the required data from the pdfs in a dataframe format.

    Every file directly inside ``PATH`` whose name ends in ``.pdf`` (case
    insensitive) is parsed with tika, its text is passed to each extractor,
    and one row per file is collected. Files are processed in sorted name
    order so the output is reproducible.

    Besides returning the dataframe, the same data is written to
    ``required_data.csv`` in the current working directory.
    """
    data_dict = defaultdict(list)
    # Match on the extension only; a substring test would also pick up names
    # such as "notes.pdf.bak".
    required_files = sorted(
        file for file in os.listdir(PATH) if file.lower().endswith(".pdf")
    )
    for idx, file in enumerate(required_files):
        print(idx, file)
        # os.listdir yields bare names, so they must be joined with PATH;
        # otherwise parsing only works when the cwd happens to be PATH.
        parsed = parser.from_file(os.path.join(PATH, file))
        # tika can report no text at all (content is None), e.g. for a PDF
        # without a text layer; fall back to an empty string in that case.
        content = (parsed["content"] or "").strip().replace("\n", "")
        for column, value in _extract_fields(content).items():
            data_dict[column].append(value)
    data = pd.DataFrame(data_dict)
    data.to_csv("required_data.csv", index=False)
    return data
if __name__ == "__main__":
    # The complaint PDFs are looked up under the current user's home
    # directory, in Code/pdf_parser/complaints.
    HOME_DIR = os.path.expanduser("~")
    BASE_DIR = "Code/pdf_parser/complaints"
    PATH = os.path.join(HOME_DIR, BASE_DIR)

    # Extract the data from every PDF and show the resulting table.
    display(extract_all(PATH))
|