12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
- from collections import defaultdict
- import os
- import tika
- from tika import parser
- import pandas as pd
- from pandas import DataFrame
- from parse_expert_pdf_utils import (
- plaintiff_extraction,
- defendent_extraction,
- expert_name_extraction,
- case_number_extraction,
- patent_extraction,
- hourly_compensation,
- extract_email,
- )
# Runs once at import time, before main() makes any parser call.
# NOTE(review): in some tika-python releases initVM() is only a
# backward-compatibility stub -- confirm against the installed version
# before relying on (or removing) this call.
- tika.initVM()
def main(path: str) -> DataFrame:
    """
    Parse every PDF found directly inside ``path`` and gather the extracted
    fields in a dataframe.

    For each PDF the text content is obtained through ``parser.from_file`` and
    handed to the extraction helpers. One row is produced per file, with the
    columns ``file``, ``email``, ``full_name``, ``hourly_pay``, ``plaintiff``,
    ``defendant``, ``case_number`` and ``patents``. The resulting dataframe is
    also written to ``required_data.csv`` in the current working directory.

    Args:
        path: Directory that contains the PDF reports.

    Returns:
        The dataframe holding one row of extracted data per PDF file.
    """
    data = defaultdict(list)
    # Test the extension itself: a substring search would also pick up names
    # such as "report.pdf.bak". Lower-casing lets "*.PDF" files through too.
    required_files = [
        file for file in os.listdir(path) if file.lower().endswith(".pdf")
    ]
    for file in required_files:
        print(file)
        # os.listdir() yields bare file names, so the directory has to be
        # prepended; otherwise parsing only works when cwd happens to be path.
        # NOTE(review): "content" may be None for PDFs without extractable
        # text -- confirm the helpers below cope with that.
        content = parser.from_file(os.path.join(path, file))["content"]
        data["file"].append(file)
        # Store the extracted address, not the whole document text.
        data["email"].append(extract_email(content))
        data["full_name"].append(expert_name_extraction(content))
        data["hourly_pay"].append(hourly_compensation(content))
        data["plaintiff"].append(plaintiff_extraction(content))
        data["defendant"].append(defendent_extraction(content))
        data["case_number"].append(case_number_extraction(content))
        data["patents"].append(patent_extraction(content))
    data_expert = pd.DataFrame(data)
    data_expert.to_csv("required_data.csv")
    return data_expert
if __name__ == "__main__":
    # Location of the expert reports, relative to the user's home directory.
    BASE_DIR = "Code/pdf_parser/expert_report"
    HOME_DIR = os.path.expanduser("~")
    PATH = os.path.join(HOME_DIR, BASE_DIR)

    # Run the extraction over that directory and show the resulting table.
    print(main(PATH))
|