123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- from collections import defaultdict
- import os
- from IPython.display import display
- import tika
- import pandas as pd
- tika.initVM()
- from tika import parser
- from parse_resume_utils import (
- extract_email,
- extract_phone,
- extract_zipcode,
- extract_case_numbers,
- extract_litigation_experience,
- extract_patents_issued,
- )
def main(PATH):
    """
    Parse every PDF resume in a directory and tabulate the extracted fields.

    Each PDF found directly inside ``PATH`` is run through Tika. Its text
    content is stripped, has newlines removed, and is then handed to each of
    the ``extract_*`` helpers. One row is appended per PDF, in the order
    ``os.listdir`` reports the files.

    Args:
        PATH: Directory containing the PDF resumes.

    Returns:
        A ``pandas.DataFrame`` with the columns ``email``, ``phone``,
        ``zipcode``, ``cases``, ``litigation_experience`` and
        ``patents_issued``. If the directory holds no PDFs the frame is
        empty and has none of these columns.
    """
    data_dict = defaultdict(list)
    # Match on the extension itself (case-insensitively) rather than on
    # ".pdf" appearing anywhere in the name, so "cv.pdf.bak" is skipped and
    # "CV.PDF" is picked up.
    required_files = [
        file for file in os.listdir(PATH) if file.lower().endswith(".pdf")
    ]
    for idx, file in enumerate(required_files):
        print(idx, file)
        # os.listdir returns bare names; join with PATH so parsing works
        # regardless of the current working directory.
        parsed = parser.from_file(os.path.join(PATH, file))
        # Tika can report no content (None) for PDFs without extractable
        # text; fall back to an empty string so the row is still produced.
        # NOTE(review): assumes the extract_* helpers accept "" -- confirm.
        raw_content = parsed.get("content") or ""
        parse_content = raw_content.strip().replace("\n", "")
        data_dict["email"].append(extract_email(parse_content))
        data_dict["phone"].append(extract_phone(parse_content))
        data_dict["zipcode"].append(extract_zipcode(parse_content))
        data_dict["cases"].append(extract_case_numbers(parse_content))
        data_dict["litigation_experience"].append(
            extract_litigation_experience(parse_content)
        )
        data_dict["patents_issued"].append(extract_patents_issued(parse_content))
    return pd.DataFrame(data_dict)
if __name__ == "__main__":
    # Script entry point: the resumes live in a fixed folder under the
    # current user's home directory; show the parsed table via IPython.
    PATH = os.path.join(os.path.expanduser("~"), "Code/pdf_parser/expert_resume")
    resume_table = main(PATH)
    display(resume_table)
|