parse_expert_resume.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. from collections import defaultdict
  2. import os
  3. from IPython.display import display
  4. from sympy import content
  5. import tika
  6. import pandas as pd
  7. tika.initVM()
  8. from tika import parser
  9. from parse_resume_utils import (
  10. extract_email,
  11. extract_phone,
  12. extract_zipcode,
  13. extract_case_numbers,
  14. extract_litigation_experience,
  15. extract_patents_issued,
  16. extract_name,
  17. )
  18. def main(PATH):
  19. """
  20. Returns the required data in a dataframe format
  21. """
  22. data_dict = defaultdict(list)
  23. required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
  24. for idx, file in enumerate(required_files):
  25. print(idx, file)
  26. parse_content = parser.from_file(file)["content"].strip().replace("\n", "")
  27. data_dict["name"].append(extract_name(parse_content))
  28. data_dict["email"].append(extract_email(parse_content))
  29. data_dict["phone"].append(extract_phone(parse_content))
  30. data_dict["zipcode"].append(extract_zipcode(parse_content))
  31. data_dict["cases"].append(extract_case_numbers(parse_content))
  32. data_dict["litigation_experience"].append(
  33. extract_litigation_experience(parse_content)
  34. )
  35. data_dict["patents_issued"].append(extract_patents_issued(parse_content))
  36. data = pd.DataFrame(data_dict)
  37. data.to_csv("required_data.csv")
  38. return data
  39. if __name__ == "__main__":
  40. HOME_DIR = os.path.expanduser("~")
  41. BASE_DIR = "Code/pdf_parser/expert_resume"
  42. PATH = os.path.join(HOME_DIR, BASE_DIR)
  43. display(main(PATH))