فهرست منبع

added expert_resume parser

Harsh Parikh 2 سال پیش
والد
کامیت
7443b06f0e
2 فایلهای تغییر یافته به همراه 78 افزوده شده و 0 حذف شده
  1. 33 0
      expert_resume/parse_expert_resume.py
  2. 45 0
      expert_resume/parse_resume_utils.py

+ 33 - 0
expert_resume/parse_expert_resume.py

@@ -0,0 +1,33 @@
+from collections import defaultdict
+import os
+from IPython.display import display
+import tika
+import pandas as pd
+
+tika.initVM()
+from tika import parser
+
+from parse_resume_utils import extract_email, extract_phone, extract_zipcode
+
+
def main(PATH):
    """
    Parse every PDF resume in a directory and tabulate the extracted fields.

    Args:
        PATH: Directory that contains the resume PDF files.

    Returns:
        A pandas DataFrame with one row per PDF (in sorted file-name order)
        and the columns "email", "phone" and "zipcode". When the directory
        holds no PDF the frame is empty but still has those columns.
    """
    columns = ["email", "phone", "zipcode"]
    data_dict = defaultdict(list)
    # Test the real extension so names such as "cv.pdf.bak" are skipped, and
    # sort because os.listdir() returns entries in arbitrary order.
    required_files = sorted(
        name for name in os.listdir(PATH) if name.lower().endswith(".pdf")
    )
    for idx, name in enumerate(required_files):
        print(idx, name)
        # os.listdir() yields bare names; join with PATH so the file is found
        # regardless of the current working directory.
        parsed = parser.from_file(os.path.join(PATH, name))
        # Tika may report no content at all (None); treat that as empty text
        # instead of failing on None.strip().
        parse_content = (parsed.get("content") or "").strip().replace("\n", "")
        data_dict["email"].append(extract_email(parse_content))
        data_dict["phone"].append(extract_phone(parse_content))
        data_dict["zipcode"].append(extract_zipcode(parse_content))

    return pd.DataFrame(data_dict, columns=columns)
+
+
if __name__ == "__main__":
    # Script entry point: the resumes are expected in
    # ~/Code/pdf_parser/expert_resume; the resulting table is rendered
    # with IPython's display().
    HOME_DIR, BASE_DIR = os.path.expanduser("~"), "Code/pdf_parser/expert_resume"
    PATH = os.path.join(HOME_DIR, BASE_DIR)
    resume_table = main(PATH)
    display(resume_table)

+ 45 - 0
expert_resume/parse_resume_utils.py

@@ -0,0 +1,45 @@
+import re
+
+
def extract_email(content):
    """
    Extracts email id of the expert.

    Args:
        content: Plain text of the resume.

    Returns:
        The first email address found in ``content``, or the string "None"
        when nothing matches or ``content`` is not a string.
    """
    if not isinstance(content, str):
        return "None"
    # Local part may contain dots, plus signs etc.; the domain may have
    # several labels (mail.example.com). Matching is case-insensitive.
    pattern = r"[a-z0-9._%+-]+@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,3}"
    match = re.search(pattern, content, re.IGNORECASE)
    # group() is the whole matched text; indexing it with [0] would return
    # only its first character.
    return match.group() if match else "None"
+
+
def extract_zipcode(content):
    """
    Extracts zipcode from the resume.

    A zipcode is recognised as five digits that follow a two-character token
    (such as a state abbreviation) and a single whitespace character.

    Args:
        content: Plain text of the resume.

    Returns:
        The five-digit zipcode as a string, or the string "None" when nothing
        matches or ``content`` is not a string.
    """
    if not isinstance(content, str):
        return "None"
    # The two-character prefix only anchors the match; the digits are captured
    # on their own. (?!\d) rejects the first five digits of a longer number.
    pattern = r"\w{2}\s(\d{5})(?!\d)"
    match = re.search(pattern, content)
    return match.group(1) if match else "None"
+
+
def extract_phone(content):
    """
    Extracts phone number of the expert.

    Recognises ten-digit numbers written as 415-555-1234, (415) 555-1234,
    415.555.1234, 415 555 1234 or 4155551234.

    Args:
        content: Plain text of the resume.

    Returns:
        The first phone number found, exactly as written in ``content``, or
        the string "None" when nothing matches or ``content`` is not a string.
    """
    if not isinstance(content, str):
        return "None"
    # Separators are optional and may be a hyphen, dot or whitespace; the
    # look-arounds keep the match from being a slice of a longer digit run.
    pattern = r"(?<!\d)\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\d)"
    match = re.search(pattern, content)
    # group() is the whole matched text; indexing it with [0] would return
    # only its first character.
    return match.group() if match else "None"
+
+
def extract_case_numbers(content):
    """
    Extracts all the case numbers associated with resume.

    Args:
        content: Plain text of the resume.

    Returns:
        A list of the distinct case numbers (for example "1:19-cv-01234-ABC")
        in the order in which they first appear in ``content``; an empty list
        when there is none.
    """
    case_numbers = re.compile(r"\d\:\d{2}\-\w+\-\d+\-\w+\-?\w+", re.IGNORECASE)
    # dict.fromkeys() drops duplicates while keeping first-seen order, so the
    # result is reproducible from run to run (a set has no defined order).
    # No comma stripping is needed: the pattern cannot match a comma.
    found = (current.group() for current in case_numbers.finditer(content))
    return list(dict.fromkeys(found))