ソースを参照

modified pin code and email parser

Harsh Parikh 2年前
コミット
6bbbfdc0ee

+ 1 - 1
expert_resume/parse_expert_resume.py

@@ -27,7 +27,7 @@ def main(PATH):
     required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
     for idx, file in enumerate(required_files):
         print(idx, file)
-        parse_content = parser.from_file(file)["content"].strip().replace("\n", "")
+        parse_content = parser.from_file(file)["content"]
         data_dict["name"].append(extract_name(parse_content))
         data_dict["email"].append(extract_email(parse_content))
         data_dict["phone"].append(extract_phone(parse_content))

+ 5 - 5
expert_resume/parse_resume_utils.py

@@ -6,7 +6,7 @@ def extract_email(content):
     Extracts email id of the expert
     """
     try:
-        pattern = r"([a-z0-9]+@[a-z]+\.[a-z]{2,3})"
+        pattern = r"[a-z0-9]+@+[a-z].*?\.\w+\.?\w+"
         return re.search(pattern, content).group()
     except:
         return "None"
@@ -16,9 +16,9 @@ def extract_zipcode(content):
     """
     Extracts zipcode from the resume
     """
+    pattern = r"[A-Z]{2}\s\d{5,6}"
     try:
-        pattern = r"(\w{2}\s\d{5})"
-        return re.search(pattern, content).groups()[0]
+        return re.search(pattern, content).group()
     except:
         return "None"
 
@@ -72,8 +72,8 @@ def extract_name(content):
     """
     Returns the name of the expert
     """
-    # pattern = r"(\w+\s\w+.*?)Resume" Old pattern
-    pattern = r"(RESUME|\SResume)\s(\w+\s\w+\.?\s\w+)|\-(\s.*?)Resume"
+    # pattern = r"(RESUME|\SResume)\s(\w+\s\w+\.?\s\w+)|\-(\s.*?)Resume" Old pattern
+    pattern = r"\w+.*\n"
     try:
         return re.search(pattern, content).group()
     except: