Browse Source

added parsers for elements of expert resume

Harsh Parikh 2 năm trước
mục cha
commit
6a5cac4a92
2 tập tin đã thay đổi với 43 bổ sung và 8 xóa
  1. 13 1
      expert_resume/parse_expert_resume.py
  2. 30 7
      expert_resume/parse_resume_utils.py

+ 13 - 1
expert_resume/parse_expert_resume.py

@@ -7,7 +7,14 @@ import pandas as pd
 tika.initVM()
 from tika import parser
 
-from parse_resume_utils import extract_email, extract_phone, extract_zipcode
+from parse_resume_utils import (
+    extract_email,
+    extract_phone,
+    extract_zipcode,
+    extract_case_numbers,
+    extract_litigation_experience,
+    extract_patents_issued,
+)
 
 
 def main(PATH):
@@ -22,6 +29,11 @@ def main(PATH):
         data_dict["email"].append(extract_email(parse_content))
         data_dict["phone"].append(extract_phone(parse_content))
         data_dict["zipcode"].append(extract_zipcode(parse_content))
+        data_dict["cases"].append(extract_case_numbers(parse_content))
+        data_dict["litigation_experience"].append(
+            extract_litigation_experience(parse_content)
+        )
+        data_dict["patents_issued"].append(extract_patents_issued(parse_content))
 
     return pd.DataFrame(data_dict)
 

+ 30 - 7
expert_resume/parse_resume_utils.py

@@ -6,8 +6,8 @@ def extract_email(content):
     Extracts email id of the expert
     """
     try:
-        pattern = r"[a-z0-9]+@[a-z]+\.[a-z]{2,3}"
-        return re.search(pattern, content).group()[0]
+        pattern = r"([a-z0-9]+@[a-z]+\.[a-z]{2,3})"
+        return re.search(pattern, content).groups()[0]
     except:
         return "None"
 
@@ -17,8 +17,8 @@ def extract_zipcode(content):
     Extracts zipcode from the resume
     """
     try:
-        pattern = r"\w{2}\s\d{5}"
-        return re.search(pattern, content).group()[0]
+        pattern = r"(\w{2}\s\d{5})"
+        return re.search(pattern, content).groups()[0]
     except:
         return "None"
 
@@ -28,8 +28,8 @@ def extract_phone(content):
     Extracts phone number of the expert.
     """
     try:
-        pattern = r"\(?\d{3}\)?\-\d{3}\-\d{4}"
-        return re.search(pattern, content).group()[0]
+        pattern = r"(\(?\d{3}\)?\-?\s?\d{3}\-\d{4})"
+        return re.search(pattern, content).group()
     except:
         return "None"
 
@@ -39,7 +39,30 @@ def extract_case_numbers(content):
     Extracts all the case numbers associated with resume
     """
     results = []
-    case_numbers = re.compile(r"\d\:\d{2}\-\w+\-\d+\-\w+\-?\w+", re.IGNORECASE)
+    case_numbers = re.compile(r"\d{2}\-\w+\-\d+\-\w+\-?\w+", re.IGNORECASE)
     for current in case_numbers.finditer(content):
         results.append(current.group().replace(",", ""))
     return list(set(results))
+
+
+def extract_litigation_experience(content):
+    """
+    Extracts the litigation experience of the expert
+    """
+    try:
+        pattern = r"(\d+|\w+)\s?years"
+        return re.search(pattern, content).group()
+    except:
+        return "None"
+
+
+def extract_patents_issued(content):
+    """
+    Returns the patents issued by the expert
+    """
+    regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
+    results = []
+    patent = re.compile(regex, re.IGNORECASE)
+    for current in patent.finditer(content):
+        results.append(current.group().replace(",", ""))
+    return list(set(results))