Parcourir la source

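Updated .gitignore, changed the email and name extraction (new name pattern; both now return the full regex match), and saved the parsed data to required_data.csv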

Harsh Parikh il y a 2 ans
Parent
commit
bcec109e19
3 fichiers modifiés avec 9 ajouts et 5 suppressions
  1. 1 0
      .gitignore
  2. 4 2
      expert_resume/parse_expert_resume.py
  3. 4 3
      expert_resume/parse_resume_utils.py

+ 1 - 0
.gitignore

@@ -67,3 +67,4 @@ foo.py
 .~lock.Document Parser Fields.xlsx#
 .DS_Store
 .vscode
+.~lock.required_data.csv#

+ 4 - 2
expert_resume/parse_expert_resume.py

@@ -1,6 +1,7 @@
 from collections import defaultdict
 import os
 from IPython.display import display
+from sympy import content
 import tika
 import pandas as pd
 
@@ -36,8 +37,9 @@ def main(PATH):
             extract_litigation_experience(parse_content)
         )
         data_dict["patents_issued"].append(extract_patents_issued(parse_content))
-
-    return pd.DataFrame(data_dict)
+    data = pd.DataFrame(data_dict)
+    data.to_csv("required_data.csv")
+    return data
 
 
 if __name__ == "__main__":

+ 4 - 3
expert_resume/parse_resume_utils.py

@@ -7,7 +7,7 @@ def extract_email(content):
     """
     try:
         pattern = r"([a-z0-9]+@[a-z]+\.[a-z]{2,3})"
-        return re.search(pattern, content).groups()[0]
+        return re.search(pattern, content).group()
     except:
         return "None"
 
@@ -72,8 +72,9 @@ def extract_name(content):
     """
     Returns the name of the expert
     """
-    pattern = r"(\w+\s\w+.*?)Resume"
+    # Old pattern, kept for reference: r"(\w+\s\w+.*?)Resume"
+    pattern = r"(RESUME|\SResume)\s(\w+\s\w+\.?\s\w+)|\-(\s.*?)Resume"
     try:
-        return re.search(pattern, content).groups()[0]
+        return re.search(pattern, content).group()
     except:
         return "None"