Przeglądaj źródła

fixed formatting and added consistency

Harsh Parikh 2 lat temu
rodzic
commit
9ede71404d

+ 4 - 1
expert_report/parse_expert_pdf.py

@@ -3,6 +3,7 @@ import os
 import tika
 from tika import parser
 import pandas as pd
+from pandas import DataFrame
 from parse_expert_pdf_utils import (
     plaintiff_extraction,
     defendent_extraction,
@@ -10,13 +11,14 @@ from parse_expert_pdf_utils import (
     case_number_extraction,
     patent_extraction,
     hourly_compensation,
+    extract_email,
 )
 
 
 tika.initVM()
 
 
-def main(path: str) -> pd.DataFrame:
+def main(path: str) -> DataFrame:
     """
     The functions iterates through all the given files and gathers the data in the \
     form of a dataframe
@@ -27,6 +29,7 @@ def main(path: str) -> pd.DataFrame:
         print(file)
         content = parser.from_file(file)["content"]
         data["file"].append(file)
+        data["email"].append(content)
         data["full_name"].append(expert_name_extraction(content))
         data["hourly_pay"].append(hourly_compensation(content))
         data["plaintiff"].append(plaintiff_extraction(content))

+ 29 - 16
expert_report/parse_expert_pdf_utils.py

@@ -1,7 +1,7 @@
 import re
 
 
-def date_extraction(content):
+def date_extraction(content: str) -> str:
     """
     extracts filing date from the documents.
     """
@@ -12,7 +12,18 @@ def date_extraction(content):
         return "None"
 
 
-def address_extraction(content):
+def extract_email(content: str) -> str:
+    """
+    extracts email from the documents.
+    """
+    pattern = r"[a-z0-9]+@+[a-z].*?\.\w+\.?\w+"
+    try:
+        return re.search(pattern, content).group()
+    except:
+        return "None"
+
+
+def address_extraction(content: str) -> str:
     """
     extracts address from the documents.
     """
@@ -23,7 +34,7 @@ def address_extraction(content):
         return "None"
 
 
-def refer_exteraction(content):
+def refer_exteraction(content: str) -> str:
     """
     extract referals from the documents.
     """
@@ -37,7 +48,7 @@ def refer_exteraction(content):
         return "None"
 
 
-def case_number_extraction(content):
+def case_number_extraction(content: str) -> str:
     """
     Extracts the case number from the documents.
     """
@@ -49,7 +60,7 @@ def case_number_extraction(content):
     return list(results)
 
 
-def expert_name_extraction(content):
+def expert_name_extraction(content: str) -> str:
     """
     Extracts the name of the expert from the document.
     """
@@ -60,7 +71,7 @@ def expert_name_extraction(content):
         return "None"
 
 
-def plaintiff_extraction(content):
+def plaintiff_extraction(content: str) -> str:
     """
     Extracts the plaintiff from the document
     """
@@ -71,7 +82,7 @@ def plaintiff_extraction(content):
         return "None"
 
 
-def defendent_extraction(content):
+def defendent_extraction(content: str) -> str:
     """
     Extracts the defendant from the document
     """
@@ -82,7 +93,7 @@ def defendent_extraction(content):
         return "None"
 
 
-def patent_extraction(content):
+def patent_extraction(content: str) -> str:
     """
     Extracts patent numbers from the document
     """
@@ -94,7 +105,7 @@ def patent_extraction(content):
     return list(result)
 
 
-def law_firm_extraction(content):
+def law_firm_extraction(content: str) -> str:
     regex = r""
     results = []
     firm = re.compile(regex, re.IGNORECASE)
@@ -103,13 +114,15 @@ def law_firm_extraction(content):
     return results
 
 
-def on_behalf_of_extraction(content):
+def on_behalf_of_extraction(content: str) -> str:
     regex = "on\sbehalf\sof(.*?)(C|c)ase"
-    on_behalf_of = re.search(regex, content).groups()[0].strip()
-    return on_behalf_of
+    # try:
+    return re.search(regex, content)
+    # except:
+    #     return "None"
 
 
-def hourly_compensation(content):
+def hourly_compensation(content: str) -> str:
     """
     Returns the hourly compensation of the expert.
     """
@@ -120,15 +133,15 @@ def hourly_compensation(content):
         return "None"
 
 
-def ref_patents(content):
+def ref_patents(content: str) -> str:
     return
 
 
-def acronym_extraction(content):
+def acronym_extraction(content: str) -> str:
     regex = r"\([A-Z]+\)"
     # results = []
     # acronym = re.compile(regex, re.IGNORECASE)
-    # for current in acronym.finditer(content):
+    # for current in acronym.finditer(content: str) -> str:
     #     results.append(current)
     acronym = re.findall(regex, content)
     return list(set(acronym))

+ 2 - 1
expert_resume/parse_expert_resume.py

@@ -3,6 +3,7 @@ import os
 from IPython.display import display
 import tika
 import pandas as pd
+from pandas import DataFrame
 
 tika.initVM()
 from tika import parser
@@ -18,7 +19,7 @@ from parse_resume_utils import (
 )
 
 
-def main(PATH):
+def main(PATH: str) -> DataFrame:
     """
     Returns the required data in a dataframe format
     """

+ 10 - 10
expert_resume/parse_resume_utils.py

@@ -1,18 +1,18 @@
 import re
 
 
-def extract_email(content):
+def extract_email(content: str) -> str:
     """
     Extracts email id of the expert
     """
+    pattern = r"[a-z0-9]+@+[a-z].*?\.\w+\.?\w+"
     try:
-        pattern = r"[a-z0-9]+@+[a-z].*?\.\w+\.?\w+"
         return re.search(pattern, content).group()
     except:
         return "None"
 
 
-def extract_zipcode(content):
+def extract_zipcode(content: str) -> str:
     """
     Extracts zipcode from the resume
     """
@@ -23,18 +23,18 @@ def extract_zipcode(content):
         return "None"
 
 
-def extract_phone(content):
+def extract_phone(content: str) -> str:
     """
     Extracts phone number of the expert.
     """
+    pattern = r"(\(?\d{3}\)?\-?\s?\d{3}\-\d{4})"
     try:
-        pattern = r"(\(?\d{3}\)?\-?\s?\d{3}\-\d{4})"
         return re.search(pattern, content).group()
     except:
         return "None"
 
 
-def extract_case_numbers(content):
+def extract_case_numbers(content: str) -> str:
     """
     Extracts all the case numbers associated with resume
     """
@@ -45,18 +45,18 @@ def extract_case_numbers(content):
     return list(set(results))
 
 
-def extract_litigation_experience(content):
+def extract_litigation_experience(content: str) -> str:
     """
     Extracts the litigation experience of the expert
     """
+    pattern = r"(\d+|\w+)\s?years"
     try:
-        pattern = r"(\d+|\w+)\s?years"
         return re.search(pattern, content).group()
     except:
         return "None"
 
 
-def extract_patents_issued(content):
+def extract_patents_issued(content: str) -> str:
     """
     Returns the patents issued by the expert
     """
@@ -68,7 +68,7 @@ def extract_patents_issued(content):
     return list(set(results))
 
 
-def extract_name(content):
+def extract_name(content: str) -> str:
     """
     Returns the name of the expert
     """