Преглед на файлове

Updated parser for acronyms

Harsh Parikh преди 2 години
родител
ревизия
deff3db35b
променени са 3 файла, в които са добавени 22 реда и са изтрити 14 реда
  1. 3 1
      .gitignore
  2. 2 0
      complaints/parse_pdf.py
  3. 17 13
      complaints/parse_pdf_utils.py

+ 3 - 1
.gitignore

@@ -62,4 +62,6 @@ target/
 *.xlsx
 complaints/test.py
 *.csv
-foo.py
+foo.py
+.~lock.Document Parser Fields.xlsx#
+.DS_Store

+ 2 - 0
complaints/parse_pdf.py

@@ -14,6 +14,7 @@ from parse_pdf_utils import (
     extract_firm_name,
     extract_patent_number,
     extract_plaintiff,
+    extract_attorney_name,
 )
 
 
@@ -32,6 +33,7 @@ def extract_all(PATH):
         data_dict["defendent_list"].append(extract_defendent(content=parse_file))
         data_dict["acronyms_list"].append(extract_acronyms(content=parse_file))
         data_dict["firm_name_list"].append(extract_firm_name(content=parse_file))
+        data_dict["attorney"].append(extract_attorney_name(content=parse_file))
         data_dict["patent_list"].append(extract_patent_number(content=parse_file))
 
     data = pd.DataFrame(data_dict)

+ 17 - 13
complaints/parse_pdf_utils.py

@@ -35,17 +35,6 @@ def extract_hourly_compensation(content):
     return list(set(results))
 
 
-def extract_expert_name(content):
-    """
-    Returns the name of the expert
-    """
-    results = []
-    exp_name = re.compile(r"\b(REPORT OF ).*\S[.]")
-    for current in exp_name.finditer(content):
-        results.append(current.group().replace(",", ""))
-    return list(set(results))
-
-
 def extract_plaintiff(content):
     """
     Returns the name of the plaintiff
@@ -73,12 +62,15 @@ def extract_defendent(content):
 def extract_acronyms(content):
     """
     Returns the list of all the acronyms present
+    \(["|“](\w{1,10})
     """
-    regex = r"\(“(\w{3})”\)"
+    regex = r'\(["|“](\w{1,10})["|”]\)'
     results = []
     plaintiff = re.compile(regex, re.IGNORECASE)
     for current in plaintiff.finditer(content):
-        results.append(current.group().replace(",", ""))
+        results.append(current.groups()[0].replace(",", ""))
+    if len(results) == 0:
+        return "None"
     return list(set(results))
 
 
@@ -102,3 +94,15 @@ def extract_filing_date(content):
         return re.search(r"(Filed)\s(\d{2}\/\d{2}\/\d{2})", content).groups()[1]
     except:
         return "None"
+
+
+def extract_attorney_name(content):
+    """
+    Returns the name of the attorney/attorneys.
+    """
+    regex = r"\/s\/\s\w+\s\w\.\s\w+"
+    results = []
+    attorney = re.compile(regex, re.IGNORECASE)
+    for current in attorney.finditer(content):
+        results.append(current.group().replace(",", ""))
+    return list(set(results))