Parcourir la source

added a docstring and changed the acronym code

Harsh Parikh il y a 2 ans
Parent
commit
b87b2eb25d
2 fichiers modifiés avec 13 ajouts et 13 suppressions
  1. 3 1
      expert_report/parse_expert_pdf.py
  2. 10 12
      expert_report/parse_expert_pdf_utils.py

+ 3 - 1
expert_report/parse_expert_pdf.py

@@ -1,3 +1,5 @@
+"""This module is used to parse PDF for expert reports.
+"""
 from collections import defaultdict
 import os
 import tika
@@ -29,7 +31,7 @@ def main(path: str) -> DataFrame:
         print(file)
         content = parser.from_file(file)["content"]
         data["file"].append(file)
-        data["email"].append(content)
+        data["email"].append(extract_email(content))
         data["full_name"].append(expert_name_extraction(content))
         data["hourly_pay"].append(hourly_compensation(content))
         data["plaintiff"].append(plaintiff_extraction(content))

+ 10 - 12
expert_report/parse_expert_pdf_utils.py

@@ -1,3 +1,6 @@
+"""This module contains the functions to parse elements from the expert PDFs.
+"""
+
 import re
 
 
@@ -52,7 +55,7 @@ def case_number_extraction(content: str) -> str:
     """
     Extracts the case number from the documents.
     """
-    regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"  # Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})
+    regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
     results = set()
     case_number = re.compile(regex, re.IGNORECASE)
     for current in case_number.finditer(content):
@@ -133,15 +136,10 @@ def hourly_compensation(content: str) -> str:
         return "None"
 
 
-def ref_patents(content: str) -> str:
-    return
-
-
 def acronym_extraction(content: str) -> str:
-    regex = r"\([A-Z]+\)"
-    # results = []
-    # acronym = re.compile(regex, re.IGNORECASE)
-    # for current in acronym.finditer(content: str) -> str:
-    #     results.append(current)
-    acronym = re.findall(regex, content)
-    return list(set(acronym))
+    regex = r"\(\“([A-Z]{3,4})\”\)"
+    results = set()
+    acronym = re.compile(regex, re.IGNORECASE)
+    for current in acronym.finditer(content):
+        results.add(current.group(1))
+    return list(results)