2 年前 · b87b2eb25d
--- a/expert_report/parse_expert_pdf.py
+++ b/expert_report/parse_expert_pdf.py
@@ -1,3 +1,5 @@
 
				+"""This module is used to parse PDFs of expert reports.
			
 
				+"""
			
 
				 from collections import defaultdict
			
 
				 import os
			
 
				 import tika
			
@@ -29,7 +31,7 @@ def main(path: str) -> DataFrame:
 
				         print(file)
			
 
				         content = parser.from_file(file)["content"]
			
 
				         data["file"].append(file)
			
 
				-        data["email"].append(content)
			
 
				+        data["email"].append(extract_email(content))
			
 
				         data["full_name"].append(expert_name_extraction(content))
			
 
				         data["hourly_pay"].append(hourly_compensation(content))
			
 
				         data["plaintiff"].append(plaintiff_extraction(content))
			
--- a/expert_report/parse_expert_pdf_utils.py
+++ b/expert_report/parse_expert_pdf_utils.py
@@ -1,3 +1,6 @@
 
				+"""This module contains the functions that parse elements from the expert PDFs.
			
 
				+"""
			
 
				+
			
 
				 import re
			
 
				 
			
 
				 
			
@@ -52,7 +55,7 @@ def case_number_extraction(content: str) -> str:
 
				     """
			
 
				     Extracts the case number from the documents.
			
 
				     """
			
 
				-    regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"  # Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})
			
 
				+    regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
			
 
				     results = set()
			
 
				     case_number = re.compile(regex, re.IGNORECASE)
			
 
				     for current in case_number.finditer(content):
			
@@ -133,15 +136,10 @@ def hourly_compensation(content: str) -> str:
 
				         return "None"
			
 
				 
			
 
				 
			
 
				-def ref_patents(content: str) -> str:
			
 
				-    return
			
 
				-
			
 
				-
			
 
				 def acronym_extraction(content: str) -> str:
			
 
				-    regex = r"\([A-Z]+\)"
			
 
				-    # results = []
			
 
				-    # acronym = re.compile(regex, re.IGNORECASE)
			
 
				-    # for current in acronym.finditer(content: str) -> str:
			
 
				-    #     results.append(current)
			
 
				-    acronym = re.findall(regex, content)
			
 
				-    return list(set(acronym))
			
 
				+    regex = r"\(\“([A-Z]{3,4})\”\)"
			
 
				+    results = set()
			
 
				+    acronym = re.compile(regex, re.IGNORECASE)
			
 
				+    for current in acronym.finditer(content):
			
 
				+        results.add(current.group(1))
			
 
				+    return list(results)