Browse Source

updated code

Omkar Desai 2 years ago
parent
commit
bfa8e7a141
2 changed files with 17 additions and 0 deletions
  1. 3 0
      expert_report/parse_expert_pdf.py
  2. 14 0
      expert_report/parse_expert_pdf_utils.py

+ 3 - 0
expert_report/parse_expert_pdf.py

@@ -9,6 +9,7 @@ from parse_expert_pdf_utils import (
     hourly_compensation,
     hourly_compensation,
     date_extraction,
     date_extraction,
     address_extraction,
     address_extraction,
+    refer_exteraction,
 )
 )
 import tika
 import tika
 import os
 import os
@@ -33,6 +34,8 @@ def main(path):
         print(date)
         print(date)
         address = address_extraction(content)
         address = address_extraction(content)
         print(address)
         print(address)
+        refer = refer_exteraction(content)
+        print(refer)
 
 
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":

+ 14 - 0
expert_report/parse_expert_pdf_utils.py

@@ -23,6 +23,20 @@ def address_extraction(content):
         return "None"
         return "None"
 
 
 
 
def refer_exteraction(content):
    """
    Extract the subject of the first "<word> refer(s) to ... as " phrase.

    The pattern looks for a single word followed by "refer to" or
    "refers to", then any text (on the same line) up to the next "as"
    followed by whitespace. Only the first occurrence in ``content`` is
    considered.

    Args:
        content: Text to search. Anything that is not a string (for
            example ``None``) is treated as "no match".

    Returns:
        The word immediately preceding "refer(s) to" for the first match,
        or the string "None" when nothing matches.
    """
    # `refers?` makes the trailing "s" optional so both "refer to" and
    # "refers to" are recognised (the earlier `refer?s` made the "r"
    # optional instead, which matched "refes").
    regex = r"(\w+)\srefers?\sto(.*?)as\s"
    # Other phrasings noted as candidates; not handled by this pattern yet
    # (presumably a TODO):
    # 1. by reference
    # 2. In re
    # 3. in quotes ""
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-string content (e.g. None) cannot be searched.
        return "None"
    if match is None:
        return "None"
    # Return the captured word so the caller can use/print it, instead of
    # only printing it here and implicitly returning None.
    return match.group(1)
+
+
 def case_number_extraction(content):
 def case_number_extraction(content):
     # dict_case_numbers = defaultdict(int)
     # dict_case_numbers = defaultdict(int)
     # case_number_info = re.findall("Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content)
     # case_number_info = re.findall("Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content)