浏览代码

updated code

Omkar Desai 2 年之前
父节点
当前提交
bfa8e7a141
共有 2 个文件被更改,包括 17 次插入和 0 次删除
  1. 3 0
      expert_report/parse_expert_pdf.py
  2. 14 0
      expert_report/parse_expert_pdf_utils.py

+ 3 - 0
expert_report/parse_expert_pdf.py

@@ -9,6 +9,7 @@ from parse_expert_pdf_utils import (
     hourly_compensation,
     date_extraction,
     address_extraction,
+    refer_exteraction,
 )
 import tika
 import os
@@ -33,6 +34,8 @@ def main(path):
         print(date)
         address = address_extraction(content)
         print(address)
+        refer = refer_exteraction(content)
+        print(refer)
 
 
 if __name__ == "__main__":

+ 14 - 0
expert_report/parse_expert_pdf_utils.py

@@ -23,6 +23,20 @@ def address_extraction(content):
         return "None"
 
 
def refer_exteraction(content):
    """
    Extract the subject of the first "<word> refer(s) to ... as" phrase.

    The text is searched for a single word followed by "refer to" or
    "refers to", some text, and then "as" followed by whitespace. Only the
    first occurrence is considered.

    Args:
        content: Text to search. Anything that is not a ``str`` is treated
            as "nothing found" rather than raising.

    Returns:
        The word immediately preceding "refer(s) to" for the first match,
        or the string ``"None"`` when there is no match.
    """
    # `refers?` makes the trailing "s" optional so both "refer to" and
    # "refers to" match (the earlier `refer?s` made the "r" optional instead).
    regex = r"(\w+)\srefers?\sto(.*?)as\s"
    # Other phrasings noted as candidates but not handled yet:
    # 1. by reference
    # 2. In re
    # 3. in quotes ""
    if not isinstance(content, str):
        return "None"

    match = re.search(regex, content)
    if match is None:
        return "None"
    # Return the captured word so the caller can use it; previously the
    # value was only printed and the function returned None on success.
    return match.group(1)
+
+
 def case_number_extraction(content):
     # dict_case_numbers = defaultdict(int)
     # case_number_info = re.findall("Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content)