2 年之前 · aaa838c553
--- a/expert_report/.vscode/settings.json
+++ b/expert_report/.vscode/settings.json
@@ -0,0 +1,3 @@
 
				+{
			
 
				+    "python.formatting.provider": "black"
			
 
				+}
			
--- a/expert_report/parse_expert_pdf.py
+++ b/expert_report/parse_expert_pdf.py
@@ -11,40 +11,32 @@ from parse_expert_pdf_utils import (
 
				     address_extraction,
			
 
				 )
			
 
				 import tika
			
 
				+import os
			
 
				 from tika import parser
			
 
				-import re
			
 
				+
			
 
				+tika.initVM()
			
 
				 import warnings
			
 
				-warnings.filterwarnings('ignore')
			
 
				 
			
 
				+warnings.filterwarnings("ignore")
			
 
				+
			
 
				+
			
 
				+def main(path):  # print the extracted date and address for every PDF found in directory `path`
			
 
				+    required_files = [file for file in os.listdir(path) if file.lower().endswith(".pdf")]
			
 
				+    for idx, file in enumerate(required_files):
			
 
				+        print(idx, file)
			
 
				+        parsed_pdf = parser.from_file(os.path.join(path, file))  # listdir yields bare names, not paths
			
 
				+        # NOTE(review): tika is expected to report content=None for PDFs with no extractable text
			
 
				 
			
 
				-def main():
			
 
				-    tika.initVM()
			
 
				-    path = "/home/ftech/Code/xc/pdf_parser/pdfs/2018008353_Mail_Decision.pdf" #2018008353_Mail_Decision.pdf"
			
 
				-    parsed_pdf = parser.from_file(path)
			
 
				-    print(parsed_pdf.keys())
			
 
				+        content = (parsed_pdf["content"] or "").strip().replace("\n", "")
			
 
				+        # newlines are dropped before the regex-based extractors run
			
 
				+        date = date_extraction(content)
			
 
				+        print(date)
			
 
				+        address = address_extraction(content)
			
 
				+        print(address)
			
 
				 
			
 
				-    content = parsed_pdf['content'].strip().replace('\n', '')
			
 
				-    # print(content)
			
 
				-    # case_number = case_number_extraction(content)
			
 
				-    # print(case_number)
			
 
				-    # expert_name = expert_name_extraction(content)
			
 
				-    # print(expert_name)
			
 
				-    # plaintiff = plaintiff_extraction(content)
			
 
				-    # print(plaintiff)
			
 
				-    # defendent = defendent_extraction(content)
			
 
				-    # print(defendent)
			
 
				-    # patent = patent_extraction(content)
			
 
				-    # print(patent)
			
 
				-    # on_behalf_of = on_behalf_of_extraction(content)
			
 
				-    # print(on_behalf_of)
			
 
				-    # acronym = acronym_extraction(content)
			
 
				-    # print(acronym)
			
 
				-    # pay = hourly_compensation(content)
			
 
				-    # print(pay)
			
 
				-    date = date_extraction(content)
			
 
				-    print(date)
			
 
				-    address = address_extraction(content)
			
 
				-    print(address)
			
 
				 
			
 
				 if __name__ == "__main__":
			
 
				-    main()
			
 
				+    HOME_DIR = os.path.expanduser("~")
			
 
				+    BASE_DIR = "Code/pdf_parser/pdfs/"  # must stay relative: os.path.join discards HOME_DIR if this is absolute
			
 
				+    path = os.path.join(HOME_DIR, BASE_DIR)
			
 
				+    main(path)
			
--- a/expert_report/parse_expert_pdf_utils.py
+++ b/expert_report/parse_expert_pdf_utils.py
@@ -1,19 +1,27 @@
 
				-from collections import defaultdict
			
 
				 import re
			
 
				-from distutils.filelist import findall
			
 
				 
			
 
				 
			
 
				 def date_extraction(content):
			
 
				-    regex = r"((\d{2}\/\d{3}\,\d{3}\s)\d{1,2}\/\d{2}\/\d{4}|(Entered\:\s|:Filing\sDate\:\s|Date\:\s)[A-Z]\w+\s\d{1,2}\,\s\d{4})"
			
 
				-    date = re.search(regex, content).groups()[0]
			
 
				-    return date.strip()
			
 
				+    """
			
 
				+    Return the first labelled date phrase (label included) found in content, or the string "None" if there is none.
			
 
				+    """
			
 
				+    pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{2,4})"  # year is 2-4 digits in every numeric form
			
 
				+    try:
			
 
				+        return re.search(pattern, content).groups()[0]
			
 
				+    except (AttributeError, TypeError):  # no match (search returned None) or content is not a string
			
 
				+        return "None"
			
 
				+
			
 
				 
			
 
				 def address_extraction(content):
			
 
				-    regex = r"Address\:([\s\S].*)\s\d{6}\-\d{4}\swww" 
			
 
				-    data = re.search(regex, content).groups()[0]
			
 
				-    if data == None:
			
 
				-        print("N")
			
 
				-    return data.strip()
			
 
				+    """
			
 
				+    Return the text between "Address: " and the last "www" in content, or the string "None" if there is no match.
			
 
				+    """
			
 
				+    regex = r"Address\:\s([\s\S].*)www"
			
 
				+    try:
			
 
				+        return re.search(regex, content).groups()[0]  # return the capture; printing it left callers with None
			
 
				+    except (AttributeError, TypeError):  # no match (search returned None) or content is not a string
			
 
				+        return "None"
			
 
				+
			
 
				 
			
 
				 def case_number_extraction(content):
			
 
				     # dict_case_numbers = defaultdict(int)