Преглед на файлове

Merge branch 'master' of git.fafadiatech.com:harsh/pdf_parser

Harsh Parikh преди 2 години
родител
ревизия
4caa9d66a4
променени са 2 файла, в които са добавени 49 реда и са изтрити 26 реда
  1. 34 25
      expert_report/parse_expert_pdf.py
  2. 15 1
      expert_report/parse_expert_pdf_utils.py

+ 34 - 25
expert_report/parse_expert_pdf.py

@@ -6,36 +6,45 @@ from parse_expert_pdf_utils import (
     patent_extraction,
     on_behalf_of_extraction,
     acronym_extraction,
+    hourly_compensation,
+    date_extraction,
+    address_extraction,
 )
 import tika
 from tika import parser
+import re
+import warnings
+warnings.filterwarnings('ignore')
 
-tika.initVM()
-import os
 
+def main():
+    tika.initVM()
+    path = "/home/ftech/Code/xc/pdf_parser/pdfs/2018008353_Mail_Decision.pdf" #2018008353_Mail_Decision.pdf"
+    parsed_pdf = parser.from_file(path)
+    print(parsed_pdf.keys())
 
-def main(PATH):
-    required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
-    for file in required_files:
-        content = parser.from_file(file)["content"].strip().replace("\n", "")
-        # case_number = case_number_extraction(content)
-        # print(case_number)
-        # expert_name = expert_name_extraction(content)
-        # print(expert_name)
-        # plaintiff = plaintiff_extraction(content)
-        # print(plaintiff)
-        # defendent = defendent_extraction(content)
-        # print(defendent)
-        # patent = patent_extraction(content)
-        # print(patent)
-        # on_behalf_of = on_behalf_of_extraction(content)
-        # print(on_behalf_of)
-        acronym = acronym_extraction(content)
-        print(acronym)
-
+    content = parsed_pdf['content'].strip().replace('\n', '')
+    # print(content)
+    # case_number = case_number_extraction(content)
+    # print(case_number)
+    # expert_name = expert_name_extraction(content)
+    # print(expert_name)
+    # plaintiff = plaintiff_extraction(content)
+    # print(plaintiff)
+    # defendent = defendent_extraction(content)
+    # print(defendent)
+    # patent = patent_extraction(content)
+    # print(patent)
+    # on_behalf_of = on_behalf_of_extraction(content)
+    # print(on_behalf_of)
+    # acronym = acronym_extraction(content)
+    # print(acronym)
+    # pay = hourly_compensation(content)
+    # print(pay)
+    date = date_extraction(content)
+    print(date)
+    address = address_extraction(content)
+    print(address)
 
 if __name__ == "__main__":
-    HOME_DIR = os.path.expanduser("~")
-    BASE_DIR = "Code/pdf_parser/expert_report"
-    PATH = os.path.join(HOME_DIR, BASE_DIR)
-    main(PATH)
+    main()

+ 15 - 1
expert_report/parse_expert_pdf_utils.py

@@ -3,6 +3,18 @@ import re
 from distutils.filelist import findall
 
 
+def date_extraction(content):
+    """Return the first date-bearing phrase found in ``content``.
+
+    Two layouts are recognised by the pattern below:
+      * ``NN/NNN,NNN `` followed by a numeric ``M/DD/YYYY`` date
+      * one of the labels ``Entered: ``, ``:Filing Date: `` or ``Date: ``
+        followed by a capitalised word, a day number and a four-digit year
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        The whole matched phrase (prefix or label included) with surrounding
+        whitespace removed, or ``None`` when neither layout occurs.
+    """
+    regex = r"((\d{2}\/\d{3}\,\d{3}\s)\d{1,2}\/\d{2}\/\d{4}|(Entered\:\s|:Filing\sDate\:\s|Date\:\s)[A-Z]\w+\s\d{1,2}\,\s\d{4})"
+    match = re.search(regex, content)
+    # re.search returns None when nothing matches; calling .groups() on it
+    # would raise AttributeError, so report "not found" explicitly instead.
+    if match is None:
+        return None
+    # Group 1 is the outermost group, i.e. the complete matched phrase.
+    return match.group(1).strip()
+
+def address_extraction(content):
+    """Return the text between ``Address:`` and a trailing ``NNNNNN-NNNN www``.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        The captured text with surrounding whitespace removed, or ``None``
+        (after printing ``N``) when the pattern does not occur.
+    """
+    regex = r"Address\:([\s\S].*)\s\d{6}\-\d{4}\swww"
+    match = re.search(regex, content)
+    # The not-found check has to be made on the match object itself: group 1
+    # always participates in a successful match, so testing the group for
+    # None can never fire, and a failed search has no .groups() to call.
+    if match is None:
+        print("N")
+        return None
+    return match.group(1).strip()
+
 def case_number_extraction(content):
     # dict_case_numbers = defaultdict(int)
     # case_number_info = re.findall("Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content)
@@ -75,7 +87,9 @@ def on_behalf_of_extraction(content):
 
 
 def hourly_compensation(content):
-    return
+    """Return every ``$ <digits>`` amount found in ``content``.
+
+    The pattern requires exactly one whitespace character between the dollar
+    sign and the digits, so ``$ 450`` matches and ``$450`` does not.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        A list of the matched substrings in order of appearance; empty when
+        there are none.
+    """
+    # Raw string so that \$, \s and \d reach the regex engine untouched; in a
+    # plain string literal they are invalid escape sequences and trigger a
+    # warning at compile time.
+    regex = r"\$\s\d+"
+    return re.findall(regex, content)
 
 
 def ref_patents(content):