há 2 anos atrás · 1d4fd55ddc
--- a/.gitignore
+++ b/.gitignore
@@ -66,5 +66,4 @@ complaints/test.py
 
				 foo.py
			
 
				 .~lock.Document Parser Fields.xlsx#
			
 
				 .DS_Store
			
 
				-#pdf
			
 
				-pdfs/
			
 
				+
			
--- a/expert_report/parse_expert_pdf.py
+++ b/expert_report/parse_expert_pdf.py
@@ -1,12 +1,4 @@
 
				 from parse_expert_pdf_utils import (
			
 
				-    case_number_extraction,
			
 
				-    expert_name_extraction,
			
 
				-    plaintiff_extraction,
			
 
				-    defendent_extraction,
			
 
				-    patent_extraction,
			
 
				-    on_behalf_of_extraction,
			
 
				-    acronym_extraction,
			
 
				-    hourly_compensation,
			
 
				     date_extraction,
			
 
				     address_extraction,
			
 
				     refer_exteraction,
			
@@ -25,10 +17,10 @@ def main(path):
 
				     required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
			
 
				     for idx, file in enumerate(required_files):
			
 
				         print(idx, file)
			
 
				-        parsed_pdf = parser.from_file(file)
			
 
				-        # print(parsed_pdf.keys())
			
 
				+        print(file)
			
 
				+        content = parser.from_file(file)["content"].strip().replace("\n", "")
			
 
				 
			
 
				-        content = parsed_pdf["content"].strip().replace("\n", "")
			
 
				+        # content = parsed_pdf["content"].strip().replace("\n", "")
			
 
				         # print(content)
			
 
				         date = date_extraction(content)
			
 
				         print(date)
			
@@ -40,6 +32,6 @@ def main(path):
 
				 
			
 
				 if __name__ == "__main__":
			
 
				     HOME_DIR = os.path.expanduser("~")
			
 
				-    BASE_DIR = "/home/ftech/Code/pdf_parser/pdfs/"
			
 
				+    BASE_DIR = "Code/pdf_parser/pdfs"
			
 
				     path = os.path.join(HOME_DIR, BASE_DIR)
			
 
				     main(path)
			
--- a/pdfs/.vscode/settings.json
+++ b/pdfs/.vscode/settings.json
@@ -0,0 +1,3 @@
 
				+{
			
 
				+    "python.formatting.provider": "black"
			
 
				+}
			
--- a/pdfs/pdf_parser.py
+++ b/pdfs/pdf_parser.py
@@ -0,0 +1,35 @@
 
				+import os
			
 
				+from utils import (
			
 
				+    extract_filing_date,
			
 
				+    address_extraction,
			
 
				+    refer_exteraction,
			
 
				+    email_extraction,
			
 
				+    telephone_number_extraction,
			
 
				+)
			
 
				+
			
 
				+import tika
			
 
				+from tika import parser
			
 
				+
			
 
				+tika.initVM()
			
 
				+
			
 
				+
			
 
				+def main(PATH):
			
 
				+    """
			
 
				+    parses the required data from the pdfs
			
 
				+    """
			
 
				+    required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
			
 
				+    for idx, file in enumerate(required_files):
			
 
				+        print(idx, file)
			
 
				+        content = parser.from_file(file)["content"].strip().replace("\n", "")
			
 
				+        print(extract_filing_date(content))
			
 
				+        print(address_extraction(content))
			
 
				+        print(refer_exteraction(content))
			
 
				+        print(email_extraction(content))
			
 
				+        print((telephone_number_extraction(content)))
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    HOME_DIR = os.path.expanduser("~")
			
 
				+    BASE_DIR = "Code/pdf_parser/pdfs"
			
 
				+    PATH = os.path.join(HOME_DIR, BASE_DIR)
			
 
				+    main(PATH)
			
--- a/pdfs/utils.py
+++ b/pdfs/utils.py
@@ -0,0 +1,52 @@
 
				+import re
			
 
				+
			
 
				+
			
 
				+def extract_filing_date(content):
			
 
				+    """
			
 
				+    extracts filing date from the documents.
			
 
				+    """
			
 
				+    pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})"  # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})"
			
 
				+    try:
			
 
				+        return re.search(pattern, content).groups()[0]
			
 
				+    except:
			
 
				+        return "None"
			
 
				+
			
 
				+
			
 
				+def address_extraction(content):
			
 
				+    """
			
 
				+    extracts address from the documents.
			
 
				+    """
			
 
				+    regex = r"OfficeAddress\:\s([\s\S].*)www"
			
 
				+    try:
			
 
				+        print(re.search(regex, content).groups()[0])
			
 
				+    except:
			
 
				+        return "None"
			
 
				+
			
 
				+
			
 
				+def refer_exteraction(content):
			
 
				+    """
			
 
				+    extract referals from the documents.
			
 
				+    """
			
 
				+    regex = r"by\sreference.+?\d{1,2}\,\d{3}\,\d{3}|In\sre.+?\)"
			
 
				+    # 1. by reference
			
 
				+    # 2. In re
			
 
				+    # 3. in qoutes ""
			
 
				+    try:
			
 
				+        data1 = re.findall(regex, content)
			
 
				+        print(list(set(data1)))
			
 
				+    except:
			
 
				+        return "None"
			
 
				+
			
 
				+
			
 
				+def email_extraction(content):
			
 
				+    regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
			
 
				+    expert_name = re.compile(regex, re.IGNORECASE)
			
 
				+    for current in expert_name.finditer(content):
			
 
				+        print(current.group())
			
 
				+
			
 
				+
			
 
				+def telephone_number_extraction(content):
			
 
				+    regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
			
 
				+    expert_name = re.compile(regex, re.IGNORECASE)
			
 
				+    for current in expert_name.finditer(content):
			
 
				+        print(current.group())