2 years ago · 1d4fd55ddc
--- a/.gitignore
+++ b/.gitignore
@@ -66,5 +66,4 @@ complaints/test.py
 
															 foo.py
														
 
															 .~lock.Document Parser Fields.xlsx#
														
 
															 .DS_Store
														
 
															-#pdf
														
 
															-pdfs/
														
 
															+
														
--- a/expert_report/parse_expert_pdf.py
+++ b/expert_report/parse_expert_pdf.py
@@ -1,12 +1,4 @@
 
															 from parse_expert_pdf_utils import (
														
 
															-    case_number_extraction,
														
 
															-    expert_name_extraction,
														
 
															-    plaintiff_extraction,
														
 
															-    defendent_extraction,
														
 
															-    patent_extraction,
														
 
															-    on_behalf_of_extraction,
														
 
															-    acronym_extraction,
														
 
															-    hourly_compensation,
														
 
															     date_extraction,
														
 
															     address_extraction,
														
 
															     refer_exteraction,
														
@@ -25,10 +17,10 @@ def main(path):
 
															     required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
														
 
															     for idx, file in enumerate(required_files):
														
 
															         print(idx, file)
														
 
															-        parsed_pdf = parser.from_file(file)
														
 
															-        # print(parsed_pdf.keys())
														
 
															+        print(file)
														
 
															+        content = parser.from_file(file)["content"].strip().replace("\n", "")
														
 
															-        content = parsed_pdf["content"].strip().replace("\n", "")
														
 
															+        # content = parsed_pdf["content"].strip().replace("\n", "")
														
 
															         # print(content)
														
 
															         date = date_extraction(content)
														
 
															         print(date)
														
@@ -40,6 +32,6 @@ def main(path):
 
															 if __name__ == "__main__":
														
 
															     HOME_DIR = os.path.expanduser("~")
														
 
															-    BASE_DIR = "/home/ftech/Code/pdf_parser/pdfs/"
														
 
															+    BASE_DIR = "Code/pdf_parser/pdfs"
														
 
															     path = os.path.join(HOME_DIR, BASE_DIR)
														
 
															     main(path)
														
--- a/pdfs/.vscode/settings.json
+++ b/pdfs/.vscode/settings.json
@@ -0,0 +1,3 @@
 
															+{
														
 
															+    "python.formatting.provider": "black"
														
 
															+}
														
--- a/pdfs/pdf_parser.py
+++ b/pdfs/pdf_parser.py
@@ -0,0 +1,35 @@
 
															+import os
														
 
															+from utils import (
														
 
															+    extract_filing_date,
														
 
															+    address_extraction,
														
 
															+    refer_exteraction,
														
 
															+    email_extraction,
														
 
															+    telephone_number_extraction,
														
 
															+)
														
 
															+
														
 
															+import tika
														
 
															+from tika import parser
														
 
															+
														
 
															+tika.initVM()
														
 
															+
														
 
															+
														
 
															+def main(PATH):
														
 
															+    """
														
 
															+    parses the required data from the pdfs
														
 
															+    """
														
 
															+    required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
														
 
															+    for idx, file in enumerate(required_files):
														
 
															+        print(idx, file)
														
 
															+        content = parser.from_file(file)["content"].strip().replace("\n", "")
														
 
															+        print(extract_filing_date(content))
														
 
															+        print(address_extraction(content))
														
 
															+        print(refer_exteraction(content))
														
 
															+        print(email_extraction(content))
														
 
															+        print((telephone_number_extraction(content)))
														
 
															+
														
 
															+
														
 
															+if __name__ == "__main__":
														
 
															+    HOME_DIR = os.path.expanduser("~")
														
 
															+    BASE_DIR = "Code/pdf_parser/pdfs"
														
 
															+    PATH = os.path.join(HOME_DIR, BASE_DIR)
														
 
															+    main(PATH)
														
--- a/pdfs/utils.py
+++ b/pdfs/utils.py
@@ -0,0 +1,52 @@
 
															+import re
														
 
															+
														
 
															+
														
 
															+def extract_filing_date(content):
														
 
															+    """
														
 
															+    extracts filing date from the documents.
														
 
															+    """
														
 
															+    pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})"  # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})"
														
 
															+    try:
														
 
															+        return re.search(pattern, content).groups()[0]
														
 
															+    except:
														
 
															+        return "None"
														
 
															+
														
 
															+
														
 
															+def address_extraction(content):
														
 
															+    """
														
 
															+    extracts address from the documents.
														
 
															+    """
														
 
															+    regex = r"OfficeAddress\:\s([\s\S].*)www"
														
 
															+    try:
														
 
															+        print(re.search(regex, content).groups()[0])
														
 
															+    except:
														
 
															+        return "None"
														
 
															+
														
 
															+
														
 
															+def refer_exteraction(content):
														
 
															+    """
														
 
															+    extract referals from the documents.
														
 
															+    """
														
 
															+    regex = r"by\sreference.+?\d{1,2}\,\d{3}\,\d{3}|In\sre.+?\)"
														
 
															+    # 1. by reference
														
 
															+    # 2. In re
														
 
															+    # 3. in qoutes ""
														
 
															+    try:
														
 
															+        data1 = re.findall(regex, content)
														
 
															+        print(list(set(data1)))
														
 
															+    except:
														
 
															+        return "None"
														
 
															+
														
 
															+
														
 
															+def email_extraction(content):
														
 
															+    regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
														
 
															+    expert_name = re.compile(regex, re.IGNORECASE)
														
 
															+    for current in expert_name.finditer(content):
														
 
															+        print(current.group())
														
 
															+
														
 
															+
														
 
															+def telephone_number_extraction(content):
														
 
															+    regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
														
 
															+    expert_name = re.compile(regex, re.IGNORECASE)
														
 
															+    for current in expert_name.finditer(content):
														
 
															+        print(current.group())