Преглед изворни кода

using .py files from pdfs folder

Omkar Desai пре 2 година
родитељ
комит
1d4fd55ddc
5 измењених фајлова са 95 додато и 14 уклоњено
  1. 1 2
      .gitignore
  2. 4 12
      expert_report/parse_expert_pdf.py
  3. 3 0
      pdfs/.vscode/settings.json
  4. 35 0
      pdfs/pdf_parser.py
  5. 52 0
      pdfs/utils.py

+ 1 - 2
.gitignore

@@ -66,5 +66,4 @@ complaints/test.py
 foo.py
 .~lock.Document Parser Fields.xlsx#
 .DS_Store
-#pdf
-pdfs/
+

+ 4 - 12
expert_report/parse_expert_pdf.py

@@ -1,12 +1,4 @@
 from parse_expert_pdf_utils import (
-    case_number_extraction,
-    expert_name_extraction,
-    plaintiff_extraction,
-    defendent_extraction,
-    patent_extraction,
-    on_behalf_of_extraction,
-    acronym_extraction,
-    hourly_compensation,
     date_extraction,
     address_extraction,
     refer_exteraction,
@@ -25,10 +17,10 @@ def main(path):
     required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
     for idx, file in enumerate(required_files):
         print(idx, file)
-        parsed_pdf = parser.from_file(file)
-        # print(parsed_pdf.keys())
+        print(file)
+        content = parser.from_file(file)["content"].strip().replace("\n", "")
 
-        content = parsed_pdf["content"].strip().replace("\n", "")
+        # content = parsed_pdf["content"].strip().replace("\n", "")
         # print(content)
         date = date_extraction(content)
         print(date)
@@ -40,6 +32,6 @@ def main(path):
 
 if __name__ == "__main__":
     HOME_DIR = os.path.expanduser("~")
-    BASE_DIR = "/home/ftech/Code/pdf_parser/pdfs/"
+    BASE_DIR = "Code/pdf_parser/pdfs"
     path = os.path.join(HOME_DIR, BASE_DIR)
     main(path)

+ 3 - 0
pdfs/.vscode/settings.json

@@ -0,0 +1,3 @@
+{
+    "python.formatting.provider": "black"
+}

+ 35 - 0
pdfs/pdf_parser.py

@@ -0,0 +1,35 @@
+import os
+from utils import (
+    extract_filing_date,
+    address_extraction,
+    refer_exteraction,
+    email_extraction,
+    telephone_number_extraction,
+)
+
+import tika
+from tika import parser
+
+tika.initVM()
+
+
+def main(PATH):
+    """
+    parses the required data from the pdfs
+    """
+    required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
+    for idx, file in enumerate(required_files):
+        print(idx, file)
+        content = parser.from_file(file)["content"].strip().replace("\n", "")
+        print(extract_filing_date(content))
+        print(address_extraction(content))
+        print(refer_exteraction(content))
+        print(email_extraction(content))
+        print((telephone_number_extraction(content)))
+
+
+if __name__ == "__main__":
+    HOME_DIR = os.path.expanduser("~")
+    BASE_DIR = "Code/pdf_parser/pdfs"
+    PATH = os.path.join(HOME_DIR, BASE_DIR)
+    main(PATH)

+ 52 - 0
pdfs/utils.py

@@ -0,0 +1,52 @@
+import re
+
+
+def extract_filing_date(content):
+    """
+    extracts filing date from the documents.
+    """
+    pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})"  # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})"
+    try:
+        return re.search(pattern, content).groups()[0]
+    except:
+        return "None"
+
+
+def address_extraction(content):
+    """
+    extracts address from the documents.
+    """
+    regex = r"OfficeAddress\:\s([\s\S].*)www"
+    try:
+        print(re.search(regex, content).groups()[0])
+    except:
+        return "None"
+
+
+def refer_exteraction(content):
+    """
+    extracts referrals from the documents.
+    """
+    regex = r"by\sreference.+?\d{1,2}\,\d{3}\,\d{3}|In\sre.+?\)"
+    # 1. by reference
+    # 2. In re
+    # 3. in quotes ""
+    try:
+        data1 = re.findall(regex, content)
+        print(list(set(data1)))
+    except:
+        return "None"
+
+
+def email_extraction(content):
+    regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
+    expert_name = re.compile(regex, re.IGNORECASE)
+    for current in expert_name.finditer(content):
+        print(current.group())
+
+
+def telephone_number_extraction(content):
+    regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
+    expert_name = re.compile(regex, re.IGNORECASE)
+    for current in expert_name.finditer(content):
+        print(current.group())