Explorar el Código

renamed folder and added a parser for decision_affirmed documents

Harsh Parikh hace 2 años
padre
commit
3ea4b03c2d

+ 40 - 0
server_documents/decision_affirmed/main.py

@@ -0,0 +1,40 @@
+""" Script to parse documents from Decision_Affirmed
+"""
+import os
+from collections import defaultdict
+import pandas as pd
+import tika
+from tika import parser
+from parse_pdf_utils import (
+    extract_filing_date,
+    address_extraction,
+    email_extraction,
+    telephone_number_extraction,
+)
+
+# Module-level side effect: runs once when this script is imported or executed.
+# NOTE(review): initVM() looks like a backward-compatibility hook in
+# tika-python -- confirm whether this call is still required.
+tika.initVM()
+
+
+def main(PATH):
+    """
+    Parse every PDF in ``PATH`` and write the extracted fields to a CSV file.
+
+    For each PDF the text content is read with ``parser.from_file`` and handed
+    to the extraction helpers. One row per document is collected (document
+    name, filing date, address, emails, telephone numbers) and the resulting
+    table is written to ``required_data.csv`` in the current working directory.
+
+    Args:
+        PATH: directory that contains the PDF documents.
+    """
+    data_dict = defaultdict(list)
+    # Match on the extension only, so names that merely contain ".pdf"
+    # (e.g. "notes.pdf.bak") are skipped; sort for a reproducible row order.
+    required_files = sorted(
+        name for name in os.listdir(PATH) if name.lower().endswith(".pdf")
+    )
+    for name in required_files:
+        # os.listdir returns bare names, so resolve them against PATH;
+        # otherwise the lookup only works when PATH is the working directory.
+        parsed = parser.from_file(os.path.join(PATH, name))
+        # The content entry can be None (no extractable text); fall back to an
+        # empty string so every extraction helper receives a str.
+        content = parsed["content"] or ""
+        data_dict["document_name"].append(name)
+        data_dict["filing_date"].append(extract_filing_date(content))
+        data_dict["address"].append(address_extraction(content))
+        data_dict["email"].append(email_extraction(content))
+        data_dict["telephone_number"].append(telephone_number_extraction(content))
+
+    data = pd.DataFrame(data_dict)
+    data.to_csv("required_data.csv")
+
+
+if __name__ == "__main__":
+    # The documents are expected under the current user's home directory.
+    BASE_DIR = "Code/pdf_parser/server_documents/decision_affirmed/"
+    HOME_DIR = os.path.expanduser("~")
+    main(os.path.join(HOME_DIR, BASE_DIR))

+ 47 - 0
server_documents/decision_affirmed/parse_pdf_utils.py

@@ -0,0 +1,47 @@
+import re
+
+
+def extract_filing_date(content):
+    """
+    Extract the first filing date found in a document.
+
+    Two formats are recognised: a word followed by a day and a four-digit
+    year (e.g. "Jan. 5, 2020"), and a numeric "dd/dd/dddd" date.
+
+    Args:
+        content: text of the document; non-string values (such as None)
+            are treated as containing no date.
+
+    Returns:
+        The matched date text, or the string "None" when nothing matches.
+    """
+    pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
+    # Check the input and the match explicitly instead of using a bare
+    # ``except``, which would also hide unrelated errors and KeyboardInterrupt.
+    if not isinstance(content, str):
+        return "None"
+    match = re.search(pattern, content)
+    return match.group(0) if match else "None"
+
+
+def email_extraction(content):
+    """
+    Extract all email addresses from a document.
+
+    The pattern accepts dots, plus signs and hyphens in the local part,
+    any number of sub-domains, and top-level domains of two or more letters,
+    so addresses such as "john.doe@mail.example.info" are returned whole
+    instead of being truncated.
+
+    Args:
+        content: text of the document; non-string values (such as None)
+            yield an empty result.
+
+    Returns:
+        A list of the matched addresses in order of appearance (possibly empty).
+    """
+    if not isinstance(content, str):
+        return []
+    # The final label is letters only, so a sentence-ending "." after the
+    # address is never swallowed into the match.
+    regex = r"\w[\w.+-]*@[\w-]+(?:\.[\w-]+)*\.[a-z]{2,}"
+    emails = re.compile(regex, re.IGNORECASE)
+    return [email.group() for email in emails.finditer(content)]
+
+
+def telephone_number_extraction(content):
+    """
+    Extract all telephone numbers from a document.
+
+    Recognises ten-digit numbers grouped 3-3-4 whose groups are separated by
+    a hyphen, a dot or whitespace, with optional parentheses around the
+    first group (e.g. "(123) 456-7890", "(123)456-7890", "123.456.7890").
+
+    Args:
+        content: text of the document; non-string values (such as None)
+            yield an empty result.
+
+    Returns:
+        A list of the matched numbers in order of appearance (possibly empty).
+    """
+    if not isinstance(content, str):
+        return []
+    # Inside a character class "|" is a literal pipe, not alternation, so it
+    # is left out of the separator sets. The digit look-arounds stop the
+    # pattern from matching a slice of a longer run of digits.
+    regex = r"(?<!\d)\(?\d{3}(?:\)[-.\s]?|[-.\s])\d{3}[-.\s]\d{4}(?!\d)"
+    numbers = re.compile(regex)
+    return [number.group() for number in numbers.finditer(content)]
+
+
+def address_extraction(content):
+    """
+    Extract the first address-like block from a document.
+
+    The pattern looks for three consecutive lines that each start with a
+    word character, where the third line ends in a two-to-four character
+    word followed by whitespace and five digits (e.g. "IL 62704").
+
+    Args:
+        content: text of the document; non-string values (such as None)
+            are treated as containing no address.
+
+    Returns:
+        The matched three-line text, or the string "None" when nothing matches.
+    """
+    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
+    # Check the input and the match explicitly instead of using a bare
+    # ``except``, which would also hide unrelated errors and KeyboardInterrupt.
+    if not isinstance(content, str):
+        return "None"
+    match = re.search(regex_address, content)
+    return match.group(0) if match else "None"