harsh
/
pdf_parser


			
							1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
							import re


def extract_filing_date(content):
    """
    Extract the first filing date found in a document's text.

    Two formats are recognised: a word (optionally followed by a period),
    a one- or two-digit day, a comma and a four-digit year
    (e.g. ``Jan. 5, 2020``), or a numeric ``NN/NN/NNNN`` date
    (e.g. ``01/05/2020``).

    Args:
        content: Text of the document to search.

    Returns:
        The first matching date substring, or the string ``"None"`` when no
        date is found or ``content`` is not a string.
    """
    pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
    # Non-text input (None, bytes, ...) cannot be searched with a str pattern;
    # report "no date" explicitly instead of relying on a catch-all except.
    if not isinstance(content, str):
        return "None"
    match = re.search(pattern, content)
    # Callers expect the literal string "None" (not the None object) on a miss.
    return match.group(0) if match else "None"


def email_extraction(content):
    """
    Extract every e-mail address found in a document's text.

    The local part may contain word characters, dots, plus signs and hyphens;
    the domain may have several dot-separated labels (letters, digits,
    hyphens) and must end in an alphabetic top-level domain of at least two
    letters. Matching is case-insensitive.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the matched addresses in order of appearance (empty when
        there are none).
    """
    # The local part must start with a word character so leading punctuation
    # from the surrounding sentence is not captured. The domain is one or more
    # "label." groups followed by the TLD, so sub-domains and TLDs longer than
    # three letters are kept whole while a sentence-ending period is left out.
    regex = r"\w[\w.+-]*@(?:[a-z0-9-]+\.)+[a-z]{2,}"
    emails = re.compile(regex, re.IGNORECASE)
    return [found.group() for found in emails.finditer(content)]


def telephone_number_extraction(content):
    """
    Extract telephone numbers from a document's text.

    A number is three digits (optionally wrapped in parentheses), a separator,
    three digits, a separator and four digits, where each separator is a
    hyphen or a single whitespace character, e.g. ``555-123-4567`` or
    ``(555) 123 4567``.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the matched numbers in order of appearance (empty when
        there are none).
    """
    # Inside a character class "|" is a literal pipe, not alternation, so the
    # separators are listed without it; otherwise "555|123|4567" would be
    # accepted as a phone number.
    regex = r"\(?\d{3}\)?[-\s]\d{3}[-\s]\d{4}"
    return [found.group() for found in re.finditer(regex, content)]


def address_extraction(content):
    """
    Extract the first three-line postal address block from a document's text.

    The block is three consecutive lines that each start with a word
    character, the last of which ends in a two-to-four character word, one
    whitespace character and five digits (e.g. ``IL 62704``).

    Args:
        content: Text of the document to search.

    Returns:
        The matched three-line substring (newlines included), or the string
        ``"None"`` when nothing matches or ``content`` is not a string.
    """
    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
    # Non-text input (None, bytes, ...) cannot be searched with a str pattern;
    # report "no address" explicitly instead of relying on a catch-all except.
    if not isinstance(content, str):
        return "None"
    match = re.search(regex_address, content)
    # Callers expect the literal string "None" (not the None object) on a miss.
    return match.group(0) if match else "None"