1234567891011121314151617181920212223242526272829303132333435363738394041424344454647 |
- import re
def extract_filing_date(content):
    """Return the first date-like string found in *content*.

    Two textual shapes are recognised:

    * a word, an optional period, a one- or two-digit day, a comma and a
      four-digit year, e.g. ``Jan. 5, 2020`` or ``January 15, 2019``
      (the leading word is not validated as a month name);
    * a numeric date written as ``NN/NN/NNNN``, e.g. ``03/14/2021``.

    Args:
        content: Text of the document to scan.

    Returns:
        The matched text, or the literal string ``"None"`` (not the
        ``None`` object) when no date is found or *content* is not text.
    """
    pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
    try:
        match = re.search(pattern, content)
    except TypeError:
        # Non-text input (e.g. None or bytes) is treated as "no date found",
        # keeping the best-effort contract without a catch-all handler.
        return "None"
    return match.group(0) if match else "None"
def email_extraction(content):
    """Return every e-mail address found in *content*, in document order.

    The local part may contain word characters, dots, plus signs and
    hyphens; the domain may have several dot-separated labels made of
    letters, digits and hyphens, and must end in an alphabetic top-level
    label of at least two letters. Matching is case-insensitive.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of the matched addresses as they appear in the text; an
        empty list when there are none.
    """
    # The local part must start with a word character so that leading
    # punctuation is not absorbed. The final label is matched separately so
    # that a sentence-ending period after the address is left out.
    pattern = re.compile(
        r"\w[\w.+-]*@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,}",
        re.IGNORECASE,
    )
    return [match.group() for match in pattern.finditer(content)]
def telephone_number_extraction(content):
    """Return every ten-digit telephone number found in *content*.

    A number is three digits (optionally wrapped in parentheses), a
    separator, three digits, a separator and four digits, where each
    separator is a single hyphen or whitespace character, e.g.
    ``(555) 123-4567`` or ``555-987-6543``.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of the matched numbers as they appear in the text, in
        document order; an empty list when there are none.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so
    # the separators are written as [-\s] to avoid matching pipe-delimited
    # digit columns such as "555|123|4567".
    pattern = re.compile(r"\(?\d{3}\)?[-\s]\d{3}[-\s]\d{4}")
    return [match.group() for match in pattern.finditer(content)]
def address_extraction(content):
    """Return the first three-line address block found in *content*.

    The block is three consecutive lines that each start with a word
    character, where the third line contains a two-to-four character
    word, a whitespace character and five digits (for example a state
    abbreviation followed by a ZIP code).

    Args:
        content: Text of the document to scan.

    Returns:
        The matched text, from the start of the first line up to and
        including the five digits, or the literal string ``"None"`` (not
        the ``None`` object) when no such block is found or *content* is
        not text.
    """
    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
    try:
        match = re.search(regex_address, content)
    except TypeError:
        # Non-text input (e.g. None or bytes) is treated as "no address
        # found", keeping the best-effort contract without a catch-all.
        return "None"
    return match.group(0) if match else "None"
|