|
@@ -0,0 +1,47 @@
|
|
|
+import re
|
|
|
+
|
|
|
+
|
|
|
def extract_filing_date(content):
    """Return the first date-like substring found in *content*.

    Two formats are recognised:

    * ``<word>[.] <day>, <year>`` -- e.g. ``"Jan. 5, 2020"`` or
      ``"March 12, 2019"``.
    * ``MM/DD/YYYY`` style numeric dates -- e.g. ``"03/15/2021"``.

    The leading word is not validated as a month name, and the numeric
    fields are not range-checked; the match is purely textual.

    Args:
        content: Text of the document to scan.

    Returns:
        The matched date text, or the string ``"None"`` (not the ``None``
        object) when no date is found or *content* is not a ``str``.
    """
    pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
    # Non-text input keeps the historical best-effort result instead of
    # raising, without resorting to a catch-all ``except``.
    if not isinstance(content, str):
        return "None"
    match = re.search(pattern, content)
    return match.group(0) if match else "None"
|
|
|
+
|
|
|
+
|
|
|
def email_extraction(content):
    """Return every e-mail address found in *content*, in document order.

    The pattern accepts:

    * local parts containing word characters, dots, plus signs and hyphens
      (``john.doe``, ``sales-team``, ``user+tag``);
    * domains with any number of dot-separated labels made of letters,
      digits and hyphens (``mail.example.co.uk``);
    * a final label (TLD) of two or more letters (``.io``, ``.info``).

    Matching is case-insensitive. This is a pragmatic extractor, not a
    full RFC 5322 validator.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of matched address strings; empty when none are found.
        Duplicates are preserved.
    """
    regex = r"\w[\w.+-]*@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,}"
    return [
        match.group()
        for match in re.finditer(regex, content, re.IGNORECASE)
    ]
|
|
|
+
|
|
|
+
|
|
|
def telephone_number_extraction(content):
    """Return every 10-digit telephone number found in *content*.

    Recognised layout: a three-digit area code (optionally wrapped in
    parentheses), then three digits, then four digits, with a single
    hyphen or whitespace character between the groups -- for example
    ``"(555) 123-4567"``, ``"555-123-4567"`` or ``"555 123 4567"``.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of matched number strings in document order; empty when
        none are found.
    """
    # Separators are hyphen or whitespace only: a ``|`` inside a character
    # class is a literal pipe, not alternation, so it must not appear there.
    # The digit look-arounds stop a slice of a longer digit run from being
    # reported as a phone number.
    regex = r"(?<!\d)\(?\d{3}\)?[-\s]\d{3}[-\s]\d{4}(?!\d)"
    return [match.group() for match in re.finditer(regex, content)]
|
|
|
+
|
|
|
+
|
|
|
def address_extraction(content):
    """Return the first three-line, address-shaped span found in *content*.

    The pattern looks for three consecutive lines that each begin with a
    word character, where the third line contains a 2-4 character token
    followed by one whitespace character and five digits (for example a
    state abbreviation and ZIP code such as ``"IL 62704"``). The match
    ends at those five digits; anything after them on the line is dropped.

    Args:
        content: Text of the document to scan.

    Returns:
        The matched text including its embedded newlines, or the string
        ``"None"`` (not the ``None`` object) when nothing matches or
        *content* is not a ``str``.
    """
    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
    # Non-text input keeps the historical best-effort result instead of
    # raising, without resorting to a catch-all ``except``.
    if not isinstance(content, str):
        return "None"
    match = re.search(regex_address, content)
    return match.group(0) if match else "None"
|