|
@@ -8,7 +8,7 @@ def extract_filing_date(content):
|
|
|
# pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2}|(Filed\:.+?)\w.+\d{1,2}\,\s\d{4}([\s\S].*?Paper))"
|
|
|
pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
|
|
|
try:
|
|
|
- return re.search(pattern, content).groups()[0]
|
|
|
+ return re.search(pattern, content).group(0)
|
|
|
except:
|
|
|
return "None"
|
|
|
|
|
@@ -40,6 +40,9 @@ def refer_exteraction(content):
|
|
|
|
|
|
|
|
|
def email_extraction(content):
|
|
|
+ """
|
|
|
+ Extracts emails from a document.
|
|
|
+ """
|
|
|
regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
|
|
|
result = []
|
|
|
emails = re.compile(regex, re.IGNORECASE)
|
|
@@ -49,8 +52,13 @@ def email_extraction(content):
|
|
|
|
|
|
|
|
|
def telephone_number_extraction(content):
    """
    Extracts telephone numbers from a document.

    Recognises three groups of 3, 3 and 4 digits, where the first group
    may be wrapped in parentheses and the groups are separated by a
    single hyphen or whitespace character, e.g. ``(555) 123-4567``,
    ``555-123-4567`` or ``555 123 4567``.

    :param content: text of the document to scan.
    :return: list of ``re.Match`` objects, one per number found, in the
        order they appear in ``content``; an empty list when there are
        none. Call ``.group()`` on an item to get the matched text.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so
    # it must not be listed or "555|123|4567" would count as a number.
    regex = r"\(?\d{3}\)?[\-\s]\d{3}[\s\-]\d{4}"
    numbers = re.compile(regex)
    return list(numbers.finditer(content))
|