|
@@ -0,0 +1,52 @@
|
|
|
|
+import re
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def extract_filing_date(content):
    """
    Extract the first filing-date phrase from a document's text.

    Four label/date layouts are recognised (labels are case-sensitive):

    * ``FILING DATE ... M/D/YY`` or ``M/D/YYYY``
    * ``Date: Month D, YYYY``
    * ``Entered: ... D, YYYY``
    * ``Filed ... M/D/YY`` or ``M/D/YYYY``

    Args:
        content: Text of the document to search.

    Returns:
        The matched text with its label included (for example
        ``"Filed 03/04/2019"``), or the string ``"None"`` when no layout
        matches or ``content`` cannot be searched (for example ``None``).
    """
    # The outer group wraps every alternative, so group 1 is the whole match.
    # The year of the "Filed" layout accepts 2-4 digits like the
    # "FILING DATE" layout; with only 1-2 digits a four-digit year was
    # silently cut down to its first two digits.
    pattern = (
        r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}"
        r"|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}"
        r"|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}"
        r"|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{2,4})"
    )
    try:
        match = re.search(pattern, content)
    except TypeError:
        # Non-text input (None, bytes, ...) is reported like "no match".
        return "None"
    if match is None:
        return "None"
    return match.group(1)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def address_extraction(content):
    """
    Extract the office address from a document's text.

    The address is the text captured between the literal label
    ``OfficeAddress: `` and the last following ``www`` that the pattern can
    reach. After the first captured character the capture does not cross a
    line break.

    Args:
        content: Text of the document to search.

    Returns:
        The captured address text (not stripped), or the string ``"None"``
        when the label/``www`` pair is not found or ``content`` cannot be
        searched (for example ``None``).
    """
    regex = r"OfficeAddress\:\s([\s\S].*)www"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input (None, bytes, ...) is reported like "no match".
        return "None"
    if match is None:
        return "None"
    address = match.group(1)
    # The print is kept so existing console output does not change; the
    # value is now also returned so callers can actually use it.
    print(address)
    return address
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def refer_exteraction(content):
    """
    Extract references (citations) from a document's text.

    Two kinds of reference are recognised:

    1. ``by reference ... N,NNN,NNN`` - text running from "by reference"
       up to a comma-grouped number such as ``5,123,456``.
    2. ``In re ... )`` - text running from "In re" up to the first closing
       parenthesis.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the distinct matched strings in order of first
        appearance (empty when nothing matches), or the string ``"None"``
        when ``content`` cannot be searched (for example ``None``).
    """
    regex = r"by\sreference.+?\d{1,2}\,\d{3}\,\d{3}|In\sre.+?\)"
    # TODO: a third kind, references given in double quotes, was planned
    # but is not implemented by the pattern above.
    try:
        found = re.findall(regex, content)
    except TypeError:
        # Non-text input (None, bytes, ...) keeps the "None" sentinel.
        return "None"
    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # the result is stable between runs (a set's order is not).
    references = list(dict.fromkeys(found))
    # The print is kept so existing console output is still produced; the
    # list is now also returned so callers can use it.
    print(references)
    return references
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def email_extraction(content):
    """
    Extract e-mail addresses from a document's text.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the matched e-mail addresses in order of appearance
        (duplicates kept; empty when none are found). Each address is also
        printed on its own line.

    Raises:
        TypeError: If ``content`` is not a string.
    """
    # Local part: starts with a word character, then word characters, dots,
    # pluses or hyphens. Domain: one or more dot-separated labels followed
    # by an alphabetic top-level domain of at least two letters. The
    # previous pattern cut "john.doe@mail.example.com" down to
    # "doe@mail.exa" and rejected top-level domains longer than 3 letters.
    regex = r"\w[\w.+-]*@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,}"
    email_pattern = re.compile(regex, re.IGNORECASE)
    emails = [match.group() for match in email_pattern.finditer(content)]
    # Printing is kept so existing console output is still produced; the
    # list is now also returned so callers can use it.
    for email in emails:
        print(email)
    return emails
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def telephone_number_extraction(content):
    """
    Extract labelled telephone numbers from a document's text.

    A match starts at the label ``Telephone:`` (any letter case), needs at
    least one character after the colon, and ends with a number written as
    ``(NNN) NNN-NNNN`` on the same line.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the matched strings, label included (for example
        ``"Telephone: (212) 555-1234"``), in order of appearance; empty
        when none are found. Each match is also printed on its own line.

    Raises:
        TypeError: If ``content`` is not a string.
    """
    regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
    phone_pattern = re.compile(regex, re.IGNORECASE)
    numbers = [match.group() for match in phone_pattern.finditer(content)]
    # Printing is kept so existing console output is still produced; the
    # list is now also returned so callers can use it.
    for number in numbers:
        print(number)
    return numbers
|