import re

# Filing-date layouts recognised (alternatives are tried in this order at
# each position of the text):
#   1. "FILING DATE ... M/D/YY" or "... M/D/YYYY"
#   2. "Date: <word> D, YYYY"
#   3. "Entered: ... D, YYYY"
#   4. "Filed ... M/D/YY" or "... M/D/YYYY"
# The outer group captures the whole match, label included.
_FILING_DATE_RE = re.compile(
    r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}"
    r"|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}"
    r"|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}"
    # The year accepts 2-4 digits so a four-digit year is not cut short.
    r"|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{2,4})"
)

# Captures the text between "OfficeAddress: " and the last "www" on the line.
_ADDRESS_RE = re.compile(r"OfficeAddress\:\s([\s\S].*)www")

# Two reference styles are covered: "by reference ... N,NNN,NNN" and
# "In re ... )".
# TODO: references given in double quotes are not handled yet.
_REFERENCE_RE = re.compile(
    r"by\sreference.+?\d{1,2}\,\d{3}\,\d{3}|In\sre.+?\)"
)

_EMAIL_RE = re.compile(r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}", re.IGNORECASE)

_TELEPHONE_RE = re.compile(
    r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}", re.IGNORECASE
)


def extract_filing_date(content):
    """Return the first filing-date phrase found in *content*.

    The returned text is the whole matched phrase, including its label
    (for example ``"FILING DATE: 01/02/2020"``).

    Returns the string ``"None"`` when no date is found or when *content*
    is not text the pattern can be applied to.
    """
    try:
        match = _FILING_DATE_RE.search(content)
    except TypeError:
        # Non-text input (e.g. None): keep the historical "None" result.
        return "None"
    if match is None:
        return "None"
    return match.group(1)


def address_extraction(content):
    """Print and return the office address found in *content*.

    The address is the text following ``"OfficeAddress: "`` up to the last
    ``"www"`` on the same line.

    Returns the address string on success, or the string ``"None"`` when
    nothing matches or *content* is not text.
    """
    try:
        match = _ADDRESS_RE.search(content)
    except TypeError:
        return "None"
    if match is None:
        return "None"
    address = match.group(1)
    print(address)
    return address


def refer_exteraction(content):
    """Print and return the distinct references found in *content*.

    Returns a list of unique matches in order of first appearance, or the
    string ``"None"`` when *content* is not text.
    """
    try:
        found = _REFERENCE_RE.findall(content)
    except TypeError:
        return "None"
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is the same from run to run (a set has arbitrary order).
    unique_refs = list(dict.fromkeys(found))
    print(unique_refs)
    return unique_refs


def email_extraction(content):
    """Print every e-mail address found in *content* and return them.

    Matching is case-insensitive. Returns the list of matched addresses in
    order of appearance (empty when there are none).
    """
    emails = [match.group() for match in _EMAIL_RE.finditer(content)]
    for email in emails:
        print(email)
    return emails


def telephone_number_extraction(content):
    """Print every ``Telephone: ... (NNN) NNN-NNNN`` phrase and return them.

    Matching is case-insensitive. Returns the list of matched phrases in
    order of appearance (empty when there are none).
    """
    numbers = [match.group() for match in _TELEPHONE_RE.finditer(content)]
    for number in numbers:
        print(number)
    return numbers