import re
from typing import List, Union

# Sentinel string returned by the extractors when nothing can be extracted.
# It is the literal text "None" (not the None object); callers compare
# against this string, so it must stay as-is.
_NOT_FOUND = "None"

# "<word>[.] <1-2 digits>, <4 digits>" (e.g. "Jan. 5, 2020") or "dd/dd/dddd".
_FILING_DATE_RE = re.compile(r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}")

# Three consecutive lines, each starting with a word character, the last one
# ending in a 2-4 character word, one whitespace character and five digits.
_ADDRESS_RE = re.compile(r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}")

# Two citation forms are recognised:
#   1. "by reference U.S. Patent No. d,ddd,ddd"
#   2. "In re <name>, <vol> F.<n>d <page> (...)"
# TODO: the original notes also list references given in quotes as a third
# form; that form is not implemented.
_REFERENCE_RE = re.compile(
    r"((by\sreference\sU\.S\.\sPatent\sNo.\s\d{1,2}\,\d{3}\,\d{3})|(In\sre\s\w+.+?\,?\s\d{2,3}\sF\.\dd\s\d{0,4}\,?\s?\d{0,4}\s?\(?.+?\)))"
)

# Optional "<word>-" prefix, a word, "@", a letters-only domain label and a
# 2-3 letter suffix; matched case-insensitively.
_EMAIL_RE = re.compile(r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}", re.IGNORECASE)

# "ddd-ddd-dddd" with an optionally parenthesised first group; each separator
# is a single hyphen or whitespace character.
_TELEPHONE_RE = re.compile(r"\(?\d{3}\)?[-\s]\d{3}[-\s]\d{4}")


def _first_match(pattern, content) -> str:
    """Return the first match of *pattern* in *content*, or the "None" sentinel.

    Non-text input (e.g. ``None``) also yields the sentinel instead of raising,
    so the callers stay best-effort without swallowing unrelated exceptions.
    """
    try:
        match = pattern.search(content)
    except TypeError:
        return _NOT_FOUND
    return match.group(0) if match else _NOT_FOUND


def extract_filing_date(content) -> str:
    """Return the first date-like string found in *content*.

    Recognised shapes are "<word>[.] <day>, <year>" (e.g. "January 5, 2020")
    and "dd/dd/dddd" (e.g. "01/05/2020").

    Returns:
        The matched text, or the string "None" when no date is found or
        *content* is not text.
    """
    return _first_match(_FILING_DATE_RE, content)


def address_extraction(content) -> str:
    """Return the first three-line, address-like block found in *content*.

    The block is three consecutive lines where the last line ends in a short
    (2-4 character) word followed by a five-digit number.

    Returns:
        The matched text (including its newlines), or the string "None" when
        nothing matches or *content* is not text.
    """
    return _first_match(_ADDRESS_RE, content)


def extract_reference(content) -> Union[List[str], str]:
    """Return the distinct references cited in *content*.

    Matches "by reference U.S. Patent No. ..." and "In re ... F.<n>d ... (...)"
    citations.

    Returns:
        A list of the matched citation strings, de-duplicated and in order of
        first appearance (empty when nothing matches). The string "None" is
        returned when *content* is not text.
    """
    try:
        found = [match.group(0) for match in _REFERENCE_RE.finditer(content)]
    except TypeError:
        return _NOT_FOUND
    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(found))


def email_extraction(content) -> List[str]:
    """Return every email address found in *content*, in order of appearance.

    Duplicates are kept. An empty list is returned when nothing matches.
    """
    return [match.group() for match in _EMAIL_RE.finditer(content)]


def telephone_number_extraction(content) -> List[str]:
    """Return every telephone number found in *content*, in order of appearance.

    Accepted shapes include "(555) 123-4567", "555-123-4567" and
    "555 123 4567". Duplicates are kept. An empty list is returned when
    nothing matches.
    """
    return [match.group() for match in _TELEPHONE_RE.finditer(content)]