12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364 |
- import re
def extract_filing_date(content):
    """Return the first date-like substring found in ``content``.

    Two textual shapes are recognised:

    * ``<word>[.] <1-2 digits>, <4 digits>`` such as ``Jan. 5, 2020``
      (the leading word is not validated as a month name);
    * ``DD/DD/DDDD`` such as ``01/02/2020``.

    Args:
        content: Text of the document to search.

    Returns:
        The matched text, or the string ``"None"`` (not the ``None``
        object) when nothing matches or ``content`` is not a string.
    """
    pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
    try:
        match = re.search(pattern, content)
    except TypeError:
        # Non-text input (e.g. None or bytes) is treated as "no date found",
        # matching the historical best-effort behaviour.
        return "None"
    if match is None:
        return "None"
    return match.group(0)
def address_extraction(content):
    """Return the first address-like span found in ``content``.

    A span is recognised when it either starts with ``OfficeAddress: `` and
    runs up to a following ``www``, or starts with ``Address``/``address``,
    contains `` of``, and ends with a ``Fax: (ddd) ddd-d...`` number.

    Args:
        content: Text of the document to search.

    Returns:
        The full matched span, or the string ``"None"`` (not the ``None``
        object) when nothing matches or ``content`` is not a string.
    """
    regex = r"(OfficeAddress\:\s([\s\S].*)www|(A|a)ddress.+?\sof.+?Fax\:\s\(\d{3}\)\s\d{3}\-\d+\s)"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input is treated as "no address found".
        return "None"
    if match is None:
        return "None"
    # Group 1 is the outermost capture, i.e. the whole matched span.
    return match.group(1)
def refer_exteraction(content):
    """Extract cited references from ``content``.

    Two citation shapes are recognised:

    1. ``by reference U.S. Patent No. d,ddd,ddd``
    2. ``In re <name> ... ddd F.<digit>d ... (...)`` case citations

    Args:
        content: Text of the document to search.

    Returns:
        A list of the unique matched citation strings in order of first
        appearance (empty when nothing matches), or the string ``"None"``
        when ``content`` is not a string.
    """
    regex = r"((by\sreference\sU\.S\.\sPatent\sNo.\s\d{1,2}\,\d{3}\,\d{3})|(In\sre\s\w+.+?\,?\s\d{2,3}\sF\.\dd\s\d{0,4}\,?\s?\d{0,4}\s?\(?.+?\)))"
    try:
        matches = [found.group(0) for found in re.finditer(regex, content)]
    except TypeError:
        # Non-text input is treated as "nothing to extract".
        return "None"
    # dict.fromkeys de-duplicates while keeping first-seen order, unlike set().
    return list(dict.fromkeys(matches))
def email_extraction(content):
    """Collect every e-mail-like substring of ``content``.

    Matching is case-insensitive. The local part is one word, optionally
    preceded by a single ``word-`` prefix; the domain is one alphabetic
    label followed by a 2-3 letter suffix.

    Args:
        content: Text of the document to search.

    Returns:
        A list of matched strings in order of appearance (possibly empty).
    """
    regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
    return [hit.group() for hit in re.finditer(regex, content, re.IGNORECASE)]
def telephone_number_extraction(content):
    """Collect every telephone number found in ``content``.

    Recognised shape: three digits (optionally wrapped in parentheses),
    then three digits, then four digits, with each group separated by a
    single hyphen or whitespace character, e.g. ``(555) 123-4567`` or
    ``555-123-4567``.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the matched number strings in order of appearance
        (possibly empty).
    """
    # Separators are hyphen or whitespace only; "|" inside a character class
    # is a literal pipe, not alternation, so it is deliberately left out.
    regex = r"\(?\d{3}\)?[-\s]\d{3}[-\s]\d{4}"
    return [number.group() for number in re.finditer(regex, content)]
|