import re
from typing import Iterable, List

# Sentinel returned by the single-value extractors when nothing is found.
# It is the *string* "None" (not the None object); kept for compatibility
# with existing callers.
_NOT_FOUND = "None"

# Patterns are compiled once at import time instead of on every call.
# Local part may contain dots, plus signs and hyphens; the domain may have
# sub-domains and digits; the TLD is two or more letters.
_EMAIL_RE = re.compile(
    r"\w[\w.+-]*@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,}", re.IGNORECASE
)
# Separators are a hyphen or whitespace.  ("|" inside a character class is a
# literal pipe, not alternation, so it is intentionally not listed.)
_PHONE_RE = re.compile(r"\(?\d{3}\)?[-\s]\d{3}[-\s]\d{4}")
_ADDRESS_RE = re.compile(r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}")
_CASE_NUMBER_RE = re.compile(
    r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", re.IGNORECASE
)
_PLAINTIFF_RE = re.compile(r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)")
# Accepts singular and plural party labels, mirroring the plaintiff pattern.
_DEFENDANT_RE = re.compile(
    r"(\w.*?)\n\s?\n?\s?\s?(Defendants?|Patent\sOwners?)"
)
_PATENT_RE = re.compile(r"\d{1,3}\,\d{1,3}\,\d{3}\,?")
_ACRONYM_RE = re.compile(r"\(\“([A-Z]{3,4})\”\)", re.IGNORECASE)
# Day of month may be one or two digits.
_FILING_DATE_RE = re.compile(r"Dated?\:\s(\w+\s\d{1,2}\,\s\d{4})")
_ATTORNEY_RE = re.compile(r"Dated?\:.*?\/(.*?)\/")


def _unique_in_order(items: Iterable[str]) -> List[str]:
    """Return *items* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(items))


def _first_group(pattern: "re.Pattern", content: str, group: int) -> str:
    """Return *group* of the first match of *pattern*, or the string "None".

    Non-string *content* is treated as "no match" rather than raising, so the
    best-effort behaviour of the single-value extractors is preserved.
    """
    try:
        match = pattern.search(content)
    except TypeError:
        return _NOT_FOUND
    if match is None:
        return _NOT_FOUND
    return match.group(group)


def email_extraction(content: str) -> List[str]:
    """
    Extracts emails from a document.

    Returns every email address found, in order of appearance (duplicates
    are kept). Returns an empty list when there are none.
    """
    return [match.group() for match in _EMAIL_RE.finditer(content)]


def telephone_number_extraction(content: str) -> List[str]:
    """
    Extracts telephone numbers from a document.

    Recognises ten-digit numbers written as ``(555) 123-4567``,
    ``555-123-4567`` or ``555 123 4567``. Returns them in order of
    appearance; an empty list when there are none.
    """
    return [match.group() for match in _PHONE_RE.finditer(content)]


def address_extraction(content: str) -> str:
    """
    Extracts an address from the document.

    Returns the first three-line span whose last line ends in a 2-4 character
    word followed by a five-digit number, or the string "None" when no such
    span exists.
    """
    return _first_group(_ADDRESS_RE, content, 0)


def case_number_extraction(content: str) -> List[str]:
    """
    Extracts the case numbers from the document.

    Looks for ``Case No. D:DD-XX-DDDDD-XXX`` (case-insensitive) and returns
    the distinct numbers in first-seen order.
    """
    return _unique_in_order(
        match.group(1) for match in _CASE_NUMBER_RE.finditer(content)
    )


def plaintiff_extraction(content: str) -> str:
    """
    Extracts the plaintiff from the document.

    Returns the text of the line preceding a "Plaintiff(s)" or "Petitioner"
    label, or the string "None" when no such label is found.
    """
    return _first_group(_PLAINTIFF_RE, content, 1)


def defendent_extraction(content: str) -> str:
    """
    Extracts the defendant from the document.

    Returns the text of the line preceding a "Defendant(s)" or
    "Patent Owner(s)" label, or the string "None" when no such label is
    found. (The misspelled function name is kept for compatibility.)
    """
    return _first_group(_DEFENDANT_RE, content, 1)


def patent_extraction(content: str) -> List[str]:
    """
    Extracts patent numbers from the document.

    Matches comma-grouped numbers such as ``7,123,456`` and returns the
    distinct numbers with commas removed, in first-seen order.
    """
    return _unique_in_order(
        match.group().replace(",", "")
        for match in _PATENT_RE.finditer(content)
    )


def acronym_extraction(content: str) -> List[str]:
    """
    Extracts defined acronyms from the document.

    Matches 3-4 letter terms written inside parentheses and curly double
    quotes and returns the distinct terms in first-seen order.
    """
    return _unique_in_order(
        match.group(1) for match in _ACRONYM_RE.finditer(content)
    )


def extract_filing_date(content: str) -> str:
    """
    Extracts the filing date of the document.

    Looks for ``Date:`` / ``Dated:`` followed by ``<word> <day>, <year>`` and
    returns that date text, or the string "None" when it is absent.
    """
    return _first_group(_FILING_DATE_RE, content, 1)


def extract_attorney(content: str) -> str:
    """
    Extracts the name of the attorney.

    Returns the text between the first pair of slashes that follows
    ``Date:`` / ``Dated:`` on the same line, or the string "None" when no
    such text is found.
    """
    return _first_group(_ATTORNEY_RE, content, 1)