123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113 |
- import re
def email_extraction(content: str) -> list:
    """
    Extract every e-mail address found in a document.

    Args:
        content: Text to scan.

    Returns:
        A list of the matched addresses (strings) in order of appearance.
        Duplicates are kept. The list is empty when nothing matches.
    """
    # Local part: starts with a word character and may then contain dots,
    # pluses and hyphens. Domain: one or more dot-separated labels followed
    # by an alphabetic TLD of two or more letters, so dotted local parts,
    # sub-domains and long TLDs are captured whole instead of truncated.
    regex = r"\w[\w.+-]*@[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,}"
    emails = re.compile(regex, re.IGNORECASE)
    return [email.group() for email in emails.finditer(content)]
def telephone_number_extraction(content: str) -> list:
    """
    Extract telephone numbers from a document.

    Recognised shape: three digits (optionally wrapped in parentheses),
    a hyphen or whitespace, three digits, a hyphen or whitespace, and
    four digits, e.g. "(555) 123-4567" or "555-123-4567".

    Args:
        content: Text to scan.

    Returns:
        A list of the matched numbers (strings) in order of appearance.
        Duplicates are kept. The list is empty when nothing matches.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so
    # the separators are written as the plain class [-\s].
    regex = r"\(?\d{3}\)?[-\s]\d{3}[-\s]\d{4}"
    numbers = re.compile(regex)
    return [number.group() for number in numbers.finditer(content)]
def address_extraction(content: str) -> str:
    """
    Extract the first three-line address block from a document.

    The block is three consecutive lines that each start with a word
    character, where the third line ends with a 2-4 character token, one
    whitespace character and a five-digit number (e.g. "TX 78701").

    Args:
        content: Text to scan.

    Returns:
        The matched text, or the literal string "None" when no address is
        found or ``content`` is not text.
    """
    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
    try:
        match = re.search(regex_address, content)
    except TypeError:
        # Non-text input (e.g. None) has always produced "None"; keep that
        # best-effort contract without hiding unrelated exceptions.
        return "None"
    return match.group(0) if match else "None"
def case_number_extraction(content: str) -> list:
    """
    Extract the distinct case numbers mentioned in a document.

    A case number is the token following "Case No. " with the shape
    ``D:DD-WW-DDDDD-WWW`` (for example "2:15-cv-01234-JRG"). The
    "Case No." prefix is matched case-insensitively.

    Args:
        content: Text to scan.

    Returns:
        A list of unique case numbers (strings) in order of first
        appearance. The list is empty when nothing matches.
    """
    regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
    case_number = re.compile(regex, re.IGNORECASE)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable between runs (a set's iteration order is not).
    return list(dict.fromkeys(m.group(1) for m in case_number.finditer(content)))
def plaintiff_extraction(content: str) -> str:
    """
    Extract the plaintiff (or petitioner) named in a document.

    The name is taken to be the line immediately above the first
    "Plaintiff", "Plaintiffs" or "Petitioner" label, allowing at most one
    blank line in between.

    Args:
        content: Text to scan.

    Returns:
        The matched line, or the literal string "None" when no label is
        found or ``content`` is not text.
    """
    regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input (e.g. None) has always produced "None"; keep that
        # best-effort contract without hiding unrelated exceptions.
        return "None"
    return match.group(1) if match else "None"
def defendent_extraction(content: str) -> str:
    """
    Extract the defendant (or patent owner) named in a document.

    The name is taken to be the line immediately above the first
    "Defendant", "Defendants", "Patent Owner" or "Patent Owners" label,
    allowing at most one blank line in between.

    Args:
        content: Text to scan.

    Returns:
        The matched line, or the literal string "None" when no label is
        found or ``content`` is not text.
    """
    # "Defendants?" accepts the singular label too, mirroring how the
    # plaintiff pattern accepts both "Plaintiff" and "Plaintiffs".
    regex = r"(\w.*?)\n\s?\n?\s?\s?(Defendants?|Patent\sOwners?)"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input (e.g. None) has always produced "None"; keep that
        # best-effort contract without hiding unrelated exceptions.
        return "None"
    return match.group(1) if match else "None"
def patent_extraction(content: str) -> list:
    """
    Extract the distinct patent numbers mentioned in a document.

    A patent number is recognised as three comma-separated digit groups
    (e.g. "7,123,456"); the commas are stripped from the returned value.

    Args:
        content: Text to scan.

    Returns:
        A list of unique patent numbers (digit-only strings) in order of
        first appearance. The list is empty when nothing matches.
    """
    regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
    patent = re.compile(regex, re.IGNORECASE)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable between runs (a set's iteration order is not).
    return list(
        dict.fromkeys(m.group().replace(",", "") for m in patent.finditer(content))
    )
def acronym_extraction(content: str) -> list:
    """
    Extract the distinct acronyms defined in a document.

    An acronym definition is a run of three or four letters wrapped in
    curly double quotes inside parentheses, e.g. (“IBM”).

    Args:
        content: Text to scan.

    Returns:
        A list of unique acronyms (strings, without quotes or parentheses)
        in order of first appearance. The list is empty when nothing
        matches.
    """
    regex = r"\(\“([A-Z]{3,4})\”\)"
    # NOTE(review): IGNORECASE means lower- and mixed-case terms match too;
    # kept as-is because callers may rely on it.
    acronym = re.compile(regex, re.IGNORECASE)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable between runs (a set's iteration order is not).
    return list(dict.fromkeys(m.group(1) for m in acronym.finditer(content)))
def extract_filing_date(content: str) -> str:
    """
    Extract the filing date of a document.

    Looks for the first "Date: " or "Dated: " label followed by a date
    written as "<Month> <day>, <year>" (e.g. "March 15, 2016").

    Args:
        content: Text to scan.

    Returns:
        The date text as written in the document, or the literal string
        "None" when no date is found or ``content`` is not text.
    """
    # The day may be one or two digits; a single \d would reject every
    # date from the 10th of the month onwards.
    regex = r"Dated?\:\s(\w+\s\d{1,2}\,\s\d{4})"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input (e.g. None) has always produced "None"; keep that
        # best-effort contract without hiding unrelated exceptions.
        return "None"
    return match.group(1) if match else "None"
def extract_attorney(content: str) -> str:
    """
    Extract the name of the signing attorney.

    Returns the text enclosed by the first pair of forward slashes that
    follows a "Date:" or "Dated:" label on the same line, e.g.
    "Dated: March 5, 2016 /Jane Roe/" yields "Jane Roe". Because only the
    first slash pair is used, a "/s/ Jane Roe" style signature yields "s".

    Args:
        content: Text to scan.

    Returns:
        The text between the slashes, or the literal string "None" when
        no such signature is found or ``content`` is not text.
    """
    regex = r"Dated?\:.*?\/(.*?)\/"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input (e.g. None) has always produced "None"; keep that
        # best-effort contract without hiding unrelated exceptions.
        return "None"
    return match.group(1) if match else "None"
|