|
@@ -0,0 +1,83 @@
|
|
|
|
+import re
|
|
|
|
+
|
|
|
|
def email_extraction(content: str) -> list[str]:
    """
    Extracts emails from a document.

    Args:
        content: Text of the document to scan.

    Returns:
        Every e-mail address found, in order of appearance. Duplicates are
        kept; an empty list is returned when nothing matches.
    """
    # Local part: a word character followed by word characters, dots, plus
    # signs or hyphens (so "john.doe" and "first-last" stay intact).
    # Domain: one or more dot-terminated labels followed by an alphabetic
    # TLD of at least two letters, so "mail.example.org" and ".info" are
    # captured whole instead of being truncated.
    regex = r"\w[\w.+-]*@(?:[a-z0-9-]+\.)+[a-z]{2,}"
    emails = re.compile(regex, re.IGNORECASE)
    return [email.group() for email in emails.finditer(content)]
|
|
|
|
+
|
|
|
|
def telephone_number_extraction(content: str) -> list[str]:
    """
    Extracts telephone numbers from a document.

    Recognises ten-digit numbers written as three groups (3-3-4 digits)
    separated by a hyphen or a whitespace character, with optional
    parentheses around the first group, e.g. "(555) 123-4567" or
    "555-123-4567".

    Args:
        content: Text of the document to scan.

    Returns:
        Every matching number exactly as written, in order of appearance.
        An empty list is returned when nothing matches.
    """
    # Inside a character class "|" is a literal pipe, not alternation, so
    # the separators are spelled as the plain class [-\s].
    regex = r"\(?\d{3}\)?[-\s]\d{3}[-\s]\d{4}"
    numbers = re.compile(regex)
    return [number.group() for number in numbers.finditer(content)]
|
|
|
|
+
|
|
|
|
def address_extraction(content: str) -> str:
    """
    Extracts an address from the document.

    The pattern looks for three consecutive lines that each start with a
    word character, where the third line ends with a 2-4 character token,
    one whitespace character and five digits (presumably a state
    abbreviation followed by a ZIP code).

    Args:
        content: Text of the document to scan.

    Returns:
        The first matching three-line block, or the literal string "None"
        when no address is found or ``content`` is not text.
    """
    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
    try:
        match = re.search(regex_address, content)
    except TypeError:
        # Non-text input (e.g. None): keep the best-effort contract rather
        # than propagating, but no longer swallow unrelated exceptions.
        return "None"
    # The "None" string sentinel is preserved for existing callers.
    return match.group(0) if match else "None"
|
|
|
|
+
|
|
|
|
def case_number_extraction(content: str) -> list[str]:
    """
    Extracts the case numbers from the document.

    A case number is the token following "Case No. " (matched
    case-insensitively) with the shape
    ``D:DD-WW-DDDDD-WWW`` where D is a digit and W a word character,
    e.g. "1:20-cv-12345-ABC".

    Args:
        content: Text of the document to scan.

    Returns:
        The distinct case numbers, without the "Case No." prefix, in the
        order they first appear. Empty list when none are found.
    """
    regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
    case_number = re.compile(regex, re.IGNORECASE)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (a set would yield an arbitrary order).
    return list(
        dict.fromkeys(current.group(1) for current in case_number.finditer(content))
    )
|
|
|
|
+
|
|
|
|
def plaintiff_extraction(content: str) -> str:
    """
    Extracts the plaintiff from the document.

    The plaintiff is taken to be the line of text immediately preceding
    (allowing for one blank or near-blank line) a line that starts with
    "Plaintiff", "Plaintiffs" or "Petitioner".

    Args:
        content: Text of the document to scan.

    Returns:
        The text of the line preceding the first such label, or the
        literal string "None" when no label is found or ``content`` is
        not text.
    """
    regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input (e.g. None): keep the best-effort contract rather
        # than propagating, but no longer swallow unrelated exceptions.
        return "None"
    # The "None" string sentinel is preserved for existing callers.
    return match.group(1) if match else "None"
|
|
|
|
+
|
|
|
|
def defendent_extraction(content: str) -> str:
    """
    Extracts the defendant from the document.

    The defendant is taken to be the line of text immediately preceding
    (allowing for one blank or near-blank line) a line that starts with
    "Defendant", "Defendants", "Patent Owner" or "Patent Owners".

    Args:
        content: Text of the document to scan.

    Returns:
        The text of the line preceding the first such label, or the
        literal string "None" when no label is found or ``content`` is
        not text.
    """
    # "Defendants?" accepts the singular form too, mirroring the
    # "Plaintiffs?" handling used for plaintiffs; previously only the
    # plural label was recognised.
    regex = r"(\w.*?)\n\s?\n?\s?\s?(Defendants?|Patent\sOwners?)"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input (e.g. None): keep the best-effort contract rather
        # than propagating, but no longer swallow unrelated exceptions.
        return "None"
    # The "None" string sentinel is preserved for existing callers.
    return match.group(1) if match else "None"
|
|
|
|
+
|
|
|
|
def patent_extraction(content: str) -> list[str]:
    """
    Extracts patent numbers from the document.

    Matches comma-grouped numbers with three groups (for example
    "7,123,456") and strips the commas from each match.

    Args:
        content: Text of the document to scan.

    Returns:
        The distinct patent numbers as digit-only strings, in the order
        they first appear. Empty list when none are found.
    """
    # The optional trailing comma is consumed so that list separators
    # ("7,123,456, 8,765,432") are removed together with the grouping commas.
    regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
    patent = re.compile(regex)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (a set would yield an arbitrary order).
    return list(
        dict.fromkeys(
            current.group().replace(",", "") for current in patent.finditer(content)
        )
    )
|
|
|
|
+
|
|
|
|
def acronym_extraction(content: str) -> list[str]:
    """
    Extracts acronyms defined in the document.

    An acronym is recognised as three or four letters wrapped in curly
    double quotes inside parentheses, i.e. the (“XYZ”) form.

    Args:
        content: Text of the document to scan.

    Returns:
        The distinct acronyms, without quotes or parentheses, in the order
        they first appear. Empty list when none are found.
    """
    regex = r"\(\“([A-Z]{3,4})\”\)"
    # NOTE(review): IGNORECASE means lower/mixed-case terms such as (“Act”)
    # are captured as well — kept as-is for compatibility; confirm intent.
    acronym = re.compile(regex, re.IGNORECASE)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (a set would yield an arbitrary order).
    return list(
        dict.fromkeys(current.group(1) for current in acronym.finditer(content))
    )
|