|
@@ -16,9 +16,9 @@ def address_extraction(content):
|
|
|
"""
|
|
|
extracts address from the documents.
|
|
|
"""
|
|
|
- regex = r"Address\:\s([\s\S].*)www"
|
|
|
+ regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
|
|
|
try:
|
|
|
- print(re.search(regex, content).groups()[0])
|
|
|
+ return re.search(regex_address, content).group(0)
|
|
|
except:
|
|
|
return "None"
|
|
|
|
|
@@ -38,59 +38,60 @@ def refer_exteraction(content):
|
|
|
|
|
|
|
|
|
def case_number_extraction(content):
    """
    Extracts the case numbers from the document.

    A case number is recognised when the text "Case No. " (matched
    case-insensitively) is followed by a value shaped like
    ``2:15-cv-01234-JRG``: one digit, a colon, two digits, then a
    two-character, a five-digit and a three-character group joined by
    hyphens.

    :param content: text of the document to search.
    :return: list of the distinct case-number strings, in order of first
        appearance; an empty list when nothing matches.
    """
    case_number = re.compile(
        r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", re.IGNORECASE
    )
    # dict.fromkeys removes duplicates like a set would, but keeps the order
    # in which the case numbers first appear, so the result is reproducible.
    return list(
        dict.fromkeys(found.group(1) for found in case_number.finditer(content))
    )
|
|
|
|
|
|
|
|
|
def expert_name_extraction(content):
    """
    Extracts the name of the expert from the document.

    Searches for the first occurrence of "REPORT OF" or "DECLARATION OF"
    (upper case only, the search is case-sensitive) and returns the text
    captured after "OF": an optional "DR." prefix, a first word, an optional
    middle part ending in a period, and a last word.

    :param content: text of the document to search.
    :return: the captured name, including the whitespace character that
        follows "OF" (callers may want to strip it), or the string "None"
        when no name is found or ``content`` is not text.
    """
    regex = r"(REPORT|DECLARATION)\sOF(\s(DR.)?\s?\w+\s(.*?\.)?\s?\w+)"
    try:
        match = re.search(regex, content)
    except TypeError:
        # content is not a string (e.g. None); keep the best-effort contract
        # of answering "None" instead of raising.
        return "None"
    if match is None:
        return "None"
    return match.group(2)
|
|
|
|
|
|
|
|
|
def plaintiff_extraction(content):
    """
    Extracts the plaintiff from the document.

    Returns the line (starting at its first word character) that sits
    directly above the first "Plaintiff", "Plaintiffs" or "Petitioner"
    label; up to one blank line between the two is tolerated. The search is
    case-sensitive.

    :param content: text of the document to search.
    :return: the matched line as a string, or the string "None" when no
        such line is found or ``content`` is not text.
    """
    regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
    try:
        match = re.search(regex, content)
    except TypeError:
        # content is not a string (e.g. None); keep the best-effort contract
        # of answering "None" instead of raising.
        return "None"
    if match is None:
        return "None"
    return match.group(1)
|
|
|
|
|
|
|
|
|
def defendent_extraction(content):
    """
    Extracts the defendant from the document.

    Returns the line (starting at its first word character) that sits above
    the first "Defendants", "Patent Owner" or "Patent Owners" label. The
    pattern requires at least one whitespace character in addition to the
    line break between the two, e.g. a blank line. The search is
    case-sensitive.

    :param content: text of the document to search.
    :return: the matched line as a string, or the string "None" when no
        such line is found or ``content`` is not text.
    """
    # NOTE(review): only the plural "Defendants" is matched, whereas the
    # plaintiff pattern accepts singular and plural - confirm this is intended.
    regex = r"(\w.*?)\n\s?\n?\s\s?(Defendants|Patent\sOwners?)"
    try:
        match = re.search(regex, content)
    except TypeError:
        # content is not a string (e.g. None); keep the best-effort contract
        # of answering "None" instead of raising.
        return "None"
    if match is None:
        return "None"
    return match.group(1)
|
|
|
|
|
|
|
|
|
def patent_extraction(content):
    """
    Extracts patent numbers from the document.

    Collects every comma-grouped number shaped like ``7,654,321`` or
    ``10,123,456`` (one to three digits, one to three digits, three digits,
    optionally followed by a comma) and strips the commas. The pattern does
    not look at the surrounding words, so any other number written in that
    shape is returned as well.

    :param content: text of the document to search.
    :return: list of the distinct numbers as digit-only strings, in order of
        first appearance; an empty list when nothing matches.
    """
    patent = re.compile(r"\d{1,3}\,\d{1,3}\,\d{3}\,?")
    # dict.fromkeys removes duplicates like a set would, but keeps the order
    # in which the numbers first appear, so the result is reproducible.
    return list(
        dict.fromkeys(
            found.group().replace(",", "") for found in patent.finditer(content)
        )
    )
|
|
|
|
|
|
|
|
|
def law_firm_extraction(content):
|
|
@@ -109,7 +110,10 @@ def on_behalf_of_extraction(content):
|
|
|
|
|
|
|
|
|
def hourly_compensation(content):
    """
    Returns the dollar amounts found in the document.

    Every occurrence of a dollar sign, optionally followed by one whitespace
    character, and a run of digits is returned as written in the text. Only
    the leading digits are captured, so "$1,200" yields "$1".

    :param content: text of the document to search.
    :return: list of the matched strings (e.g. "$500" or "$ 75") in document
        order, duplicates included; an empty list when nothing matches.
    """
    # Raw string: "\$", "\s" and "\d" are invalid escape sequences in a
    # normal string literal and trigger a warning on current Python versions.
    regex = r"\$\s?\d+"
    return re.findall(regex, content)
|
|
|
|