123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- import re
def date_extraction(content):
    """Extract the first filing-date phrase found in a document.

    Four layouts are recognised (all case-sensitive):
    ``FILING DATE ... m/d/yy(yy)``, ``Date: Month d, yyyy``,
    ``Entered: ... d, yyyy`` and ``Filed ... m/d/y(y)``.

    Args:
        content: Document text to search.

    Returns:
        The whole matched phrase, label included (for example
        ``"FILING DATE: 01/02/2020"``), or the string ``"None"`` when no
        layout matches or ``content`` is not text.
    """
    # NOTE(review): the "Filed" alternative only accepts a 1-2 digit year, so
    # a four-digit year is returned truncated -- confirm whether intended.
    pattern = (
        r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}"
        r"|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}"
        r"|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}"
        r"|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})"
    )
    try:
        match = re.search(pattern, content)
    except TypeError:
        # Non-text input (e.g. None) is reported the same way as "no date"
        # instead of hiding every possible error behind a bare except.
        return "None"
    if match is None:
        return "None"
    # Group 1 is the outer group wrapping all alternatives, i.e. the full hit.
    return match.group(1)
def address_extraction(content):
    """Extract the address that follows an ``Address:`` label.

    The address is the text between ``"Address: "`` and the last ``"www"``
    on the same line (the first character after the label may be a newline).

    Args:
        content: Document text to search.

    Returns:
        The captured address text, not stripped, or the string ``"None"``
        when no address is found or ``content`` is not text.
    """
    regex = r"Address\:\s([\s\S].*)www"
    try:
        match = re.search(regex, content)
    except TypeError:
        # Non-text input is treated like "no address found".
        return "None"
    if match is None:
        return "None"
    # Return the value to the caller; previously it was only printed, so a
    # successful extraction produced None.
    return match.group(1)
def case_number_extraction(content):
    """Collect the distinct case numbers mentioned in a document.

    A case number is the token after ``Case No.`` (matched case-insensitively)
    shaped like ``2:15-cv-01234-JRG``: one digit, a colon, two digits, then
    dash-separated groups of two word characters, five digits and three word
    characters.

    Args:
        content: Document text to search.

    Returns:
        A list of unique case numbers in order of first appearance; empty
        when none are found.
    """
    case_number = re.compile(
        r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", re.IGNORECASE
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable between runs (a plain set gives an arbitrary order).
    return list(dict.fromkeys(case_number.findall(content)))
def expert_name_extraction(content):
    """Collect the distinct expert names from "EXPERT REPORT OF ..." headings.

    The name is the text between ``EXPERT REPORT OF`` (matched
    case-insensitively) and the first following comma on the same line.

    Args:
        content: Document text to search.

    Returns:
        A list of unique names in order of first appearance; empty when no
        heading is found.
    """
    expert_name = re.compile(
        r"\bEXPERT\sREPORT\sOF\s(.*?),.*(REGARDING|Invalidity)?", re.IGNORECASE
    )
    names = (match.group(1) for match in expert_name.finditer(content))
    # Keep first-seen order so the output is stable between runs; a plain
    # set would return the names in arbitrary order.
    return list(dict.fromkeys(names))
def plaintiff_extraction(content):
    """Return the text found between "DIVISION" and "Plaintiff," in a document.

    Matching is case-insensitive and may span several lines; every
    occurrence is reported, duplicates included.

    Args:
        content: Document text to search.

    Returns:
        A list with the whitespace-stripped text of each occurrence, in
        document order; empty when there is none.
    """
    division_to_plaintiff = re.compile(
        r"\bDIVISION([\s\S]*?)Plaintiff\,", re.IGNORECASE
    )
    return [hit.group(1).strip() for hit in division_to_plaintiff.finditer(content)]
def defendent_extraction(content):
    """Return the text found between "v." and "Defendant" in a document.

    A hit requires "Plaintiff" and a later "v." on the same line; the
    captured text after "v." may then span several lines up to the next
    "Defendant". Matching is case-insensitive and every occurrence is
    reported.

    Args:
        content: Document text to search.

    Returns:
        A list with the whitespace-stripped text of each occurrence, in
        document order; empty when there is none.
    """
    versus_to_defendant = re.compile(
        r"Plaintiff.*v\.([\s\S]*?)Defendant", re.IGNORECASE
    )
    return [hit.group(1).strip() for hit in versus_to_defendant.finditer(content)]
def patent_extraction(content):
    """Return every "U.S. PATENT NO. d,ddd,ddd" reference in a document.

    Matching is case-sensitive and only covers seven-digit patent numbers
    written with comma separators.

    Args:
        content: Document text to search.

    Returns:
        A list of the matched reference strings in document order,
        duplicates included; empty when there is none.
    """
    patent_reference = re.compile(r"(U\.S\.\sPATENT\sNO.\s\d\,\d{3}\,\d{3})")
    return [hit.group(1) for hit in patent_reference.finditer(content)]
def law_firm_extraction(content):
    """Extract law-firm names from a document (no pattern defined yet).

    The pattern below is still empty. An empty pattern matches at every
    position and has no capture group, so reading the first group raised
    IndexError for any input. Until a pattern with one capture group is
    supplied, the function reports "nothing found" instead.

    Args:
        content: Document text to search.

    Returns:
        A list with the whitespace-stripped first capture group of each
        match; always empty while the pattern is empty.
    """
    regex = r""  # TODO: supply a law-firm pattern with one capture group
    if not regex:
        return []
    firm = re.compile(regex, re.IGNORECASE)
    return [match.group(1).strip() for match in firm.finditer(content)]
def on_behalf_of_extraction(content):
    """Extract the text between "on behalf of" and the next "Case"/"case".

    Only the first occurrence is considered and both markers must be on the
    same line. "on behalf of" is matched case-sensitively.

    Args:
        content: Document text to search.

    Returns:
        The whitespace-stripped text between the two markers, or the string
        ``"None"`` when the phrase is not found.
    """
    # Raw string: "\s" is a regex escape, not a Python string escape.
    regex = r"on\sbehalf\sof(.*?)(C|c)ase"
    match = re.search(regex, content)
    if match is None:
        # A failed search used to raise AttributeError on None; report it
        # with the same "None" sentinel the other single-value extractors use.
        return "None"
    return match.group(1).strip()
def hourly_compensation(content):
    """Return every dollar amount written as "$ <digits>" in a document.

    An amount is a dollar sign, exactly one whitespace character, then one
    or more digits; amounts with no space after the dollar sign are not
    matched.

    Args:
        content: Document text to search.

    Returns:
        A list of the matched strings (for example ``"$ 500"``) in document
        order, duplicates included; empty when there is none.
    """
    # Raw string: "\$", "\s" and "\d" are regex escapes, and they are invalid
    # escape sequences in a normal Python string literal.
    regex = r"\$\s\d+"
    return re.findall(regex, content)
def ref_patents(content):
    """No-op stub: ignores ``content`` and returns ``None``.

    Args:
        content: Document text (currently unused).

    Returns:
        Always ``None``.
    """
    return None
def acronym_extraction(content):
    """Collect the distinct parenthesised upper-case acronyms in a document.

    An acronym is one or more ASCII capital letters enclosed in parentheses,
    for example ``(PTAB)``.

    Args:
        content: Document text to search.

    Returns:
        A list of unique acronyms, parentheses included, in order of first
        appearance; empty when none are found.
    """
    regex = r"\([A-Z]+\)"
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable between runs (a plain set gives an arbitrary order).
    return list(dict.fromkeys(re.findall(regex, content)))
|