"""Regex helpers that pull individual fields out of legal-document text."""

import re
from typing import Iterable, List, Optional

# Value returned by the single-value extractors when nothing is found.
# NOTE: this is the *string* "None", not the ``None`` object; it is kept
# unchanged for backward compatibility.
_NOT_FOUND = "None"


def _first_group(pattern: str, content: str, flags: int = 0) -> Optional[str]:
    """Return capture group 1 of the first match, or ``None`` if no match.

    A non-string *content* (for example ``None``) is treated as "no match"
    instead of raising, so callers stay best-effort.
    """
    try:
        match = re.search(pattern, content, flags)
    except TypeError:
        return None
    return match.group(1) if match else None


def _unique(items: Iterable[str]) -> List[str]:
    """Drop duplicates while keeping first-seen order (deterministic output)."""
    return list(dict.fromkeys(items))


def date_extraction(content: str) -> str:
    """Extract the filing date from a document.

    Four layouts are recognised, tried in this order at each position:
    ``FILING DATE ... m/d/yy[yy]``, ``Date: Month d, yyyy``,
    ``Entered: ... d, yyyy`` and ``Filed ... m/d/yy[yy]``.

    Returns:
        The whole matched text *including* its label (for example
        ``"Filed 01/02/2019"``), or the string ``"None"`` when no layout
        matches.
    """
    pattern = (
        r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}"
        r"|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}"
        r"|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}"
        # The year used to be \d{1,2} here, which cut "2019" down to "20".
        r"|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{2,4})"
    )
    found = _first_group(pattern, content)
    return _NOT_FOUND if found is None else found


def address_extraction(content: str) -> str:
    """Extract the address from a document.

    Captures everything between ``"Address: "`` and the last ``"www"`` on
    the line where the capture ends.

    Returns:
        The captured text exactly as matched (not stripped), or the string
        ``"None"`` when there is no match.
    """
    regex = r"Address\:\s([\s\S].*)www"
    found = _first_group(regex, content)
    return _NOT_FOUND if found is None else found


def refer_exteraction(content: str) -> str:
    """Extract the subject of a "<word> refers to ... as" phrase.

    Returns:
        The single word immediately preceding "refers to", or the string
        ``"None"`` when the phrase is not present.
    """
    # TODO: also handle these referral forms:
    #   1. "by reference"
    #   2. "In re"
    #   3. text in quotes ""
    # NOTE(review): ``refer?s`` makes the second "r" optional (it matches
    # "refers" and "refes" but not "refer"); ``refers?`` was probably meant.
    regex = r"(\w+)\srefer?s\sto(.*?)as\s"
    found = _first_group(regex, content)
    return _NOT_FOUND if found is None else found


def case_number_extraction(content: str) -> List[str]:
    """Return the distinct case numbers that follow "Case No." in *content*.

    Matching is case-insensitive and expects the shape
    ``d:dd-ww-ddddd-www`` (for example ``2:15-cv-01234-JRG``). Results are
    de-duplicated and listed in order of first appearance.
    """
    case_number = re.compile(
        r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", re.IGNORECASE
    )
    return _unique(match.group(1) for match in case_number.finditer(content))


def expert_name_extraction(content: str) -> List[str]:
    """Return the distinct names that follow "EXPERT REPORT OF".

    The name is the text up to the first comma after the heading. Matching
    is case-insensitive; results are de-duplicated in first-seen order.
    """
    expert_name = re.compile(
        r"\bEXPERT\sREPORT\sOF\s(.*?),.*(REGARDING|Invalidity)?", re.IGNORECASE
    )
    return _unique(match.group(1) for match in expert_name.finditer(content))


def plaintiff_extraction(content: str) -> List[str]:
    """Return the text found between "DIVISION" and "Plaintiff,".

    Matching is case-insensitive and may span several lines. Each result is
    stripped of surrounding whitespace; duplicates are kept.
    """
    plaintiff = re.compile(r"\bDIVISION([\s\S]*?)Plaintiff\,", re.IGNORECASE)
    return [match.group(1).strip() for match in plaintiff.finditer(content)]


def defendent_extraction(content: str) -> List[str]:
    """Return the text found between "v." and "Defendant".

    "Plaintiff" and "v." must appear on the same line (``.*`` does not cross
    newlines); the captured part may span several lines. Matching is
    case-insensitive. Each result is stripped; duplicates are kept.
    """
    defendent = re.compile(r"Plaintiff.*v\.([\s\S]*?)Defendant", re.IGNORECASE)
    return [match.group(1).strip() for match in defendent.finditer(content)]


def patent_extraction(content: str) -> List[str]:
    """Return every "U.S. PATENT NO. d,ddd,ddd" reference in *content*.

    Matching is case-sensitive and each result is the full matched text.
    """
    # NOTE(review): the "." after NO is unescaped, so it matches any character.
    regex = r"(U\.S\.\sPATENT\sNO.\s\d\,\d{3}\,\d{3})"
    return re.findall(regex, content)


def law_firm_extraction(content: str) -> List[str]:
    """Return law-firm names found in *content*.

    No pattern has been written yet, so this currently always returns an
    empty list.
    """
    regex = r""  # TODO: define the pattern; it needs one capture group.
    if not regex:
        # An empty pattern matches at every position and has no capture
        # group, so running it would raise IndexError on ``groups()[0]``.
        return []
    firm = re.compile(regex, re.IGNORECASE)
    return [match.group(1).strip() for match in firm.finditer(content)]


def on_behalf_of_extraction(content: str) -> str:
    """Extract the text between "on behalf of" and the next "Case"/"case".

    Returns:
        The captured text with surrounding whitespace stripped, or the
        string ``"None"`` when the phrase is not present.
    """
    regex = r"on\sbehalf\sof(.*?)(C|c)ase"
    found = _first_group(regex, content)
    return _NOT_FOUND if found is None else found.strip()


def hourly_compensation(content: str) -> List[str]:
    """Return every dollar amount written as "$", one whitespace, digits.

    For example ``"$ 500"``.
    NOTE(review): amounts with no space after the sign ("$500") are not
    matched by this pattern -- confirm whether that is intended.
    """
    regex = r"\$\s\d+"
    return re.findall(regex, content)


def ref_patents(content: str) -> None:
    """Placeholder for referenced-patent extraction; currently returns None."""
    return None


def acronym_extraction(content: str) -> List[str]:
    """Return the distinct parenthesised upper-case acronyms in *content*.

    Each result keeps its parentheses (for example ``"(PTO)"``). Results are
    de-duplicated and listed in order of first appearance.
    """
    regex = r"\([A-Z]+\)"
    return _unique(re.findall(regex, content))