import re

# Value returned by the single-value extractors (and by extract_acronyms)
# when nothing is found. It is the *string* "None", not the None object;
# it is kept for backward compatibility with existing callers.
_NOT_FOUND = "None"

# Comma-grouped number such as "7,123,456".
_PATENT_NUMBER_RE = re.compile(r"\d{1,3}\,\d{1,3}\,\d{3}")

# "<digits>:<digits>-<word>-<5+ digits>-<word>", e.g. "2:13-cv-00123-JRG".
_CASE_NUMBER_RE = re.compile(r"(\d{1,})\:(\d{1,}\-\w{1,}\-\d{5,}\-\w+)+")

# A dollar amount. The first alternative accepts thousands separators
# ("$1,200"); the second is the plain form of up to 20 digits. The negative
# lookahead stops the first alternative from splitting a longer digit run.
_DOLLAR_AMOUNT_RE = re.compile(r"\$\d{1,3}(?:,\d{3})+(?!\d)|\$\d{1,20}")

# Earlier pattern, kept for reference: (OF\s\w{1,})(.*)Plaintiff[s]?,
_PLAINTIFF_RE = re.compile(r"OF\s\w+(\s.*?\,).*?Plaintiff")

# Earlier pattern, kept for reference:
#   Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\.
# The character class is [vV]; the former [v|V] also accepted a literal "|".
_DEFENDANT_RE = re.compile(r"Plaintiff[s]?\,.*?[vV]\.(.*?)Defendant[s]?\.?")

# Earlier pattern, kept for reference: \(["|“](\w{1,10})
# Straight or curly double quotes inside parentheses: ("ABC") or (“ABC”).
# The former classes ["|“] and ["|”] also accepted a literal "|".
_ACRONYM_RE = re.compile(r'\(["“](\w{1,10})["”]\)')

# The form label followed by the first non-empty line after it.
_FIRM_NAME_RE = re.compile(
    r"(\(Firm\sName\,\sAddress\,\sand\sTelephone\sNumber\))"
    r"([\r\n]+([^\r\n]+))",
    re.IGNORECASE,
)

_FILING_DATE_RE = re.compile(r"(Filed)\s(\d{2}\/\d{2}\/\d{2})")

# Signature line of the form "/s/ First M. Last".
_ATTORNEY_RE = re.compile(r"\/s\/\s\w+\s\w\.\s\w+", re.IGNORECASE)


def _unique(items):
    """Return the distinct items as a list, in first-seen order."""
    return list(dict.fromkeys(items))


def _search_group(pattern, content, group):
    """Return the given group of the first match, or the string "None".

    Non-string content is treated as "no match" rather than raising, which
    is what the previous catch-all exception handlers did.
    """
    try:
        match = pattern.search(content)
    except TypeError:
        return _NOT_FOUND
    if match is None:
        return _NOT_FOUND
    return match.group(group)


def extract_patent_number(content):
    """
    Returns the list of unique patent numbers in the document.

    Numbers are matched in their comma-grouped form (e.g. "7,123,456") and
    returned with the commas removed, in order of first appearance.
    """
    return _unique(
        match.group().replace(",", "")
        for match in _PATENT_NUMBER_RE.finditer(content)
    )


def extract_case_number(content):
    """
    Returns the first case number found, or the string "None".

    For text containing "2:13-cv-00123-JRG" the part after the colon,
    "13-cv-00123-JRG", is returned.
    """
    return _search_group(_CASE_NUMBER_RE, content, 2)


def extract_hourly_compensation(content):
    """
    Returns the list of unique dollar amounts found in the document.

    Each amount keeps its leading "$"; thousands separators are removed
    ("$1,200" becomes "$1200"). Order is that of first appearance.
    """
    return _unique(
        match.group().replace(",", "")
        for match in _DOLLAR_AMOUNT_RE.finditer(content)
    )


def extract_plaintiff(content):
    """
    Returns the name of the plaintiff, or the string "None".

    The returned text is the raw capture: it starts with the whitespace
    that follows the word after "OF" and ends with the first comma.
    """
    return _search_group(_PLAINTIFF_RE, content, 1)


def extract_defendent(content):
    """
    Returns the name of the defendant, or the string "None".

    The returned text is everything between "v." (or "V.") and the word
    "Defendant", exactly as captured (surrounding whitespace included).
    """
    return _search_group(_DEFENDANT_RE, content, 1)


def extract_acronyms(content):
    """
    Returns the list of unique acronyms present, or the string "None".

    An acronym is 1-10 word characters wrapped in double quotes (straight
    or curly) inside parentheses. Order is that of first appearance.
    """
    acronyms = _unique(
        match.group(1) for match in _ACRONYM_RE.finditer(content)
    )
    # The string "None" (not an empty list) is the historical empty result.
    return acronyms if acronyms else _NOT_FOUND


def extract_firm_name(content):
    """
    Returns the list of unique firm-name entries present in the document.

    Each entry is the full matched text with commas removed: the
    "(Firm Name, Address, and Telephone Number)" label, the line break(s)
    and the line that follows.
    """
    # NOTE(review): the label is part of every returned entry; it looks like
    # only the following line was intended - confirm with callers before
    # changing this.
    return _unique(
        match.group().replace(",", "")
        for match in _FIRM_NAME_RE.finditer(content)
    )


def extract_filing_date(content):
    """
    Returns the filing date, or the string "None".

    The date is the two-digit d/d/d text that follows the word "Filed",
    e.g. "03/14/15".
    """
    return _search_group(_FILING_DATE_RE, content, 2)


def extract_attorney_name(content):
    """
    Returns the list of unique attorney signature lines.

    Each entry is the whole match, including the "/s/" prefix, for
    signatures of the form "/s/ First M. Last". Order is that of first
    appearance.
    """
    return _unique(
        match.group().replace(",", "")
        for match in _ATTORNEY_RE.finditer(content)
    )