import re

# Sentinel string the single-value extractors return when nothing is found.
# It is the literal text "None" (not the ``None`` object); kept for
# backward compatibility with existing callers.
_NOT_FOUND = "None"


def _search_group(pattern, content, group=0):
    """Return ``group`` of the first match of ``pattern`` in ``content``.

    Returns the ``"None"`` sentinel when there is no match or when
    ``content`` is not text (``re.search`` raises ``TypeError`` for that).
    """
    try:
        match = re.search(pattern, content)
    except TypeError:
        return _NOT_FOUND
    if match is None:
        return _NOT_FOUND
    return match.group(group)


def date_extraction(content):
    """Extract the filing date from a document.

    Four layouts are recognised, tried in this order at each position:
    ``FILING DATE ... m/d/y``, ``Date: Month d, yyyy``,
    ``Entered: ... d, yyyy`` and ``Filed ... m/d/y``.

    Returns:
        The whole matched text (label included), or the string ``"None"``
        when no layout matches.
    """
    pattern = (
        r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}"
        r"|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}"
        r"|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}"
        # The year accepts up to four digits so "Filed 01/02/2020" is not
        # truncated to "Filed 01/02/20".
        r"|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,4})"
    )
    return _search_group(pattern, content, 1)


def address_extraction(content):
    """Extract a three-line address from a document.

    The pattern expects three consecutive lines that each start with a word
    character, the last one ending in a 2-4 character word followed by a
    five-digit number.

    Returns:
        The matched three-line text, or the string ``"None"``.
    """
    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
    return _search_group(regex_address, content)


def refer_exteraction(content):
    """Extract the subject of a "<word> refer(s) to ... as" phrase.

    Returns:
        The word immediately before "refer to" / "refers to", or the string
        ``"None"`` when the phrase does not occur.
    """
    # TODO: other referral forms are not handled yet:
    #   1. "by reference"
    #   2. "In re"
    #   3. text in quotes
    regex = r"(\w+)\srefers?\sto(.*?)as\s"
    return _search_group(regex, content, 1)


def case_number_extraction(content):
    """Extract the case numbers from a document.

    Matches ``Case No. d:dd-ww-ddddd-www`` case-insensitively.

    Returns:
        A sorted list of the distinct case numbers (empty if none).
    """
    regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
    case_number = re.compile(regex, re.IGNORECASE)
    return sorted({found.group(1) for found in case_number.finditer(content)})


def expert_name_extraction(content):
    """Extract the expert's name from a "REPORT OF" / "DECLARATION OF" line.

    Returns:
        The text captured after "OF" (it keeps its leading whitespace), or
        the string ``"None"``.
    """
    regex = r"(REPORT|DECLARATION)\sOF(\s(DR.)?\s?\w+\s(.*?\.)?\s?\w+)"
    return _search_group(regex, content, 2)


def plaintiff_extraction(content):
    """Extract the plaintiff from a document.

    Returns:
        The line preceding a "Plaintiff(s)" or "Petitioner" label, or the
        string ``"None"``.
    """
    regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
    return _search_group(regex, content, 1)


def defendent_extraction(content):
    """Extract the defendant from a document.

    Returns:
        The line preceding a "Defendant(s)" or "Patent Owner(s)" label, or
        the string ``"None"``.
    """
    # "Defendants?" so that a single defendant is recognised too, mirroring
    # "Plaintiffs?" in plaintiff_extraction.
    regex = r"(\w.*?)\n\s?\n?\s?\s?(Defendants?|Patent\sOwners?)"
    return _search_group(regex, content, 1)


def patent_extraction(content):
    """Extract patent numbers from a document.

    Matches comma-grouped numbers such as ``7,123,456``.

    Returns:
        A sorted list of the distinct numbers with the commas removed.
    """
    regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
    patent = re.compile(regex, re.IGNORECASE)
    return sorted(
        {found.group().replace(",", "") for found in patent.finditer(content)}
    )


def law_firm_extraction(content):
    """Extract law firm names from a document.

    No pattern has been defined yet, so this currently returns an empty
    list. Once ``regex`` is filled in it must contain one capturing group;
    the stripped text of that group is collected for every match.
    """
    regex = r""  # TODO: define the law-firm pattern (one capture group).
    if not regex:
        # An empty pattern matches at every position and has no group 1,
        # which would raise instead of yielding results.
        return []
    firm = re.compile(regex, re.IGNORECASE)
    return [found.group(1).strip() for found in firm.finditer(content)]


def on_behalf_of_extraction(content):
    """Extract the party named between "on behalf of" and "Case"/"case".

    Returns:
        The stripped text between the two markers, or the string ``"None"``
        when the phrase does not occur.
    """
    regex = r"on\sbehalf\sof(.*?)(C|c)ase"
    found = _search_group(regex, content, 1)
    if found == _NOT_FOUND:
        return _NOT_FOUND
    return found.strip()


def hourly_compensation(content):
    """Return the hourly compensation of the expert.

    Returns:
        The first dollar amount of three or four digits (for example
        ``"$450"``), or the string ``"None"``.
    """
    regex = r"\$\s?\d{3,4}"
    return _search_group(regex, content)


def ref_patents(content):
    """Placeholder for referenced-patent extraction; always returns None."""
    return


def acronym_extraction(content):
    """Extract parenthesised upper-case acronyms such as ``(PTAB)``.

    Returns:
        A sorted list of the distinct acronyms, parentheses included.
    """
    regex = r"\([A-Z]+\)"
    return sorted(set(re.findall(regex, content)))