""" This module are the functions to parse elements from the expert pdfs """ import re def date_extraction(content: str) -> str: """ extracts filing date from the documents. """ pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})" # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})" try: return re.search(pattern, content).groups()[0] except: return "None" def extract_email(content: str) -> str: """ extracts email from the documents. """ pattern = r"[a-z0-9]+@+[a-z].*?\.\w+\.?\w+" try: return re.search(pattern, content).group() except: return "None" def address_extraction(content: str) -> str: """ extracts address from the documents. """ regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}" try: return re.search(regex_address, content).group(0) except: return "None" def refer_exteraction(content: str) -> str: """ extract referals from the documents. """ regex = r"(\w+)\srefer?s\sto(.*?)as\s" # 1. by reference # 2. In re # 3. in qoutes "" try: print("group1", re.search(regex, content).groups()[0]) except: return "None" def case_number_extraction(content: str) -> str: """ Extracts the case number from the documents. """ regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})" results = set() case_number = re.compile(regex, re.IGNORECASE) for current in case_number.finditer(content): results.add(current.groups()[0]) return list(results) def expert_name_extraction(content: str) -> str: """ Extracts the name of the expert from the document. """ regex = r"(REPORT|DECLARATION)\sOF(\s(DR.)?\s?\w+\s(.*?\.)?\s?\w+)" try: return re.search(regex, content).group(2) except: return "None" def plaintiff_extraction(content: str) -> str: """ Extracts the plaintiff from the document """ regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)" try: return re.search(regex, content).group(1) except: return "None" def defendent_extraction(content: str) -> str: """ Extracts the defendant from the document """ regex = r"(\w.*?)\n\s?\n?\s?\s?(Defendants|Patent\sOwners?)" try: return re.search(regex, content).group(1) except: return "None" def patent_extraction(content: str) -> str: """ Extracts patent numbers from the document """ regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?" result = set() patent = re.compile(regex, re.IGNORECASE) for current in patent.finditer(content): result.add(current.group().replace(",", "")) return list(result) def law_firm_extraction(content: str) -> str: regex = r"" results = [] firm = re.compile(regex, re.IGNORECASE) for current in firm.finditer(content): results.append(current.groups()[0].strip()) return results def on_behalf_of_extraction(content: str) -> str: regex = "on\sbehalf\sof(.*?)(C|c)ase" # try: return re.search(regex, content) # except: # return "None" def hourly_compensation(content: str) -> str: """ Returns the hourly compensation of the expert. """ regex = "\$\s?\d{3,4}" try: return re.search(regex, content).group(0) except: return "None" def acronym_extraction(content: str) -> str: regex = r"\(\“([A-Z]{3,4})\”\)" results = set() acronym = re.compile(regex, re.IGNORECASE) for current in acronym.finditer(content): results.add(current.group(1)) return list(results)