123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104 |
import re
def extract_patent_number(content):
    """Return the unique patent numbers found in ``content``.

    A patent number is recognised as comma-grouped digits such as
    ``7,654,321`` (one to three digits, one to three digits, then exactly
    three digits). The separators are stripped, so that example is returned
    as ``"7654321"``.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of digit strings without duplicates, ordered by first
        appearance in ``content``. Empty when nothing matches.
    """
    pattern = re.compile(r"\d{1,3}\,\d{1,3}\,\d{3}")
    numbers = (
        match.group().replace(",", "") for match in pattern.finditer(content)
    )
    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through a set gave an arbitrary order that could vary between runs.
    return list(dict.fromkeys(numbers))
def extract_case_number(content):
    """Return the first case number found in ``content``.

    The expected shape is ``<digits>:<digits>-<word>-<5+ digits>-<word>``,
    for example ``2:15-cv-01234-ABC``. Only the part after the colon is
    returned (``15-cv-01234-ABC`` for that example).

    Args:
        content: Text of the document to scan.

    Returns:
        The case number as a string, or the string ``"None"`` (not the
        ``None`` object) when no case number is present or ``content`` is
        not text.
    """
    pattern = r"(\d{1,})\:(\d{1,}\-\w{1,}\-\d{5,}\-\w+)+"
    try:
        match = re.search(pattern, content)
    except TypeError:
        # Non-text input: keep the best-effort sentinel instead of raising,
        # but no longer swallow unrelated errors with a bare ``except``.
        return "None"
    if match is None:
        return "None"
    return match.group(2)
def extract_hourly_compensation(content):
    """Return the unique dollar amounts found in ``content``.

    An amount is a ``$`` followed by digits, optionally continued by
    comma-separated groups of exactly three digits (``$450``, ``$1,200``).
    Thousands separators are removed from the returned strings, so
    ``$1,200`` is reported as ``"$1200"``.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of amount strings (each starting with ``$``) without
        duplicates, ordered by first appearance. Empty when nothing matches.
    """
    # Raw string: "\$" and "\d" are not valid escapes in a normal literal.
    # The optional ",ddd" groups let amounts with thousands separators match
    # in full; previously "$1,200" was cut down to "$1".
    pattern = re.compile(r"\$\d{1,20}(?:,\d{3}(?!\d))*")
    amounts = (
        match.group().replace(",", "") for match in pattern.finditer(content)
    )
    # dict.fromkeys de-duplicates while keeping first-seen order.
    return list(dict.fromkeys(amounts))
def extract_expert_name(content):
    """Return the unique ``REPORT OF ...`` headings found in ``content``.

    Each match starts at the upper-case words ``REPORT OF `` and runs, on the
    same line, up to the last full stop that directly follows a
    non-whitespace character. The whole matched text is returned, including
    the ``REPORT OF `` prefix and the closing full stop, with commas removed.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of matched heading strings without duplicates, ordered by
        first appearance. Empty when nothing matches.
    """
    pattern = re.compile(r"\b(REPORT OF ).*\S[.]")
    headings = (
        match.group().replace(",", "") for match in pattern.finditer(content)
    )
    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through a set gave an arbitrary order that could vary between runs.
    return list(dict.fromkeys(headings))
def extract_plaintiff(content):
    """Return the plaintiff text found in ``content``.

    The pattern looks for ``OF <word>`` followed by the shortest run of text
    ending in a comma, and requires the word ``Plaintiff`` later on the same
    line. The captured run is returned verbatim: it starts with the
    whitespace character that follows ``<word>`` and ends with the comma.

    Args:
        content: Text of the document to scan.

    Returns:
        The captured text, or the string ``"None"`` (not the ``None``
        object) when the pattern does not match or ``content`` is not text.
    """
    # Earlier pattern, kept for reference: (OF\s\w{1,})(.*)Plaintiff[s]?,
    pattern = r"OF\s\w+(\s.*?\,).*?Plaintiff"
    try:
        match = re.search(pattern, content)
    except TypeError:
        # Non-text input: keep the best-effort sentinel instead of raising,
        # but no longer swallow unrelated errors with a bare ``except``.
        return "None"
    if match is None:
        return "None"
    return match.group(1)
def extract_defendent(content):
    """Return the defendant text found in ``content``.

    The pattern expects ``Plaintiff,`` or ``Plaintiffs,``, then a ``v.`` or
    ``V.`` marker, and captures the shortest run of text between that marker
    and the word ``Defendant``, all on one line. The captured run is returned
    verbatim, including any surrounding whitespace and punctuation.

    Args:
        content: Text of the document to scan.

    Returns:
        The captured text, or the string ``"None"`` (not the ``None``
        object) when the pattern does not match or ``content`` is not text.
    """
    # Earlier pattern, kept for reference:
    #   Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\.
    # The marker class is [vV]; the former [v|V] also accepted a literal "|".
    pattern = r"Plaintiff[s]?\,.*?[vV]\.(.*?)Defendant[s]?\.?"
    try:
        match = re.search(pattern, content)
    except TypeError:
        # Non-text input: keep the best-effort sentinel instead of raising,
        # but no longer swallow unrelated errors with a bare ``except``.
        return "None"
    if match is None:
        return "None"
    return match.group(1)
def extract_acronyms(content):
    """Return the unique three-character acronym definitions in ``content``.

    An acronym definition is exactly three word characters wrapped in curly
    double quotes and parentheses. The whole matched text is returned,
    parentheses and quotes included.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of matched strings without duplicates, ordered by first
        appearance. Empty when nothing matches.
    """
    pattern = re.compile(r"\(“(\w{3})”\)")
    definitions = (
        match.group().replace(",", "") for match in pattern.finditer(content)
    )
    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through a set gave an arbitrary order that could vary between runs.
    return list(dict.fromkeys(definitions))
def extract_firm_name(content):
    """Return the unique firm-name entries found in ``content``.

    An entry is the form label ``(Firm Name, Address, and Telephone Number)``
    (matched case-insensitively), the line break(s) after it, and the next
    non-empty line. The whole matched text is returned, label and line
    breaks included, with every comma removed.

    Args:
        content: Text of the document to scan.

    Returns:
        A list of matched strings without duplicates, ordered by first
        appearance. Empty when nothing matches.
    """
    pattern = re.compile(
        r"(\(Firm\sName\,\sAddress\,\sand\sTelephone\sNumber\))([\r\n]+([^\r\n]+))",
        re.IGNORECASE,
    )
    entries = (
        match.group().replace(",", "") for match in pattern.finditer(content)
    )
    # dict.fromkeys de-duplicates while keeping first-seen order; going
    # through a set gave an arbitrary order that could vary between runs.
    return list(dict.fromkeys(entries))
def extract_filing_date(content):
    """Return the first filing date found in ``content``.

    The date is the ``dd/dd/dd`` or ``dd/dd/dddd`` token that follows the
    word ``Filed`` and one whitespace character. The text is returned as
    written; it is not parsed or validated as a calendar date.

    Args:
        content: Text of the document to scan.

    Returns:
        The date string, or the string ``"None"`` (not the ``None`` object)
        when no filing date is present or ``content`` is not text.
    """
    # The optional trailing two digits keep a four-digit year intact;
    # without them "03/15/2019" was cut down to "03/15/20".
    pattern = r"(Filed)\s(\d{2}\/\d{2}\/\d{2}(?:\d{2})?)"
    try:
        match = re.search(pattern, content)
    except TypeError:
        # Non-text input: keep the best-effort sentinel instead of raising,
        # but no longer swallow unrelated errors with a bare ``except``.
        return "None"
    if match is None:
        return "None"
    return match.group(2)
|