parse_expert_pdf_utils.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192
  1. from collections import defaultdict
  2. import re
  3. from distutils.filelist import findall
  4. def case_number_extraction(content):
  5. # dict_case_numbers = defaultdict(int)
  6. # case_number_info = re.findall("Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content)
  7. # case_number = ""
  8. # for element in case_number_info:
  9. # dict_case_numbers[element] += 1
  10. # for mykey, value in dict_case_numbers.items():
  11. # case_number = mykey
  12. # return case_number
  13. regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})" # Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})
  14. results = []
  15. case_number = re.compile(regex, re.IGNORECASE)
  16. for current in case_number.finditer(content):
  17. results.append(current.groups()[0])
  18. return list(set(results))
  19. def expert_name_extraction(content):
  20. regex = r"\bEXPERT\sREPORT\sOF\s(.*?),.*(REGARDING|Invalidity)?" # "\bEXPERT\sREPORT\sOF\s(.+?),"
  21. results = []
  22. expert_name = re.compile(regex, re.IGNORECASE)
  23. for current in expert_name.finditer(content):
  24. results.append(current.groups()[0])
  25. return list(set(results))
  26. def plaintiff_extraction(content):
  27. regex = r"\bDIVISION([\s\S]*?)Plaintiff\," # "OF\s\w+(\s.*?)\,.*Plaintiff" # "(.*)\s\nPlaintiff,"
  28. results = []
  29. plaintiff = re.compile(regex, re.IGNORECASE)
  30. for current in plaintiff.finditer(content):
  31. results.append(current.groups()[0].strip())
  32. return results
  33. def defendent_extraction(content):
  34. # "Plaintiff.*\n.*v\.([\s\S]*?)Defendant" # "\bv\.([\s\S]*?)Defendant"
  35. regex = r"Plaintiff.*v\.([\s\S]*?)Defendant"
  36. results = []
  37. defendent = re.compile(regex, re.IGNORECASE)
  38. for current in defendent.finditer(content):
  39. results.append(current.groups()[0].strip())
  40. return results
  41. def patent_extraction(content):
  42. regex = r"(U\.S\.\sPATENT\sNO.\s\d\,\d{3}\,\d{3})" # "U\.S\.\sPATENT\sNO.*\d{1,2}\,\d{3}\,\d{3}"
  43. # results = []
  44. # patent = re.compile(regex, re.IGNORECASE)
  45. # for current in patent.finditer(content):
  46. # results.append(current.groups())
  47. # patent_info = re.search(regex, content).groups()[0].strip()
  48. patent_info = re.findall(regex, content)
  49. return patent_info
  50. def law_firm_extraction(content):
  51. regex = r""
  52. results = []
  53. firm = re.compile(regex, re.IGNORECASE)
  54. for current in firm.finditer(content):
  55. results.append(current.groups()[0].strip())
  56. return results
  57. def on_behalf_of_extraction(content):
  58. regex = "on\sbehalf\sof(.*?)(C|c)ase"
  59. on_behalf_of = re.search(regex, content).groups()[0].strip()
  60. return on_behalf_of
  61. def hourly_compensation(content):
  62. return
  63. def ref_patents(content):
  64. return
  65. def acronym_extraction(content):
  66. regex = r"\([A-Z]+\)"
  67. # results = []
  68. # acronym = re.compile(regex, re.IGNORECASE)
  69. # for current in acronym.finditer(content):
  70. # results.append(current)
  71. acronym = re.findall(regex, content)
  72. return list(set(acronym))