parse_expert_pdf_utils.py 3.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. from collections import defaultdict
  2. import re
  3. from distutils.filelist import findall
  4. def date_extraction(content):
  5. regex = r"((\d{2}\/\d{3}\,\d{3}\s)\d{1,2}\/\d{2}\/\d{4}|(Entered\:\s|:Filing\sDate\:\s|Date\:\s)[A-Z]\w+\s\d{1,2}\,\s\d{4})"
  6. date = re.search(regex, content).groups()[0]
  7. return date.strip()
  8. def address_extraction(content):
  9. regex = r"Address\:([\s\S].*)\s\d{6}\-\d{4}\swww"
  10. data = re.search(regex, content).groups()[0]
  11. if data == None:
  12. print("N")
  13. return data.strip()
  14. def case_number_extraction(content):
  15. # dict_case_numbers = defaultdict(int)
  16. # case_number_info = re.findall("Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content)
  17. # case_number = ""
  18. # for element in case_number_info:
  19. # dict_case_numbers[element] += 1
  20. # for mykey, value in dict_case_numbers.items():
  21. # case_number = mykey
  22. # return case_number
  23. regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})" # Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})
  24. results = []
  25. case_number = re.compile(regex, re.IGNORECASE)
  26. for current in case_number.finditer(content):
  27. results.append(current.groups()[0])
  28. return list(set(results))
  29. def expert_name_extraction(content):
  30. regex = r"\bEXPERT\sREPORT\sOF\s(.*?),.*(REGARDING|Invalidity)?" # "\bEXPERT\sREPORT\sOF\s(.+?),"
  31. results = []
  32. expert_name = re.compile(regex, re.IGNORECASE)
  33. for current in expert_name.finditer(content):
  34. results.append(current.groups()[0])
  35. return list(set(results))
  36. def plaintiff_extraction(content):
  37. regex = r"\bDIVISION([\s\S]*?)Plaintiff\," # "OF\s\w+(\s.*?)\,.*Plaintiff" # "(.*)\s\nPlaintiff,"
  38. results = []
  39. plaintiff = re.compile(regex, re.IGNORECASE)
  40. for current in plaintiff.finditer(content):
  41. results.append(current.groups()[0].strip())
  42. return results
  43. def defendent_extraction(content):
  44. # "Plaintiff.*\n.*v\.([\s\S]*?)Defendant" # "\bv\.([\s\S]*?)Defendant"
  45. regex = r"Plaintiff.*v\.([\s\S]*?)Defendant"
  46. results = []
  47. defendent = re.compile(regex, re.IGNORECASE)
  48. for current in defendent.finditer(content):
  49. results.append(current.groups()[0].strip())
  50. return results
  51. def patent_extraction(content):
  52. regex = r"(U\.S\.\sPATENT\sNO.\s\d\,\d{3}\,\d{3})" # "U\.S\.\sPATENT\sNO.*\d{1,2}\,\d{3}\,\d{3}"
  53. # results = []
  54. # patent = re.compile(regex, re.IGNORECASE)
  55. # for current in patent.finditer(content):
  56. # results.append(current.groups())
  57. # patent_info = re.search(regex, content).groups()[0].strip()
  58. patent_info = re.findall(regex, content)
  59. return patent_info
  60. def law_firm_extraction(content):
  61. regex = r""
  62. results = []
  63. firm = re.compile(regex, re.IGNORECASE)
  64. for current in firm.finditer(content):
  65. results.append(current.groups()[0].strip())
  66. return results
  67. def on_behalf_of_extraction(content):
  68. regex = "on\sbehalf\sof(.*?)(C|c)ase"
  69. on_behalf_of = re.search(regex, content).groups()[0].strip()
  70. return on_behalf_of
  71. def hourly_compensation(content):
  72. regex = "\$\s\d+"
  73. pay = re.findall(regex, content)
  74. return pay
  75. def ref_patents(content):
  76. return
  77. def acronym_extraction(content):
  78. regex = r"\([A-Z]+\)"
  79. # results = []
  80. # acronym = re.compile(regex, re.IGNORECASE)
  81. # for current in acronym.finditer(content):
  82. # results.append(current)
  83. acronym = re.findall(regex, content)
  84. return list(set(acronym))