# parse_expert_pdf_utils.py (3.6 KB)
# Regular-expression helpers that pull individual fields out of document text.
  1. import re
  def date_extraction(content):
      """Return the first filing-date phrase found in ``content``.

      Four layouts are recognised:

      * ``FILING DATE ... 1/2/2019``     (numeric date, 2-4 digit year)
      * ``Date: January 2, 2019``
      * ``Entered: ... January 2, 2019``
      * ``Filed ... 1/2/2019``           (numeric date, 1-4 digit year)

      The returned text is the whole matched phrase, i.e. the label together
      with the date, not the bare date. ``.`` does not cross line breaks, so
      label and date must sit on the same line.

      Args:
          content: Document text to search.

      Returns:
          The matched phrase, or the string ``"None"`` (not the ``None``
          object) when nothing matches or ``content`` is not text.
      """
      pattern = (
          r"((FILING\sDATE.*?)\d{1,2}/\d{1,2}/\d{2,4}"
          r"|(Date:\s)\w{1,9}\s\d{1,2},\s\d{4}"
          r"|(Entered:\s.*?)\w.*?\d{1,2},\s\d{4}"
          # The year was previously capped at two digits here, which cut
          # "Filed 1/2/2019" down to "Filed 1/2/20".
          r"|(Filed.*?)\d{1,2}/\d{1,2}/\d{1,4})"
      )
      try:
          match = re.search(pattern, content)
      except TypeError:
          # Non-text input (e.g. None) is reported the same way as "no date".
          return "None"
      if match is None:
          return "None"
      # Group 1 wraps the whole alternation, so it is always populated.
      return match.group(1)
  def address_extraction(content):
      """Return the address text found between ``Address:`` and ``www``.

      The capture starts after ``Address:`` and one whitespace character. Its
      first character may be anything (including a line break); the rest must
      stay on one line and runs up to the last ``www`` on that line. The
      captured text is returned as-is, without stripping.

      Args:
          content: Document text to search.

      Returns:
          The captured address text, or the string ``"None"`` (not the
          ``None`` object) when nothing matches or ``content`` is not text.
      """
      regex = r"Address\:\s([\s\S].*)www"
      try:
          match = re.search(regex, content)
      except TypeError:
          # Non-text input (e.g. None) is reported the same way as "no match".
          return "None"
      if match is None:
          return "None"
      # Previously the match was printed and the function fell through,
      # so callers received None even when an address was found.
      return match.group(1)
  def case_number_extraction(content):
      """Return the distinct case numbers cited in ``content``.

      A case number is the token following ``Case No. `` (matched
      case-insensitively) shaped like ``2:17-cv-00123-JRG``: one digit, a
      colon, two digits, then dash-separated groups of two word characters,
      five digits and three word characters.

      Args:
          content: Document text to search.

      Returns:
          A list of unique case-number strings in order of first appearance;
          empty when none are found.
      """
      case_number = re.compile(
          r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", re.IGNORECASE
      )
      # dict.fromkeys removes duplicates while keeping first-seen order; the
      # earlier list(set(...)) could return a different order on every run.
      return list(dict.fromkeys(case_number.findall(content)))
  def expert_name_extraction(content):
      """Return the distinct expert names announced in report titles.

      A name is the text between ``EXPERT REPORT OF `` (matched
      case-insensitively) and the next comma on the same line. The pattern
      then consumes the remainder of that line, so at most one name is taken
      per line. Names are returned exactly as written (no stripping) and
      de-duplication is case-sensitive.

      Args:
          content: Document text to search.

      Returns:
          A list of unique names in order of first appearance; empty when
          none are found.
      """
      regex = r"\bEXPERT\sREPORT\sOF\s(.*?),.*(REGARDING|Invalidity)?"
      expert_name = re.compile(regex, re.IGNORECASE)
      # finditer + group(1) is used because the pattern has two groups and
      # only the first one (the name) is wanted.
      names = (found.group(1) for found in expert_name.finditer(content))
      # dict.fromkeys removes duplicates while keeping first-seen order; the
      # earlier list(set(...)) could return a different order on every run.
      return list(dict.fromkeys(names))
  def plaintiff_extraction(content):
      """Collect the text sitting between ``DIVISION`` and ``Plaintiff,``.

      Matching is case-insensitive and may span several lines. Each captured
      stretch is trimmed of surrounding whitespace; duplicates are kept and
      the results follow document order.

      Args:
          content: Document text to search.

      Returns:
          A list of trimmed strings, one per match; empty when nothing matches.
      """
      matcher = re.compile(r"\bDIVISION([\s\S]*?)Plaintiff\,", re.IGNORECASE)
      return [hit.group(1).strip() for hit in matcher.finditer(content)]
  def defendent_extraction(content):
      """Collect the text sitting between ``v.`` and ``Defendant``.

      The ``v.`` must follow ``Plaintiff`` on the same line; the captured
      stretch after it may span several lines. Matching is case-insensitive.
      Each capture is trimmed of surrounding whitespace; duplicates are kept
      and the results follow document order.

      Args:
          content: Document text to search.

      Returns:
          A list of trimmed strings, one per match; empty when nothing matches.
      """
      captured = re.findall(
          r"Plaintiff.*v\.([\s\S]*?)Defendant", content, flags=re.IGNORECASE
      )
      return [party.strip() for party in captured]
  def patent_extraction(content):
      """Return every ``U.S. PATENT NO. x,xxx,xxx`` mention in ``content``.

      Matching is case-sensitive. The number must be written with comma
      grouping and may have one or two leading digits, so both ``7,123,456``
      and ``10,987,654`` are accepted (the leading group used to be a single
      digit, which silently skipped every patent numbered 10,000,000 or above).

      Args:
          content: Document text to search.

      Returns:
          A list of the full matched phrases in document order, duplicates
          included; empty when nothing matches.
      """
      # NOTE: the "." after NO is an unescaped wildcard, so it matches any
      # single character there; left as-is to keep existing matches.
      regex = r"(U\.S\.\sPATENT\sNO.\s\d{1,2}\,\d{3}\,\d{3})"
      return re.findall(regex, content)
  def law_firm_extraction(content):
      """Return law-firm names found in ``content`` (pattern not written yet).

      No pattern has been defined for this field. With the empty pattern the
      old code matched at every position and then failed with ``IndexError``
      because there is no capture group to read, so every call crashed. Until
      a pattern is supplied the function reports that nothing was found.

      Args:
          content: Document text to search.

      Returns:
          A list of trimmed first-group captures, one per match; always empty
          while ``regex`` is empty.
      """
      regex = r""  # TODO: fill in a pattern with one capture group
      if not regex:
          return []
      firm = re.compile(regex, re.IGNORECASE)
      return [found.group(1).strip() for found in firm.finditer(content)]
  def on_behalf_of_extraction(content):
      """Return the text between ``on behalf of`` and the next ``Case``/``case``.

      Only the first occurrence is used. The phrase and the following
      ``Case``/``case`` must be on the same line, and the captured text is
      trimmed of surrounding whitespace.

      Args:
          content: Document text to search.

      Returns:
          The trimmed captured text, or the string ``"None"`` (not the
          ``None`` object) when the phrase is absent. The "None" string is the
          same not-found marker the other single-value extractors in this
          file use; previously a missing phrase raised ``AttributeError``.
      """
      # Raw string: "\s" inside a normal string literal is an invalid escape
      # sequence that newer Python versions warn about.
      regex = r"on\sbehalf\sof(.*?)(C|c)ase"
      match = re.search(regex, content)
      if match is None:
          return "None"
      return match.group(1).strip()
  def hourly_compensation(content):
      """Return every dollar amount written as ``$``, one whitespace, digits.

      Only the leading run of digits is captured, so thousands separators and
      decimals are cut off.

      NOTE(review): exactly one whitespace character is required after the
      dollar sign, so ``$500`` is not matched - confirm that the input text
      always carries that space.

      Args:
          content: Document text to search.

      Returns:
          A list of matched strings such as ``"$ 450"`` in document order,
          duplicates included; empty when nothing matches.
      """
      # Raw string: "\$", "\s" and "\d" inside a normal string literal are
      # invalid escape sequences that newer Python versions warn about.
      regex = r"\$\s\d+"
      return re.findall(regex, content)
  def ref_patents(content):
      """Placeholder with no implementation yet.

      ``content`` is accepted but never examined; the result is always ``None``.
      """
      return None
  def acronym_extraction(content):
      """Return the distinct parenthesised upper-case tokens in ``content``.

      A token is one or more ASCII capital letters wrapped in parentheses,
      e.g. ``(TCP)``. The parentheses are part of each returned string.

      Args:
          content: Document text to search.

      Returns:
          A list of unique tokens in order of first appearance; empty when
          none are found.
      """
      regex = r"\([A-Z]+\)"
      # dict.fromkeys removes duplicates while keeping first-seen order; the
      # earlier list(set(...)) could return a different order on every run.
      return list(dict.fromkeys(re.findall(regex, content)))