parse_expert_pdf_utils.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. import re
  2. def date_extraction(content):
  3. """
  4. extracts filing date from the documents.
  5. """
  6. pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})" # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})"
  7. try:
  8. return re.search(pattern, content).groups()[0]
  9. except:
  10. return "None"
  11. def address_extraction(content):
  12. """
  13. extracts address from the documents.
  14. """
  15. regex = r"Address\:\s([\s\S].*)www"
  16. try:
  17. print(re.search(regex, content).groups()[0])
  18. except:
  19. return "None"
  20. def refer_exteraction(content):
  21. """
  22. extract referals from the documents.
  23. """
  24. regex = r"(\w+)\srefer?s\sto(.*?)as\s"
  25. # 1. by reference
  26. # 2. In re
  27. # 3. in qoutes ""
  28. try:
  29. print("group1", re.search(regex, content).groups()[0])
  30. except:
  31. return "None"
  32. def case_number_extraction(content):
  33. # dict_case_numbers = defaultdict(int)
  34. # case_number_info = re.findall("Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content)
  35. # case_number = ""
  36. # for element in case_number_info:
  37. # dict_case_numbers[element] += 1
  38. # for mykey, value in dict_case_numbers.items():
  39. # case_number = mykey
  40. # return case_number
  41. regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})" # Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})
  42. results = []
  43. case_number = re.compile(regex, re.IGNORECASE)
  44. for current in case_number.finditer(content):
  45. results.append(current.groups()[0])
  46. return list(set(results))
  47. def expert_name_extraction(content):
  48. regex = r"\bEXPERT\sREPORT\sOF\s(.*?),.*(REGARDING|Invalidity)?" # "\bEXPERT\sREPORT\sOF\s(.+?),"
  49. results = []
  50. expert_name = re.compile(regex, re.IGNORECASE)
  51. for current in expert_name.finditer(content):
  52. results.append(current.groups()[0])
  53. return list(set(results))
  54. def plaintiff_extraction(content):
  55. regex = r"\bDIVISION([\s\S]*?)Plaintiff\," # "OF\s\w+(\s.*?)\,.*Plaintiff" # "(.*)\s\nPlaintiff,"
  56. results = []
  57. plaintiff = re.compile(regex, re.IGNORECASE)
  58. for current in plaintiff.finditer(content):
  59. results.append(current.groups()[0].strip())
  60. return results
  61. def defendent_extraction(content):
  62. # "Plaintiff.*\n.*v\.([\s\S]*?)Defendant" # "\bv\.([\s\S]*?)Defendant"
  63. regex = r"Plaintiff.*v\.([\s\S]*?)Defendant"
  64. results = []
  65. defendent = re.compile(regex, re.IGNORECASE)
  66. for current in defendent.finditer(content):
  67. results.append(current.groups()[0].strip())
  68. return results
  69. def patent_extraction(content):
  70. regex = r"(U\.S\.\sPATENT\sNO.\s\d\,\d{3}\,\d{3})" # "U\.S\.\sPATENT\sNO.*\d{1,2}\,\d{3}\,\d{3}"
  71. # results = []
  72. # patent = re.compile(regex, re.IGNORECASE)
  73. # for current in patent.finditer(content):
  74. # results.append(current.groups())
  75. # patent_info = re.search(regex, content).groups()[0].strip()
  76. patent_info = re.findall(regex, content)
  77. return patent_info
  78. def law_firm_extraction(content):
  79. regex = r""
  80. results = []
  81. firm = re.compile(regex, re.IGNORECASE)
  82. for current in firm.finditer(content):
  83. results.append(current.groups()[0].strip())
  84. return results
  85. def on_behalf_of_extraction(content):
  86. regex = "on\sbehalf\sof(.*?)(C|c)ase"
  87. on_behalf_of = re.search(regex, content).groups()[0].strip()
  88. return on_behalf_of
  89. def hourly_compensation(content):
  90. regex = "\$\s\d+"
  91. pay = re.findall(regex, content)
  92. return pay
  93. def ref_patents(content):
  94. return
  95. def acronym_extraction(content):
  96. regex = r"\([A-Z]+\)"
  97. # results = []
  98. # acronym = re.compile(regex, re.IGNORECASE)
  99. # for current in acronym.finditer(content):
  100. # results.append(current)
  101. acronym = re.findall(regex, content)
  102. return list(set(acronym))