# parse_expert_pdf_utils.py
import re
  2. def date_extraction(content: str) -> str:
  3. """
  4. extracts filing date from the documents.
  5. """
  6. pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})" # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})"
  7. try:
  8. return re.search(pattern, content).groups()[0]
  9. except:
  10. return "None"
  11. def extract_email(content: str) -> str:
  12. """
  13. extracts email from the documents.
  14. """
  15. pattern = r"[a-z0-9]+@+[a-z].*?\.\w+\.?\w+"
  16. try:
  17. return re.search(pattern, content).group()
  18. except:
  19. return "None"
  20. def address_extraction(content: str) -> str:
  21. """
  22. extracts address from the documents.
  23. """
  24. regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
  25. try:
  26. return re.search(regex_address, content).group(0)
  27. except:
  28. return "None"
  29. def refer_exteraction(content: str) -> str:
  30. """
  31. extract referals from the documents.
  32. """
  33. regex = r"(\w+)\srefer?s\sto(.*?)as\s"
  34. # 1. by reference
  35. # 2. In re
  36. # 3. in qoutes ""
  37. try:
  38. print("group1", re.search(regex, content).groups()[0])
  39. except:
  40. return "None"
  41. def case_number_extraction(content: str) -> str:
  42. """
  43. Extracts the case number from the documents.
  44. """
  45. regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})" # Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})
  46. results = set()
  47. case_number = re.compile(regex, re.IGNORECASE)
  48. for current in case_number.finditer(content):
  49. results.add(current.groups()[0])
  50. return list(results)
  51. def expert_name_extraction(content: str) -> str:
  52. """
  53. Extracts the name of the expert from the document.
  54. """
  55. regex = r"(REPORT|DECLARATION)\sOF(\s(DR.)?\s?\w+\s(.*?\.)?\s?\w+)"
  56. try:
  57. return re.search(regex, content).group(2)
  58. except:
  59. return "None"
  60. def plaintiff_extraction(content: str) -> str:
  61. """
  62. Extracts the plaintiff from the document
  63. """
  64. regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
  65. try:
  66. return re.search(regex, content).group(1)
  67. except:
  68. return "None"
  69. def defendent_extraction(content: str) -> str:
  70. """
  71. Extracts the defendant from the document
  72. """
  73. regex = r"(\w.*?)\n\s?\n?\s?\s?(Defendants|Patent\sOwners?)"
  74. try:
  75. return re.search(regex, content).group(1)
  76. except:
  77. return "None"
  78. def patent_extraction(content: str) -> str:
  79. """
  80. Extracts patent numbers from the document
  81. """
  82. regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
  83. result = set()
  84. patent = re.compile(regex, re.IGNORECASE)
  85. for current in patent.finditer(content):
  86. result.add(current.group().replace(",", ""))
  87. return list(result)
  88. def law_firm_extraction(content: str) -> str:
  89. regex = r""
  90. results = []
  91. firm = re.compile(regex, re.IGNORECASE)
  92. for current in firm.finditer(content):
  93. results.append(current.groups()[0].strip())
  94. return results
  95. def on_behalf_of_extraction(content: str) -> str:
  96. regex = "on\sbehalf\sof(.*?)(C|c)ase"
  97. # try:
  98. return re.search(regex, content)
  99. # except:
  100. # return "None"
  101. def hourly_compensation(content: str) -> str:
  102. """
  103. Returns the hourly compensation of the expert.
  104. """
  105. regex = "\$\s?\d{3,4}"
  106. try:
  107. return re.search(regex, content).group(0)
  108. except:
  109. return "None"
  110. def ref_patents(content: str) -> str:
  111. return
  112. def acronym_extraction(content: str) -> str:
  113. regex = r"\([A-Z]+\)"
  114. # results = []
  115. # acronym = re.compile(regex, re.IGNORECASE)
  116. # for current in acronym.finditer(content: str) -> str:
  117. # results.append(current)
  118. acronym = re.findall(regex, content)
  119. return list(set(acronym))