parse_expert_pdf_utils.py 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
"""Functions for parsing individual elements out of the expert PDFs."""
  3. import re
  4. def date_extraction(content: str) -> str:
  5. """
  6. extracts filing date from the documents.
  7. """
  8. pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})" # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})"
  9. try:
  10. return re.search(pattern, content).groups()[0]
  11. except:
  12. return "None"
  13. def extract_email(content: str) -> str:
  14. """
  15. extracts email from the documents.
  16. """
  17. pattern = r"[a-z0-9]+@+[a-z].*?\.\w+\.?\w+"
  18. try:
  19. return re.search(pattern, content).group()
  20. except:
  21. return "None"
  22. def address_extraction(content: str) -> str:
  23. """
  24. extracts address from the documents.
  25. """
  26. regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
  27. try:
  28. return re.search(regex_address, content).group(0)
  29. except:
  30. return "None"
  31. def refer_exteraction(content: str) -> str:
  32. """
  33. extract referals from the documents.
  34. """
  35. regex = r"(\w+)\srefer?s\sto(.*?)as\s"
  36. # 1. by reference
  37. # 2. In re
  38. # 3. in qoutes ""
  39. try:
  40. print("group1", re.search(regex, content).groups()[0])
  41. except:
  42. return "None"
  43. def case_number_extraction(content: str) -> str:
  44. """
  45. Extracts the case number from the documents.
  46. """
  47. regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
  48. results = set()
  49. case_number = re.compile(regex, re.IGNORECASE)
  50. for current in case_number.finditer(content):
  51. results.add(current.groups()[0])
  52. return list(results)
  53. def expert_name_extraction(content: str) -> str:
  54. """
  55. Extracts the name of the expert from the document.
  56. """
  57. regex = r"(REPORT|DECLARATION)\sOF(\s(DR.)?\s?\w+\s(.*?\.)?\s?\w+)"
  58. try:
  59. return re.search(regex, content).group(2)
  60. except:
  61. return "None"
  62. def plaintiff_extraction(content: str) -> str:
  63. """
  64. Extracts the plaintiff from the document
  65. """
  66. regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
  67. try:
  68. return re.search(regex, content).group(1)
  69. except:
  70. return "None"
  71. def defendent_extraction(content: str) -> str:
  72. """
  73. Extracts the defendant from the document
  74. """
  75. regex = r"(\w.*?)\n\s?\n?\s?\s?(Defendants|Patent\sOwners?)"
  76. try:
  77. return re.search(regex, content).group(1)
  78. except:
  79. return "None"
  80. def patent_extraction(content: str) -> str:
  81. """
  82. Extracts patent numbers from the document
  83. """
  84. regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
  85. result = set()
  86. patent = re.compile(regex, re.IGNORECASE)
  87. for current in patent.finditer(content):
  88. result.add(current.group().replace(",", ""))
  89. return list(result)
  90. def law_firm_extraction(content: str) -> str:
  91. regex = r""
  92. results = []
  93. firm = re.compile(regex, re.IGNORECASE)
  94. for current in firm.finditer(content):
  95. results.append(current.groups()[0].strip())
  96. return results
  97. def on_behalf_of_extraction(content: str) -> str:
  98. regex = "on\sbehalf\sof(.*?)(C|c)ase"
  99. # try:
  100. return re.search(regex, content)
  101. # except:
  102. # return "None"
  103. def hourly_compensation(content: str) -> str:
  104. """
  105. Returns the hourly compensation of the expert.
  106. """
  107. regex = "\$\s?\d{3,4}"
  108. try:
  109. return re.search(regex, content).group(0)
  110. except:
  111. return "None"
  112. def acronym_extraction(content: str) -> str:
  113. regex = r"\(\“([A-Z]{3,4})\”\)"
  114. results = set()
  115. acronym = re.compile(regex, re.IGNORECASE)
  116. for current in acronym.finditer(content):
  117. results.add(current.group(1))
  118. return list(results)