parse_pdf_utils.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. import re
  2. def extract_patent_number(content):
  3. """
  4. Returns the list of unique patent_numbers in the document
  5. """
  6. regex = r"\d{1,3}\,\d{1,3}\,\d{3}"
  7. results = []
  8. patent = re.compile(regex, re.IGNORECASE)
  9. for current in patent.finditer(content):
  10. results.append(current.group().replace(",", ""))
  11. return list(set(results))
  12. def extract_case_number(content):
  13. """
  14. Returns the lisr of unique case_numbers
  15. """
  16. regex = r"(\d{1,})\:(\d{1,}\-\w{1,}\-\d{5,}\-\w+)+"
  17. try:
  18. return re.search(regex, content).groups()[1]
  19. except:
  20. return "None"
  21. def extract_hourly_compensation(content):
  22. """
  23. Returns hourly compensation.
  24. """
  25. results = []
  26. hourly_comp_re = re.compile("\$\d{1,20}", re.IGNORECASE)
  27. for current in hourly_comp_re.finditer(content):
  28. results.append(current.group().replace(",", ""))
  29. return list(set(results))
  30. def extract_expert_name(content):
  31. """
  32. Returns the name of the expert
  33. """
  34. results = []
  35. exp_name = re.compile(r"\b(REPORT OF ).*\S[.]")
  36. for current in exp_name.finditer(content):
  37. results.append(current.group().replace(",", ""))
  38. return list(set(results))
  39. def extract_plaintiff(content):
  40. """
  41. Returns the name of the plaintiff
  42. previous = (OF\s\w{1,})(.*)Plaintiff[s]?,
  43. """
  44. regex = r"OF\s\w+(\s.*?\,).*?Plaintiff"
  45. try:
  46. return re.search(regex, content).groups()[0]
  47. except:
  48. return "None"
  49. def extract_defendent(content):
  50. """
  51. Returns the name of the defendant
  52. """
  53. regex = r"Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\."
  54. try:
  55. return re.search(regex, content).groups()[0]
  56. except:
  57. return "None"
  58. def extract_acronyms(content):
  59. """
  60. Returns the list of all the acronyms present
  61. """
  62. regex = r"\(“(\w{3})”\)"
  63. results = []
  64. plaintiff = re.compile(regex, re.IGNORECASE)
  65. for current in plaintiff.finditer(content):
  66. results.append(current.group().replace(",", ""))
  67. return list(set(results))
  68. def extract_firm_name(content):
  69. """
  70. Returns the list of firm names present in the documents.
  71. """
  72. regex = r"(\(Firm\sName\,\sAddress\,\sand\sTelephone\sNumber\))([\r\n]+([^\r\n]+))"
  73. results = []
  74. firm = re.compile(regex, re.IGNORECASE)
  75. for current in firm.finditer(content):
  76. results.append(current.group().replace(",", ""))
  77. return list(set(results))
  78. def extract_filing_date(content):
  79. """
  80. Returns the filing date.
  81. """
  82. try:
  83. return re.search(r"(Filed)\s(\d{2}\/\d{2}\/\d{2})", content).groups()[1]
  84. except:
  85. return "None"