parse_pdf_utils.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104
  1. import re
  2. def extract_patent_number(content):
  3. """
  4. Returns the list of unique patent_numbers in the document
  5. """
  6. regex = r"\d{1,3}\,\d{1,3}\,\d{3}"
  7. results = []
  8. patent = re.compile(regex, re.IGNORECASE)
  9. for current in patent.finditer(content):
  10. results.append(current.group().replace(",", ""))
  11. return list(set(results))
  12. def extract_case_number(content):
  13. """
  14. Returns the lisr of unique case_numbers
  15. """
  16. regex = r"(\d{1,})\:(\d{1,}\-\w{1,}\-\d{5,}\-\w+)+"
  17. try:
  18. return re.search(regex, content).groups()[1]
  19. except:
  20. return "None"
  21. def extract_hourly_compensation(content):
  22. """
  23. Returns hourly compensation.
  24. """
  25. results = []
  26. hourly_comp_re = re.compile("\$\d{1,20}", re.IGNORECASE)
  27. for current in hourly_comp_re.finditer(content):
  28. results.append(current.group().replace(",", ""))
  29. return list(set(results))
  30. def extract_expert_name(content):
  31. """
  32. Returns the name of the expert
  33. """
  34. results = []
  35. exp_name = re.compile(r"\b(REPORT OF ).*\S[.]")
  36. for current in exp_name.finditer(content):
  37. results.append(current.group().replace(",", ""))
  38. return list(set(results))
  39. def extract_plaintiff(content):
  40. """
  41. Returns the name of the plaintiff
  42. previous = (OF\s\w{1,})(.*)Plaintiff[s]?,
  43. """
  44. regex = r"OF\s\w+(\s.*?\,).*?Plaintiff"
  45. try:
  46. return re.search(regex, content).groups()[0]
  47. except:
  48. return "None"
  49. def extract_defendent(content):
  50. """
  51. Returns the name of the defendant
  52. Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\.
  53. """
  54. regex = r"Plaintiff[s]?\,.*?[v|V]\.(.*?)Defendant[s]?\.?"
  55. try:
  56. return re.search(regex, content).groups()[0]
  57. except:
  58. return "None"
  59. def extract_acronyms(content):
  60. """
  61. Returns the list of all the acronyms present
  62. """
  63. regex = r"\(“(\w{3})”\)"
  64. results = []
  65. plaintiff = re.compile(regex, re.IGNORECASE)
  66. for current in plaintiff.finditer(content):
  67. results.append(current.group().replace(",", ""))
  68. return list(set(results))
  69. def extract_firm_name(content):
  70. """
  71. Returns the list of firm names present in the documents.
  72. """
  73. regex = r"(\(Firm\sName\,\sAddress\,\sand\sTelephone\sNumber\))([\r\n]+([^\r\n]+))"
  74. results = []
  75. firm = re.compile(regex, re.IGNORECASE)
  76. for current in firm.finditer(content):
  77. results.append(current.group().replace(",", ""))
  78. return list(set(results))
  79. def extract_filing_date(content):
  80. """
  81. Returns the filing date.
  82. """
  83. try:
  84. return re.search(r"(Filed)\s(\d{2}\/\d{2}\/\d{2})", content).groups()[1]
  85. except:
  86. return "None"