parse_pdf_utils.py 2.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. import re
  2. def extract_patent_number(content):
  3. """
  4. Returns the list of unique patent_numbers in the document
  5. """
  6. regex = r"\d{1,3}\,\d{1,3}\,\d{3}"
  7. results = []
  8. patent = re.compile(regex, re.IGNORECASE)
  9. for current in patent.finditer(content):
  10. results.append(current.group().replace(",", ""))
  11. return list(set(results))
  12. def extract_case_number(content):
  13. """
  14. Returns the lisr of unique case_numbers
  15. """
  16. regex = r"(\d{1,})\:(\d{1,}\-\w{1,}\-\d{5,}\-\w+)+"
  17. try:
  18. return re.search(regex, content).groups()[1]
  19. except:
  20. return "None"
  21. def extract_hourly_compensation(content):
  22. """
  23. Returns hourly compensation.
  24. """
  25. results = []
  26. hourly_comp_re = re.compile("\$\d{1,20}", re.IGNORECASE)
  27. for current in hourly_comp_re.finditer(content):
  28. results.append(current.group().replace(",", ""))
  29. return list(set(results))
  30. def extract_plaintiff(content):
  31. """
  32. Returns the name of the plaintiff
  33. previous = (OF\s\w{1,})(.*)Plaintiff[s]?,
  34. """
  35. regex = r"OF\s\w+(\s.*?\,).*?Plaintiff"
  36. try:
  37. return re.search(regex, content).groups()[0]
  38. except:
  39. return "None"
  40. def extract_defendent(content):
  41. """
  42. Returns the name of the defendant
  43. Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\.
  44. """
  45. regex = r"Plaintiff[s]?\,.*?[v|V]\.(.*?)Defendant[s]?\.?"
  46. try:
  47. return re.search(regex, content).groups()[0]
  48. except:
  49. return "None"
  50. def extract_acronyms(content):
  51. """
  52. Returns the list of all the acronyms present
  53. \(["|“](\w{1,10})
  54. """
  55. regex = r'\(["|“](\w{1,10})["|”]\)'
  56. results = []
  57. plaintiff = re.compile(regex, re.IGNORECASE)
  58. for current in plaintiff.finditer(content):
  59. results.append(current.groups()[0].replace(",", ""))
  60. if len(results) == 0:
  61. return "None"
  62. return list(set(results))
  63. def extract_firm_name(content):
  64. """
  65. Returns the list of firm names present in the documents.
  66. """
  67. regex = r"(\(Firm\sName\,\sAddress\,\sand\sTelephone\sNumber\))([\r\n]+([^\r\n]+))"
  68. results = []
  69. firm = re.compile(regex, re.IGNORECASE)
  70. for current in firm.finditer(content):
  71. results.append(current.group().replace(",", ""))
  72. return list(set(results))
  73. def extract_filing_date(content):
  74. """
  75. Returns the filing date.
  76. """
  77. try:
  78. return re.search(r"(Filed)\s(\d{2}\/\d{2}\/\d{2})", content).groups()[1]
  79. except:
  80. return "None"
  81. def extract_attorney_name(content):
  82. """
  83. returns the name of the attorney/attornies.
  84. """
  85. regex = r"\/s\/\s\w+\s\w\.\s\w+"
  86. results = []
  87. attorney = re.compile(regex, re.IGNORECASE)
  88. for current in attorney.finditer(content):
  89. results.append(current.group().replace(",", ""))
  90. return list(set(results))