utils.py 2.3 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. import re
  2. def email_extraction(content: str) -> str:
  3. """
  4. Extracts emails from a document.
  5. """
  6. regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
  7. result = []
  8. emails = re.compile(regex, re.IGNORECASE)
  9. for email in emails.finditer(content):
  10. result.append(email.group())
  11. return result
  12. def telephone_number_extraction(content: str) -> str:
  13. """
  14. Extracts telephone number[s?] from a document
  15. """
  16. regex = r"\(?\d{3}\)?[\-|\s]\d{3}[\s|\-]\d{4}"
  17. numbers = re.compile(regex, re.IGNORECASE)
  18. result = []
  19. for number in numbers.finditer(content):
  20. result.append(number.group())
  21. return result
  22. def address_extraction(content: str) -> str:
  23. """
  24. extracts address from the documents.
  25. """
  26. regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
  27. try:
  28. return re.search(regex_address, content).group(0)
  29. except:
  30. return "None"
  31. def case_number_extraction(content: str) -> str:
  32. """
  33. Extracts the case number from the documents.
  34. """
  35. regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
  36. results = set()
  37. case_number = re.compile(regex, re.IGNORECASE)
  38. for current in case_number.finditer(content):
  39. results.add(current.groups()[0])
  40. return list(results)
  41. def plaintiff_extraction(content: str) -> str:
  42. """
  43. Extracts the plaintiff from the document
  44. """
  45. regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
  46. try:
  47. return re.search(regex, content).group(1)
  48. except:
  49. return "None"
  50. def defendent_extraction(content: str) -> str:
  51. """
  52. Extracts the defendant from the document
  53. """
  54. regex = r"(\w.*?)\n\s?\n?\s?\s?(Defendants|Patent\sOwners?)"
  55. try:
  56. return re.search(regex, content).group(1)
  57. except:
  58. return "None"
  59. def patent_extraction(content: str) -> str:
  60. """
  61. Extracts patent numbers from the document
  62. """
  63. regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
  64. result = set()
  65. patent = re.compile(regex, re.IGNORECASE)
  66. for current in patent.finditer(content):
  67. result.add(current.group().replace(",", ""))
  68. return list(result)
  69. def acronym_extraction(content: str) -> str:
  70. regex = r"\(\“([A-Z]{3,4})\”\)"
  71. results = set()
  72. acronym = re.compile(regex, re.IGNORECASE)
  73. for current in acronym.finditer(content):
  74. results.add(current.group(1))
  75. return list(results)