utils.py 1.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. import re
  2. def extract_filing_date(content):
  3. """
  4. extracts filing date from the documents.
  5. """
  6. pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})" # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})"
  7. try:
  8. return re.search(pattern, content).groups()[0]
  9. except:
  10. return "None"
  11. def address_extraction(content):
  12. """
  13. extracts address from the documents.
  14. """
  15. regex = r"OfficeAddress\:\s([\s\S].*)www"
  16. try:
  17. print(re.search(regex, content).groups()[0])
  18. except:
  19. return "None"
  20. def refer_exteraction(content):
  21. """
  22. extract referals from the documents.
  23. """
  24. regex = r"by\sreference.+?\d{1,2}\,\d{3}\,\d{3}|In\sre.+?\)"
  25. # 1. by reference
  26. # 2. In re
  27. # 3. in qoutes ""
  28. try:
  29. data1 = re.findall(regex, content)
  30. print(list(set(data1)))
  31. except:
  32. return "None"
  33. def email_extraction(content):
  34. regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
  35. expert_name = re.compile(regex, re.IGNORECASE)
  36. for current in expert_name.finditer(content):
  37. print(current.group())
  38. def telephone_number_extraction(content):
  39. regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
  40. expert_name = re.compile(regex, re.IGNORECASE)
  41. for current in expert_name.finditer(content):
  42. print(current.group())