utils.py 1.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556
  1. import re
  2. def extract_filing_date(content):
  3. """
  4. extracts filing date from the documents.
  5. """
  6. # pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2}|(Filed\:.+?)\w.+\d{1,2}\,\s\d{4}([\s\S].*?Paper))"
  7. pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
  8. try:
  9. return re.search(pattern, content).groups()[0]
  10. except:
  11. return "None"
  12. def address_extraction(content):
  13. """
  14. extracts address from the documents.
  15. """
  16. regex = r"(OfficeAddress\:\s([\s\S].*)www|(A|a)ddress.+?\sof.+?Fax\:\s\(\d{3}\)\s\d{3}\-\d+\s)"
  17. try:
  18. re.search(regex, content).groups()[0]
  19. except:
  20. return "None"
  21. def refer_exteraction(content):
  22. """
  23. extract referals from the documents.
  24. """
  25. regex = r"((by\sreference\sU\.S\.\sPatent\sNo.\s\d{1,2}\,\d{3}\,\d{3})|(In\sre\s\w+.+?\,?\s\d{2,3}\sF\.\dd\s\d{0,4}\,?\s?\d{0,4}\s?\(?.+?\)))" # |In\sre.+?\)|In\sre.+?\)"
  26. # 1. by reference
  27. # 2. In re
  28. # 3. in qoutes ""
  29. try:
  30. data1 = re.findall(regex, content)
  31. print(list(set(data1)))
  32. except:
  33. return "None"
  34. def email_extraction(content):
  35. regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
  36. result = []
  37. emails = re.compile(regex, re.IGNORECASE)
  38. for email in emails.finditer(content):
  39. result.append(email.group())
  40. return result
  41. def telephone_number_extraction(content):
  42. regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
  43. try:
  44. return re.search(regex, content).group()
  45. except:
  46. return "None"