utils.py 1.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364
  1. import re
  2. def extract_filing_date(content):
  3. """
  4. extracts filing date from the documents.
  5. """
  6. # pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2}|(Filed\:.+?)\w.+\d{1,2}\,\s\d{4}([\s\S].*?Paper))"
  7. pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
  8. try:
  9. return re.search(pattern, content).group(0)
  10. except:
  11. return "None"
  12. def address_extraction(content):
  13. """
  14. extracts address from the documents.
  15. """
  16. regex = r"(OfficeAddress\:\s([\s\S].*)www|(A|a)ddress.+?\sof.+?Fax\:\s\(\d{3}\)\s\d{3}\-\d+\s)"
  17. try:
  18. re.search(regex, content).groups()[0]
  19. except:
  20. return "None"
  21. def refer_exteraction(content):
  22. """
  23. extract referals from the documents.
  24. """
  25. regex = r"((by\sreference\sU\.S\.\sPatent\sNo.\s\d{1,2}\,\d{3}\,\d{3})|(In\sre\s\w+.+?\,?\s\d{2,3}\sF\.\dd\s\d{0,4}\,?\s?\d{0,4}\s?\(?.+?\)))" # |In\sre.+?\)|In\sre.+?\)"
  26. # 1. by reference
  27. # 2. In re
  28. # 3. in qoutes ""
  29. try:
  30. data1 = re.findall(regex, content)
  31. print(list(set(data1)))
  32. except:
  33. return "None"
  34. def email_extraction(content):
  35. """
  36. Extracts emails from a document.
  37. """
  38. regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
  39. result = []
  40. emails = re.compile(regex, re.IGNORECASE)
  41. for email in emails.finditer(content):
  42. result.append(email.group())
  43. return result
  44. def telephone_number_extraction(content):
  45. """
  46. Extracts telephone number[s?] from a document
  47. """
  48. # regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
  49. regex = r"\(?\d{3}\)?[\-|\s]\d{3}[\s|\-]\d{4}"
  50. numbers = re.compile(regex, re.IGNORECASE)
  51. result = []
  52. for number in numbers.finditer(content):
  53. result.append(number.group())
  54. return result