parse_pdf_utils.py 1.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647
  1. import re
  2. def extract_filing_date(content):
  3. """
  4. extracts filing date from the documents.
  5. """
  6. pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
  7. try:
  8. return re.search(pattern, content).group(0)
  9. except:
  10. return "None"
  11. def email_extraction(content):
  12. """
  13. Extracts emails from a document.
  14. """
  15. regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
  16. result = []
  17. emails = re.compile(regex, re.IGNORECASE)
  18. for email in emails.finditer(content):
  19. result.append(email.group())
  20. return result
  21. def telephone_number_extraction(content):
  22. """
  23. Extracts telephone number[s?] from a document
  24. """
  25. regex = r"\(?\d{3}\)?[\-|\s]\d{3}[\s|\-]\d{4}"
  26. numbers = re.compile(regex, re.IGNORECASE)
  27. result = []
  28. for number in numbers.finditer(content):
  29. result.append(number.group())
  30. return result
  31. def address_extraction(content):
  32. """
  33. extracts address from the documents.
  34. """
  35. regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
  36. try:
  37. return re.search(regex_address, content).group(0)
  38. except:
  39. return "None"