Parcourir la source

updated code on 19th July

Omkar Desai il y a 2 ans
Parent
commit
e6ccc00ffd
1 fichier modifié avec 3 ajouts et 3 suppressions
  1. 3 3
      pdfs/utils.py

+ 3 - 3
pdfs/utils.py

@@ -5,7 +5,7 @@ def extract_filing_date(content):
     """
     extracts filing date from the documents.
     """
-    pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2})"  # "((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4})"
+    pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2}|(Filed\:.+?)\w.+\d{1,2}\,\s\d{4}([\s\S].*?Paper))"
     try:
         return re.search(pattern, content).groups()[0]
     except:
@@ -16,7 +16,7 @@ def address_extraction(content):
     """
     extracts address from the documents.
     """
-    regex = r"OfficeAddress\:\s([\s\S].*)www"
+    regex = r"(OfficeAddress\:\s([\s\S].*)www|(A|a)ddress.+?\sof.+?Fax\:\s\(\d{3}\)\s\d{3}\-\d+\s)"
     try:
         print(re.search(regex, content).groups()[0])
     except:
@@ -27,7 +27,7 @@ def refer_exteraction(content):
     """
     extracts referrals from the documents.
     """
-    regex = r"by\sreference.+?\d{1,2}\,\d{3}\,\d{3}|In\sre.+?\)"
+    regex = r"((by\sreference\sU\.S\.\sPatent\sNo.\s\d{1,2}\,\d{3}\,\d{3})|(In\sre\s\w+.+?\,?\s\d{2,3}\sF\.\dd\s\d{0,4}\,?\s?\d{0,4}\s?\(?.+?\)))"  # |In\sre.+?\)|In\sre.+?\)"
     # 1. by reference
     # 2. In re
     # 3. in quotes ""