Bladeren bron

updated code for email extraction, phone number, and filing date

Harsh Parikh 2 jaar geleden
bovenliggende commit 3ed2bea3a0
3 gewijzigde bestanden met 20 toevoegingen en 10 verwijderingen
  1. .gitignore (+2 -1)
  2. pdfs/pdf_parser.py (+4 -3)
  3. pdfs/utils.py (+14 -6)

+ 2 - 1
.gitignore

@@ -67,4 +67,5 @@ foo.py
 .~lock.Document Parser Fields.xlsx#
 .DS_Store
 .vscode
-.~lock.required_data.csv#
+.~lock.required_data.csv#
+.~lock.required_data_analytics.csv#

+ 4 - 3
pdfs/pdf_parser.py

@@ -23,9 +23,10 @@ def main(PATH):
     required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
     for idx, file in enumerate(required_files):
         content = parser.from_file(file)["content"].strip().replace("\n", "")
-        data_dict["filing"].append(extract_filing_date(content))
-        data_dict["address"].append(address_extraction(content))
-        data_dict["refer"].append(refer_exteraction(content))
+        data_dict["document_name"].append(file)
+        data_dict["filing_date"].append(extract_filing_date(content))
+        # data_dict["address"].append(address_extraction(content))
+        # data_dict["refer"].append(refer_exteraction(content))
         data_dict["email"].append(email_extraction(content))
         data_dict["telephone_number"].append(telephone_number_extraction(content))
 

+ 14 - 6
pdfs/utils.py

@@ -8,7 +8,7 @@ def extract_filing_date(content):
     # pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2}|(Filed\:.+?)\w.+\d{1,2}\,\s\d{4}([\s\S].*?Paper))"
     pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
     try:
-        return re.search(pattern, content).groups()[0]
+        return re.search(pattern, content).group(0)
     except:
         return "None"
 
@@ -40,6 +40,9 @@ def refer_exteraction(content):
 
 
 def email_extraction(content):
+    """
+    Extracts emails from a document.
+    """
     regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
     result = []
     emails = re.compile(regex, re.IGNORECASE)
@@ -49,8 +52,13 @@ def email_extraction(content):
 
 
 def telephone_number_extraction(content):
-    regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
-    try:
-        return re.search(regex, content).group()
-    except:
-        return "None"
+    """
+    Extracts telephone number[s?] from a document
+    """
+    # regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
+    regex = r"\(?\d{3}\)?[\-|\s]\d{3}[\s|\-]\d{4}"
+    numbers = re.compile(regex, re.IGNORECASE)
+    result = []
+    for number in numbers.finditer(content):
+        result.append(number)
+    return result