瀏覽代碼

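Modified filing date, address, email and telephone extraction to return values instead of printing them; parsed results are now collected into a DataFrame and written to required_data.csv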

Harsh Parikh 2 年之前
父節點
當前提交
9d14888421
共有 2 個文件被更改,包括 23 次插入和 14 次删除
  1. +11 -6
      pdfs/pdf_parser.py
  2. +12 -8
      pdfs/utils.py

+ 11 - 6
pdfs/pdf_parser.py

@@ -1,4 +1,5 @@
 import os
+from collections import defaultdict
 from utils import (
     extract_filing_date,
     address_extraction,
@@ -7,6 +8,7 @@ from utils import (
     telephone_number_extraction,
 )
 
+import pandas as pd
 import tika
 from tika import parser
 
@@ -17,15 +19,18 @@ def main(PATH):
     """
     parses the required data from the pdfs
     """
+    data_dict = defaultdict(list)
     required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
     for idx, file in enumerate(required_files):
-        print(idx, file)
         content = parser.from_file(file)["content"].strip().replace("\n", "")
-        print(extract_filing_date(content))
-        print(address_extraction(content))
-        print(refer_exteraction(content))
-        print(email_extraction(content))
-        print((telephone_number_extraction(content)))
+        data_dict["filing"].append(extract_filing_date(content))
+        data_dict["address"].append(address_extraction(content))
+        data_dict["refer"].append(refer_exteraction(content))
+        data_dict["email"].append(email_extraction(content))
+        data_dict["telephone_number"].append(telephone_number_extraction(content))
+
+    data = pd.DataFrame(data_dict)
+    data.to_csv("required_data.csv")
 
 
 if __name__ == "__main__":

+ 12 - 8
pdfs/utils.py

@@ -5,7 +5,8 @@ def extract_filing_date(content):
     """
     extracts filing date from the documents.
     """
-    pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2}|(Filed\:.+?)\w.+\d{1,2}\,\s\d{4}([\s\S].*?Paper))"
+    # pattern = r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,2}|(Filed\:.+?)\w.+\d{1,2}\,\s\d{4}([\s\S].*?Paper))"
+    pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
     try:
         return re.search(pattern, content).groups()[0]
     except:
@@ -18,7 +19,7 @@ def address_extraction(content):
     """
     regex = r"(OfficeAddress\:\s([\s\S].*)www|(A|a)ddress.+?\sof.+?Fax\:\s\(\d{3}\)\s\d{3}\-\d+\s)"
     try:
-        print(re.search(regex, content).groups()[0])
+        re.search(regex, content).groups()[0]
     except:
         return "None"
 
@@ -40,13 +41,16 @@ def refer_exteraction(content):
 
 def email_extraction(content):
     regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
-    expert_name = re.compile(regex, re.IGNORECASE)
-    for current in expert_name.finditer(content):
-        print(current.group())
+    result = []
+    emails = re.compile(regex, re.IGNORECASE)
+    for email in emails.finditer(content):
+        result.append(email.group())
+    return result
 
 
 def telephone_number_extraction(content):
     regex = r"Telephone\:.+?\(\d{3}\)\s\d{3}\-\d{4}"
-    expert_name = re.compile(regex, re.IGNORECASE)
-    for current in expert_name.finditer(content):
-        print(current.group())
+    try:
+        return re.search(regex, content).group()
+    except:
+        return "None"