Ver código fonte

Updated formatting and import order

Harsh Parikh 2 anos atrás
pai
commit
7a7cbb3088
1 arquivo alterado com 4 adições e 5 exclusões
  1. 4 5
      pdfs/pdf_parser.py

+ 4 - 5
pdfs/pdf_parser.py

@@ -1,5 +1,8 @@
 import os
 from collections import defaultdict
+import pandas as pd
+import tika
+from tika import parser
 from utils import (
     extract_filing_date,
     address_extraction,
@@ -8,10 +11,6 @@ from utils import (
     telephone_number_extraction,
 )
 
-import pandas as pd
-import tika
-from tika import parser
-
 tika.initVM()
 
 
@@ -21,7 +20,7 @@ def main(PATH):
     """
     data_dict = defaultdict(list)
     required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
-    for idx, file in enumerate(required_files):
+    for file in required_files:
         content = parser.from_file(file)["content"].strip().replace("\n", "")
         data_dict["document_name"].append(file)
         data_dict["filing_date"].append(extract_filing_date(content))