|
@@ -1,5 +1,8 @@
|
|
|
import os
|
|
|
from collections import defaultdict
|
|
|
+import pandas as pd
|
|
|
+import tika
|
|
|
+from tika import parser
|
|
|
from utils import (
|
|
|
extract_filing_date,
|
|
|
address_extraction,
|
|
@@ -8,10 +11,6 @@ from utils import (
|
|
|
telephone_number_extraction,
|
|
|
)
|
|
|
|
|
|
-import pandas as pd
|
|
|
-import tika
|
|
|
-from tika import parser
|
|
|
-
|
|
|
# Module-level call: runs once when this module is loaded, before main()
# calls tika's parser.from_file on each PDF.
# NOTE(review): what initVM() actually does depends on the installed tika
# package version — confirm it is still required.
tika.initVM()
|
|
|
|
|
|
|
|
@@ -21,7 +20,7 @@ def main(PATH):
|
|
|
"""
|
|
|
data_dict = defaultdict(list)
|
|
|
required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
|
|
|
- for idx, file in enumerate(required_files):
|
|
|
+ for file in required_files:
|
|
|
content = parser.from_file(file)["content"].strip().replace("\n", "")
|
|
|
data_dict["document_name"].append(file)
|
|
|
data_dict["filing_date"].append(extract_filing_date(content))
|