|
@@ -9,34 +9,33 @@ from parse_expert_pdf_utils import (
|
|
|
)
|
|
|
import tika
|
|
|
from tika import parser
|
|
|
-import re
|
|
|
-import warnings
|
|
|
-warnings.filterwarnings('ignore')
|
|
|
|
|
|
+tika.initVM()
|
|
|
+import os
|
|
|
|
|
|
-def main():
|
|
|
- tika.initVM()
|
|
|
- path = "/home/omkardesai/Code/pdf_parser/pdfs/expert_report1.pdf"
|
|
|
- parsed_pdf = parser.from_file(path)
|
|
|
- print(parsed_pdf.keys())
|
|
|
|
|
|
- content = parsed_pdf['content'].strip().replace('\n', '')
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- acronym = acronym_extraction(content)
|
|
|
- print(acronym)
|
|
|
+def main(PATH):
+    """Run acronym extraction on every PDF found directly inside PATH.
+
+    Args:
+        PATH: Directory to scan for PDF files. Sub-directories are not
+            searched.
+
+    The value returned by ``acronym_extraction`` is printed once per PDF.
+    """
+    # Test the file extension rather than searching for ".pdf" anywhere in
+    # the name, so entries such as "notes.pdf.bak" are not handed to Tika.
+    required_files = [file for file in os.listdir(PATH) if file.endswith(".pdf")]
+    for file in required_files:
+        # os.listdir() returns bare names; join them with PATH so parsing
+        # works no matter what the current working directory is.
+        pdf_path = os.path.join(PATH, file)
+        # Newlines are dropped so the text is handed over as a single line.
+        content = parser.from_file(pdf_path)["content"].strip().replace("\n", "")
+        acronym = acronym_extraction(content)
+        print(acronym)
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
- main()
|
|
|
+ HOME_DIR = os.path.expanduser("~")
|
|
|
+ BASE_DIR = "Code/pdf_parser/expert_report"
|
|
|
+ PATH = os.path.join(HOME_DIR, BASE_DIR)
|
|
|
+ main(PATH)
|