|
@@ -6,36 +6,45 @@ from parse_expert_pdf_utils import (
|
|
patent_extraction,
|
|
patent_extraction,
|
|
on_behalf_of_extraction,
|
|
on_behalf_of_extraction,
|
|
acronym_extraction,
|
|
acronym_extraction,
|
|
|
|
+ hourly_compensation,
|
|
|
|
+ date_extraction,
|
|
|
|
+ address_extraction,
|
|
)
|
|
)
|
|
import tika
|
|
import tika
|
|
from tika import parser
|
|
from tika import parser
|
|
|
|
+import re
|
|
|
|
+import warnings
|
|
|
|
+warnings.filterwarnings('ignore')
|
|
|
|
|
|
-tika.initVM()
|
|
|
|
-import os
|
|
|
|
|
|
|
|
|
|
+def main():
|
|
|
|
+ tika.initVM()
|
|
|
|
+ path = "/home/ftech/Code/xc/pdf_parser/pdfs/2018008353_Mail_Decision.pdf" #2018008353_Mail_Decision.pdf"
|
|
|
|
+ parsed_pdf = parser.from_file(path)
|
|
|
|
+ print(parsed_pdf.keys())
|
|
|
|
|
|
-def main(PATH):
|
|
|
|
- required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
|
|
|
|
- for file in required_files:
|
|
|
|
- content = parser.from_file(file)["content"].strip().replace("\n", "")
|
|
|
|
- # case_number = case_number_extraction(content)
|
|
|
|
- # print(case_number)
|
|
|
|
- # expert_name = expert_name_extraction(content)
|
|
|
|
- # print(expert_name)
|
|
|
|
- # plaintiff = plaintiff_extraction(content)
|
|
|
|
- # print(plaintiff)
|
|
|
|
- # defendent = defendent_extraction(content)
|
|
|
|
- # print(defendent)
|
|
|
|
- # patent = patent_extraction(content)
|
|
|
|
- # print(patent)
|
|
|
|
- # on_behalf_of = on_behalf_of_extraction(content)
|
|
|
|
- # print(on_behalf_of)
|
|
|
|
- acronym = acronym_extraction(content)
|
|
|
|
- print(acronym)
|
|
|
|
-
|
|
|
|
|
|
+ content = parsed_pdf['content'].strip().replace('\n', '')
|
|
|
|
+ # print(content)
|
|
|
|
+ # case_number = case_number_extraction(content)
|
|
|
|
+ # print(case_number)
|
|
|
|
+ # expert_name = expert_name_extraction(content)
|
|
|
|
+ # print(expert_name)
|
|
|
|
+ # plaintiff = plaintiff_extraction(content)
|
|
|
|
+ # print(plaintiff)
|
|
|
|
+ # defendent = defendent_extraction(content)
|
|
|
|
+ # print(defendent)
|
|
|
|
+ # patent = patent_extraction(content)
|
|
|
|
+ # print(patent)
|
|
|
|
+ # on_behalf_of = on_behalf_of_extraction(content)
|
|
|
|
+ # print(on_behalf_of)
|
|
|
|
+ # acronym = acronym_extraction(content)
|
|
|
|
+ # print(acronym)
|
|
|
|
+ # pay = hourly_compensation(content)
|
|
|
|
+ # print(pay)
|
|
|
|
+ date = date_extraction(content)
|
|
|
|
+ print(date)
|
|
|
|
+ address = address_extraction(content)
|
|
|
|
+ print(address)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
if __name__ == "__main__":
|
|
- HOME_DIR = os.path.expanduser("~")
|
|
|
|
- BASE_DIR = "Code/pdf_parser/expert_report"
|
|
|
|
- PATH = os.path.join(HOME_DIR, BASE_DIR)
|
|
|
|
- main(PATH)
|
|
|
|
|
|
+ main()
|