1234567891011121314151617181920212223242526272829303132333435363738394041 |
- import os
- from collections import defaultdict
- from utils import (
- extract_filing_date,
- address_extraction,
- refer_exteraction,
- email_extraction,
- telephone_number_extraction,
- )
- import pandas as pd
- import tika
- from tika import parser
- tika.initVM()
def main(PATH):
    """
    Parse the required data from the PDFs in a directory into a CSV file.

    Every entry in ``PATH`` whose name ends with ``.pdf`` (case-insensitive)
    is read with Tika, its text is flattened by stripping it and removing
    newlines, and each extraction helper is run on that text. One row per
    document is collected and written to ``required_data.csv`` in the
    current working directory.

    Args:
        PATH: Directory containing the PDF files to parse.

    Raises:
        OSError: If ``PATH`` cannot be listed (propagated from ``os.listdir``).
    """
    data_dict = defaultdict(list)

    # Match on the extension only: a substring test would also pick up names
    # such as "notes.pdf.bak". Sorting makes the row order reproducible, since
    # os.listdir returns entries in arbitrary order.
    required_files = sorted(
        name for name in os.listdir(PATH) if name.lower().endswith(".pdf")
    )

    for file in required_files:
        # os.listdir yields bare names, so resolve them against PATH;
        # otherwise the file is only found when the cwd happens to be PATH.
        parsed = parser.from_file(os.path.join(PATH, file))

        # NOTE(review): Tika is expected to return None content when no text
        # could be extracted (e.g. image-only PDFs); fall back to "" so the
        # extractors still receive a string.
        content = (parsed.get("content") or "").strip().replace("\n", "")

        data_dict["document_name"].append(file)
        data_dict["filing_date"].append(extract_filing_date(content))
        data_dict["address"].append(address_extraction(content))
        data_dict["refer"].append(refer_exteraction(content))
        data_dict["email"].append(email_extraction(content))
        data_dict["telephone_number"].append(telephone_number_extraction(content))

    data = pd.DataFrame(data_dict)
    data.to_csv("required_data.csv")
if __name__ == "__main__":
    # Script entry point: the PDFs live in a fixed folder, given relative to
    # the current user's home directory.
    BASE_DIR = "Code/pdf_parser/pdfs"
    HOME_DIR = os.path.expanduser("~")

    PATH = os.path.join(HOME_DIR, BASE_DIR)
    main(PATH)
|