# harsh/pdf_parser
							import requests
import json
import os
import shutil


# Base URL of the document service queried by get_pdfs2.
_API_ROOT = "http://50.211.199.148:8000"
# Listing pages are requested up to and including this page number.
_LAST_PAGE = 29
# Seconds to wait for the server before a request is abandoned.
_REQUEST_TIMEOUT = 60
# Document ids that are never downloaded for the generic categories.
# NOTE(review): the reason this id is excluded is not recorded anywhere
# in this file -- confirm before extending or removing it.
_SKIPPED_PDF_IDS = frozenset({"253989"})


def _pdf_file_name(pdf_id, document_type, caption):
    """Build the on-disk file name ``<id>-<type>-<caption>.pdf`` for one hit."""
    # Keep only the text before the first ".pdf" and drop one trailing dot,
    # so captions such as "report..pdf" do not end up as "report..pdf.pdf".
    stem = (caption or "").split(".pdf")[0]
    if stem.endswith("."):
        stem = stem[:-1]
    # "/" would be interpreted as a directory separator by open().
    return f"{pdf_id}-{document_type}-{stem}.pdf".replace("/", " ")


def _download_category(firm_name, category_name):
    """Recreate directory *category_name* and fill it with that category's PDFs."""
    is_other = category_name == "Other"
    is_objection = category_name == "Objection"
    is_generic = not (is_other or is_objection)
    # "Objection" stops at 95 files, every other category at 100.
    limit = 95 if is_objection else 100
    # Generic categories start at page 2; "Other" and "Objection" at page 1.
    first_page = 2 if is_generic else 1

    # Start from an empty directory so the file count reflects this run only.
    if os.path.exists(category_name):
        shutil.rmtree(category_name)
    os.makedirs(category_name)
    print(category_name)

    for pg_no in range(first_page, _LAST_PAGE + 1):
        # Checked before the request so no listing page is fetched needlessly.
        if len(os.listdir(category_name)) >= limit:
            break
        listing = requests.get(
            f"{_API_ROOT}/api/documents/",
            # Passing params lets requests URL-encode names that contain
            # spaces or "&" (e.g. "Power Of Attorney").
            params={
                "collection": "Documents",
                "q": firm_name,
                "page": pg_no,
                "category": category_name,
                "format": "json",
            },
            timeout=_REQUEST_TIMEOUT,
        )
        hits = listing.json()["_d_"]["hits"]["hits"]

        for idx, data in enumerate(hits):
            count = len(os.listdir(category_name))
            print("Files Count: ", count)
            if count >= limit:
                break

            pdf_id = data["_id"]
            document_type = data["_source"]["document_type"]
            if is_other and document_type == "":
                continue
            if is_generic and pdf_id in _SKIPPED_PDF_IDS:
                continue

            file_name = _pdf_file_name(
                pdf_id, document_type, data["_source"]["caption"]
            )
            print(idx, category_name, pdf_id, document_type, file_name)
            try:
                # verify=False is kept from the original call; it only
                # matters if the server redirects to HTTPS.
                pdf = requests.get(
                    f"{_API_ROOT}/local/{pdf_id}/",
                    verify=False,
                    timeout=_REQUEST_TIMEOUT,
                )
                # Do not store an HTTP error page under a .pdf name.
                pdf.raise_for_status()
                with open(os.path.join(category_name, file_name), "wb") as pdf_file:
                    pdf_file.write(pdf.content)
            except OSError as exc:
                # requests' exceptions derive from IOError/OSError, so this
                # covers both download and file-system failures. One bad
                # document is reported and skipped instead of ending the run.
                print(f"Skipping {pdf_id}: {exc}")

    if len(os.listdir(category_name)) >= limit:
        print(f"{limit} Pdfs in the directory")


def get_pdfs2(firm_name, category_names):
    """Download PDFs for one or more document categories.

    For every category a directory with the category's name is deleted (if
    present) and recreated in the current working directory, then filled with
    up to 100 PDFs (95 for "Objection") found by searching the document
    service for ``firm_name`` within that category. Example categories:
    Exhibit, Other, Preliminary Response, Petition, Notice, Motion,
    Power Of Attorney, Order, Decision Affirmed, Objection.

    Args:
        firm_name: Search term sent as the ``q`` query parameter.
        category_names: A single category name, or an iterable of category
            names. A single string is handled exactly like a one-item list.

    Returns:
        None. Files are written as ``<category>/<id>-<type>-<caption>.pdf``.

    Raises:
        requests.RequestException: If a listing request fails.
        KeyError: If a listing response lacks the expected JSON keys.
    """
    if isinstance(category_names, str):
        category_names = [category_names]
    for category_name in category_names:
        _download_category(firm_name, category_name)


if __name__ == "__main__":
    # Catalogue of category names; it is not passed to get_pdfs2 below.
    categories = [
        "Other", "Exhibit", "Preliminary Response", "Petition", "Notice",
        "Motion", "Order", "Power Of Attorney", "Decision Affirmed",
        "Objection",
    ]
    # This run fetches a single category, supplied as a one-item list.
    firm = "Apple"
    category = ["Power Of Attorney"]
    get_pdfs2(firm, category)