import json
import os
import shutil

import requests

# Host that serves both the document search API and the PDF files.
_BASE_URL = "http://50.211.199.148:8000"
_SEARCH_URL = f"{_BASE_URL}/api/documents/"

# Last result page that is ever requested (inclusive).
_LAST_PAGE = 29

# Seconds to wait on the server before giving up on a request; without a
# timeout `requests` waits indefinitely.
_REQUEST_TIMEOUT = 60

# Document ids that are never downloaded for ordinary categories in list mode.
# NOTE(review): the reason this id is excluded is not recorded in the code.
_SKIPPED_PDF_IDS = frozenset({"253989"})


def _clean_title(title):
    """Return the caption text before the first ".pdf", minus one trailing dot.

    An empty or missing caption yields an empty string.
    """
    stem = (title or "").split(".pdf")[0]
    if stem.endswith("."):
        stem = stem[:-1]
    return stem


def _safe_component(value):
    """Return *value* as text with "/" replaced by a space.

    A "/" inside a file name would be read as a directory separator and make
    ``open`` fail because that directory does not exist.
    """
    return str(value).replace("/", " ")


def _reset_directory(path):
    """Delete *path* if it exists and create it again, empty."""
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)


def _fetch_options(category_name, from_list):
    """Return the keyword options ``_download_category`` uses for a category.

    The values reproduce the per-branch rules of the earlier copy-pasted
    implementation.
    NOTE(review): why ordinary categories start at page 2, why "Objection"
    stops at 95 files, and why the skip rules differ between list mode and
    single-category mode is not visible in the code -- confirm before unifying.
    """
    if category_name == "Objection":
        return {"first_page": 1, "limit": 95}
    if not from_list:
        return {"first_page": 2, "limit": 100, "skip_blank_type": True}
    if category_name == "Other":
        return {"first_page": 1, "limit": 100, "skip_blank_type": True}
    return {"first_page": 2, "limit": 100, "skip_ids": _SKIPPED_PDF_IDS}


def _search_page(firm_name, category_name, page):
    """Return the list of search hits on one result page.

    Raises:
        requests.RequestException: the request failed, timed out, or the
            server answered with an error status.
        KeyError: the JSON payload lacks the ``_d_ -> hits -> hits`` path.
    """
    response = requests.get(
        _SEARCH_URL,
        # Passing a dict lets requests URL-encode the values.
        params={
            "collection": "Documents",
            "q": firm_name,
            "page": page,
            "category": category_name,
            "format": "json",
        },
        timeout=_REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()["_d_"]["hits"]["hits"]


def _download_pdf(pdf_id, destination):
    """Download document *pdf_id* to *destination*.

    Returns True when the file was written. Failures are reported on stdout
    and skipped, so one bad document does not abort the whole run.
    """
    try:
        # The URL is plain http, so no TLS verification option is needed.
        response = requests.get(
            f"{_BASE_URL}/local/{pdf_id}/", timeout=_REQUEST_TIMEOUT
        )
        # Do not store an HTTP error page under a .pdf name.
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {pdf_id}: download failed ({exc})")
        return False

    try:
        with open(destination, "wb") as pdf_file:
            pdf_file.write(response.content)
    except OSError as exc:
        print(f"Skipping {pdf_id}: cannot write {destination!r} ({exc})")
        return False
    return True


def _is_full(directory, limit):
    """Print the current file count and report whether *limit* is reached."""
    count = len(os.listdir(directory))
    print("Files Count: ", count)
    if count >= limit:
        print(f"{limit} Pdfs in the directory")
        return True
    return False


def _download_category(
    firm_name,
    category_name,
    *,
    first_page,
    limit,
    skip_blank_type=False,
    skip_ids=frozenset(),
):
    """Fill the directory *category_name* with up to *limit* PDFs.

    Pages ``first_page`` .. ``_LAST_PAGE`` of the search results are walked in
    order; the function returns as soon as the directory holds *limit* files.

    Args:
        firm_name: Search text sent as the ``q`` parameter.
        category_name: Category filter; also the output directory name.
        first_page: First result page to request.
        limit: Number of files at which downloading stops.
        skip_blank_type: Skip hits whose ``document_type`` is an empty string.
        skip_ids: Document ids that must not be downloaded.
    """
    for page in range(first_page, _LAST_PAGE + 1):
        # Checked before the request so no page is fetched once we are done.
        if _is_full(category_name, limit):
            return
        for index, hit in enumerate(_search_page(firm_name, category_name, page)):
            pdf_id = hit["_id"]
            doc_type = hit["_source"]["document_type"]
            title = hit["_source"]["caption"]

            if pdf_id in skip_ids:
                continue
            if skip_blank_type and doc_type == "":
                continue

            doc_type_name = _safe_component(doc_type)
            pdf_name = _safe_component(_clean_title(title))
            print(index, category_name, pdf_id, doc_type_name, pdf_name)

            destination = os.path.join(
                category_name, f"{pdf_id}-{doc_type_name}-{pdf_name}.pdf"
            )
            if _download_pdf(pdf_id, destination) and _is_full(category_name, limit):
                return


def get_pdfs2(firm_name, category_names):
    """Download PDFs for one or more document categories.

    For every category a directory with the category's name is created in the
    current working directory and filled with up to 100 PDFs (95 for
    "Objection"), named ``<id>-<document type>-<caption>.pdf``.

    WARNING: an existing directory with the category's name is deleted first.

    Args:
        firm_name: Search text used to select documents, e.g. ``"Apple"``.
        category_names: A single category name, or a list/tuple of names, e.g.
            Exhibit, Other, Preliminary Response, Petition, Notice, Motion,
            Power Of Attorney, Order, Decision Affirmed, Objection.

    Raises:
        requests.RequestException: a search request failed.
        KeyError: a search response did not have the expected JSON layout.
    """
    from_list = isinstance(category_names, (list, tuple))
    names = category_names if from_list else [category_names]

    for category_name in names:
        _reset_directory(category_name)
        print(category_name)
        _download_category(
            firm_name,
            category_name,
            **_fetch_options(category_name, from_list),
        )


if __name__ == "__main__":
    # Category names listed for reference; only `category` below is used.
    categories = [
        "Other",
        "Exhibit",
        "Preliminary Response",
        "Petition",
        "Notice",
        "Motion",
        "Order",
        "Power Of Attorney",
        "Decision Affirmed",
        "Objection",
    ]
    category = ["Power Of Attorney"]
    get_pdfs2("Apple", category)