@@ -0,0 +1,238 @@
+import requests
+import json
+import os
+import shutil
# Seconds to wait on each HTTP connect/read; without a timeout requests can
# block forever on a stalled server.
_REQUEST_TIMEOUT = 30
# Listing pages are requested up to, but not including, this page number.
_PAGE_STOP = 30


def _download_rules(category_name, from_list):
    """Return the per-category download quirks as a dict.

    Keys: ``first_page`` (first listing page requested), ``limit`` (number of
    files to collect), ``skip_ids`` (document ids never downloaded),
    ``skip_untyped`` (skip hits whose ``document_type`` is empty) and
    ``tolerate_os_errors`` (report and skip a hit on ``OSError`` instead of
    aborting).
    """
    # NOTE(review): the reasons for the 95-file cap on "Objection", for
    # starting most categories at page 2, and for excluding id "253989" are
    # not visible in this file; the values are carried over unchanged.
    if category_name == "Objection":
        return {
            "first_page": 1,
            "limit": 95,
            "skip_ids": (),
            "skip_untyped": False,
            "tolerate_os_errors": False,
        }
    if from_list and category_name == "Other":
        return {
            "first_page": 1,
            "limit": 100,
            "skip_ids": (),
            "skip_untyped": True,
            "tolerate_os_errors": True,
        }
    if from_list:
        return {
            "first_page": 2,
            "limit": 100,
            "skip_ids": ("253989",),
            "skip_untyped": False,
            "tolerate_os_errors": False,
        }
    return {
        "first_page": 2,
        "limit": 100,
        "skip_ids": (),
        "skip_untyped": True,
        "tolerate_os_errors": False,
    }


def _pdf_file_name(pdf_id, document_type, caption):
    """Build the file name ``<id>-<document_type>-<caption stem>.pdf``."""
    # Keep only the text before the first ".pdf" and drop one trailing dot.
    stem = caption.split(".pdf")[0]
    if stem.endswith("."):  # also safe when the caption is empty
        stem = stem[:-1]
    # "/" would be read as a directory separator when the file is opened.
    return f"{pdf_id}-{document_type}-{stem}.pdf".replace("/", " ")


def _save_pdf(http, pdf_id, path):
    """Download one document to ``path``; return True if a file was written."""
    # SECURITY NOTE(review): verify=False disables TLS certificate checks.
    # It is kept because the original code relied on it; confirm whether the
    # server's certificate can be validated instead.
    response = http.get(f"{pdf_id}/", verify=False, timeout=_REQUEST_TIMEOUT)
    if not response.ok:
        # An HTTP error body is not a PDF, so do not store it as one.
        print(f"Skipping {pdf_id}: download failed")
        return False
    with open(path, "wb") as pdf_file:
        pdf_file.write(response.content)
    return True


def _fill_category_dir(http, firm_name, category_name, rules):
    """Download documents of one category until ``rules["limit"]`` files exist."""
    limit = rules["limit"]
    for page in range(rules["first_page"], _PAGE_STOP):
        listing = http.get(
            f"{str(firm_name)}&page={page}&category={str(category_name)}&format=json",
            timeout=_REQUEST_TIMEOUT,
        )
        hits = listing.json()["_d_"]["hits"]["hits"]
        for idx, hit in enumerate(hits):
            pdf_id = hit["_id"]
            document_type = hit["_source"]["document_type"]
            caption = hit["_source"]["caption"]
            if pdf_id in rules["skip_ids"]:
                continue
            if rules["skip_untyped"] and document_type == "":
                continue
            file_name = _pdf_file_name(pdf_id, document_type, caption)
            print(idx, category_name, pdf_id, document_type, file_name)
            try:
                saved = _save_pdf(http, pdf_id, f"{category_name}/{file_name}")
            except OSError as exc:
                if not rules["tolerate_os_errors"]:
                    raise
                # Best-effort category: report the failure and move on.
                print(f"Skipping {pdf_id}: {exc}")
                continue
            if not saved:
                continue
            # Count files on disk so a hit that overwrote an existing file
            # is not counted twice.
            count = len(os.listdir(category_name))
            print("Files Count: ", count)
            if count >= limit:
                # Stop immediately: no further hits or listing pages needed.
                print(f"{limit} Pdfs in the directory")
                return


def get_pdfs2(firm_name, category_names, *, session=None):
    """Download PDFs for one or more document categories of a firm.

    For every category a directory named after the category is (re)created in
    the current working directory and filled with up to 100 PDFs (95 for
    "Objection"), named ``<id>-<document_type>-<caption>.pdf``. Known category
    names: Exhibit, Other, Preliminary Response, Petition, Notice, Motion,
    Power Of Attorney, Order, Decision Affirmed, Objection.

    NOTE(review): the URL templates used here contain no scheme or host as
    written; presumably a base URL was removed from this file. Confirm before
    running.

    Args:
        firm_name: Value placed at the start of every listing URL.
        category_names: A list/tuple of category names, or a single category
            name. A single name uses slightly different rules than the same
            name inside a list (see ``_download_rules``).
        session: Optional object with a requests-compatible
            ``get(url, **kwargs)`` method, e.g. a ``requests.Session`` for
            connection reuse. Defaults to the ``requests`` module.

    Returns:
        None. Files are written to disk and progress is printed.

    Raises:
        OSError: On file-system or network failures, except for "Other"
            inside a list, where the affected document is skipped.
        KeyError: If a listing response lacks the expected JSON fields.
    """
    http = session if session is not None else requests
    from_list = isinstance(category_names, (list, tuple))
    names = category_names if from_list else [category_names]
    for category_name in names:
        # Start from an empty directory so the file count reflects this run.
        if os.path.exists(category_name):
            shutil.rmtree(category_name)
        os.makedirs(category_name)
        print(category_name)
        _fill_category_dir(
            http,
            firm_name,
            category_name,
            _download_rules(category_name, from_list),
        )
if __name__ == "__main__":
    # Candidate category names; this list is not used by the call below.
    categories = [
        "Other", "Exhibit", "Preliminary Response", "Petition", "Notice",
        "Motion", "Order", "Power Of Attorney", "Decision Affirmed",
        "Objection",
    ]

    # Script entry point: fetch only the "Power Of Attorney" documents for
    # the firm "Apple".
    category = ["Power Of Attorney"]
    get_pdfs2(firm_name="Apple", category_names=category)