|
@@ -0,0 +1,238 @@
|
|
|
+import requests
|
|
|
+import json
|
|
|
+import os
|
|
|
+import shutil
|
|
|
+
|
|
|
+
|
|
|
# Base address of the document service that get_pdfs2 talks to.
_API_ROOT = "http://50.211.199.148:8000"
# Result pages are requested up to and including this page number.
_LAST_PAGE = 29
# Seconds to wait on a single HTTP request before giving up.
_HTTP_TIMEOUT = 60


def _selection_rules(category_name, from_collection):
    """Return ``(first_page, limit, skip_untyped, skipped_ids)`` for a category.

    The values reproduce the earlier branch-per-category implementation
    exactly.  NOTE(review): why page 1 is skipped for most categories, why
    "Objection" stops at 95 files and why document 253989 is excluded is not
    evident from this file -- confirm before changing any of them.
    """
    if category_name == "Objection":
        return 1, 95, False, ()
    if not from_collection:
        # A category given as a bare string always used these settings.
        return 2, 100, True, ()
    if category_name == "Other":
        return 1, 100, True, ()
    return 2, 100, False, ("253989",)


def _caption_stem(caption):
    """Return the caption up to its first ".pdf", minus one trailing dot."""
    stem = (caption or "").split(".pdf")[0]
    # endswith() is safe on an empty stem, unlike indexing the last character.
    return stem[:-1] if stem.endswith(".") else stem


def _target_path(directory, pdf_id, document_type, caption):
    """Build the output path ``<directory>/<id>-<type>-<caption>.pdf``."""
    # "/" inside a component would be read as a directory separator and make
    # open() fail, so it is replaced with a space in both components.
    safe_type = str(document_type).replace("/", " ")
    safe_name = _caption_stem(caption).replace("/", " ")
    return os.path.join(directory, f"{pdf_id}-{safe_type}-{safe_name}.pdf")


def _fetch_hits(firm_name, category_name, page):
    """Return the list of search hits on one result page."""
    response = requests.get(
        f"{_API_ROOT}/api/documents/",
        # Passing params lets requests URL-encode spaces and other characters.
        params={
            "collection": "Documents",
            "q": firm_name,
            "page": page,
            "category": category_name,
            "format": "json",
        },
        timeout=_HTTP_TIMEOUT,
    )
    response.raise_for_status()
    return response.json()["_d_"]["hits"]["hits"]


def _save_pdf(pdf_id, path):
    """Download document *pdf_id* to *path*; return True if it was written."""
    try:
        # NOTE(review): verify=False is kept from the original call; it only
        # matters if the server redirects to HTTPS.
        response = requests.get(
            f"{_API_ROOT}/local/{pdf_id}/",
            verify=False,
            timeout=_HTTP_TIMEOUT,
        )
        if not response.ok:
            # Do not store an error page under a .pdf name.
            print(f"Skipping {pdf_id}: HTTP {response.status_code}")
            return False
        with open(path, "wb") as pdf_file:
            pdf_file.write(response.content)
    except (requests.RequestException, OSError) as exc:
        # Best effort per file (e.g. a file name the filesystem rejects):
        # report it and move on instead of aborting the whole run.
        print(f"Skipping {pdf_id}: {exc}")
        return False
    return True


def _download_category(firm_name, category_name, from_collection):
    """Fill directory *category_name* with PDFs until its limit is reached."""
    first_page, limit, skip_untyped, skipped_ids = _selection_rules(
        category_name, from_collection
    )
    written = set()  # paths saved so far; replaces os.listdir() per hit
    for page in range(first_page, _LAST_PAGE + 1):
        hits = _fetch_hits(firm_name, category_name, page)
        if not hits:
            # An empty page means there is nothing further to fetch.
            break
        for idx, hit in enumerate(hits):
            print("Files Count: ", len(written))
            pdf_id = hit["_id"]
            document_type = hit["_source"]["document_type"]
            caption = hit["_source"]["caption"]
            if pdf_id in skipped_ids:
                continue
            if skip_untyped and document_type == "":
                continue
            path = _target_path(category_name, pdf_id, document_type, caption)
            print(idx, category_name, pdf_id, document_type, path)
            if _save_pdf(pdf_id, path):
                written.add(path)
            if len(written) >= limit:
                print(f"{limit} Pdfs in the directory")
                return


def get_pdfs2(firm_name, category_names):
    """Download PDFs for one or more document categories into directories.

    For every category a directory named after the category is created in
    the current working directory.  An existing directory of that name is
    DELETED first.  Search results for ``firm_name`` restricted to the
    category are then paged through and each hit is saved as
    ``<category>/<id>-<document_type>-<caption>.pdf`` until the category's
    limit is reached (100 files; 95 for "Objection"), the results run out,
    or page 29 has been processed.

    Categories named in the original docstring: Exhibit, Other, Preliminary
    Response, Petition, Notice, Motion, Power of Attorney, Order, Decision
    Affirmed, Objection.

    Args:
        firm_name: Search text sent as the ``q`` query parameter.
        category_names: A single category name (``str``) or an iterable of
            category names (list, tuple, ...).

    Returns:
        None.  The result is the files written to disk; progress is printed.

    Raises:
        requests.RequestException: If a result page cannot be fetched or the
            server answers it with an error status.  Failures on individual
            PDFs are printed and skipped instead.
    """
    from_collection = not isinstance(category_names, str)
    names = category_names if from_collection else [category_names]
    for category_name in names:
        # Start from an empty directory so the file count is accurate.
        if os.path.exists(category_name):
            shutil.rmtree(category_name)
        os.makedirs(category_name)
        print(category_name)
        _download_category(firm_name, category_name, from_collection)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # The single category fetched by this demo run.
    category = ["Power Of Attorney"]

    # Category labels kept for reference; nothing below reads this list.
    categories = [
        "Other",
        "Exhibit",
        "Preliminary Response",
        "Petition",
        "Notice",
        "Motion",
        "Order",
        "Power Of Attorney",
        "Decision Affirmed",
        "Objection",
    ]

    # Download the selected category for the search term "Apple".
    get_pdfs2(firm_name="Apple", category_names=category)
|