123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238 |
- import requests
- import json
- import os
- import shutil
_API_ROOT = "http://50.211.199.148:8000"
# Seconds to wait on the server before giving up, so one stalled request
# cannot hang the whole run.
_REQUEST_TIMEOUT = 60
# Result pages are requested up to and including this page number.
_LAST_PAGE = 29
_DEFAULT_LIMIT = 100
_OBJECTION_LIMIT = 95
# Document ids that are never downloaded for the regular categories.
_EXCLUDED_IDS = frozenset({"253989"})


def _reset_directory(path):
    """Delete *path* if it exists and recreate it as an empty directory."""
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)


def _safe_stem(caption):
    """Turn a document caption into a string usable inside a file name.

    Keeps the text before the first ".pdf", drops a single trailing dot and
    replaces "/" with a space so the result cannot be read as a sub-path.
    """
    stem = caption.split(".pdf")[0]
    if stem.endswith("."):
        stem = stem[:-1]
    return stem.replace("/", " ")


def _fetch_hits(query, category_name, page):
    """Return the list of search hits for one result page of a category."""
    url = (
        f"{_API_ROOT}/api/documents/?collection=Documents&q={query}"
        f"&page={page}&category={category_name}&format=json"
    )
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return response.json()["_d_"]["hits"]["hits"]


def _download_category(
    query, category_name, first_page, limit, skip_untyped, excluded_ids
):
    """Download PDFs of one category into the directory *category_name*.

    Pages ``first_page`` .. ``_LAST_PAGE`` are walked in order and paging
    stops as soon as the directory holds ``limit`` files.  Hits whose id is
    in ``excluded_ids`` are skipped, as are hits with an empty document type
    when ``skip_untyped`` is true.  A document that cannot be fetched or
    written is reported and skipped instead of aborting the run.
    """
    for page in range(first_page, _LAST_PAGE + 1):
        # Checked before the request so no further page is fetched once full.
        if len(os.listdir(category_name)) >= limit:
            return
        for idx, hit in enumerate(_fetch_hits(query, category_name, page)):
            count = len(os.listdir(category_name))
            print("Files Count: ", count)
            if count >= limit:
                print(f"{limit} Pdfs in the directory")
                return

            pdf_id = hit["_id"]
            doc_type = hit["_source"]["document_type"]
            if pdf_id in excluded_ids or (skip_untyped and doc_type == ""):
                continue

            pdf_name = _safe_stem(hit["_source"]["caption"])
            doc_type = doc_type.replace("/", " ")
            print(idx, pdf_id, doc_type, pdf_name)
            target = os.path.join(
                category_name, f"{pdf_id}-{doc_type}-{pdf_name}.pdf"
            )
            try:
                response = requests.get(
                    f"{_API_ROOT}/local/{pdf_id}/",
                    verify=False,
                    timeout=_REQUEST_TIMEOUT,
                )
                # Without this an HTTP error page would be saved as a ".pdf".
                response.raise_for_status()
                with open(target, "wb") as pdf_file:
                    pdf_file.write(response.content)
            except OSError as exc:
                # requests' exceptions derive from OSError too, so this covers
                # both network failures and unwritable (e.g. over-long) names.
                print(f"Skipping {pdf_id}: {exc}")


def get_pdfs2(firm_name, category_names):
    """Download PDFs matching *firm_name* for one or more categories.

    Categories named by the callers in this file are: Exhibit, Other,
    Preliminary Response, Petition, Notice, Motion, Power Of Attorney, Order,
    Decision Affirmed and Objection.

    For every category a directory with the category's name is deleted (if
    present) and recreated relative to the current working directory, then
    filled with at most 100 files (95 for "Objection") named
    ``<id>-<document type>-<caption>.pdf``.

    Args:
        firm_name: Search text sent as the ``q`` parameter of every query.
        category_names: A list of category names, or a single category name;
            a single name is handled exactly like a one-element list.

    Returns:
        None.  The downloaded files are the only result.
    """
    if not isinstance(category_names, list):
        category_names = [category_names]

    for category_name in category_names:
        _reset_directory(category_name)
        print(category_name)

        is_objection = category_name == "Objection"
        is_other = category_name == "Other"
        # NOTE(review): regular categories start at page 2 while "Other" and
        # "Objection" start at page 1; the reason is not evident from this
        # file -- confirm before changing.
        first_page = 1 if (is_other or is_objection) else 2
        _download_category(
            firm_name,
            category_name,
            first_page=first_page,
            limit=_OBJECTION_LIMIT if is_objection else _DEFAULT_LIMIT,
            skip_untyped=is_other,
            excluded_ids=(
                frozenset() if (is_other or is_objection) else _EXCLUDED_IDS
            ),
        )
if __name__ == "__main__":
    # Reference list of the category names used with get_pdfs2; it is not
    # passed to the call below and only documents the available choices.
    categories = [
        "Other", "Exhibit", "Preliminary Response", "Petition", "Notice",
        "Motion", "Order", "Power Of Attorney", "Decision Affirmed",
        "Objection",
    ]

    # Categories actually downloaded by this run, for the firm "Apple".
    category = ["Power Of Attorney"]
    get_pdfs2(firm_name="Apple", category_names=category)
|