api_get_data.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. import requests
  2. import json
  3. import os
  4. import shutil
  5. def get_pdfs2(firm_name, category_names):
  6. """Gets 100 pdfs from a specific category ie(Exhibit, Other,
  7. Preliminary Response, Petition,Notice,Motion, Power of Attorney, Order,
  8. Decision Affirmed, Objection)"""
  9. if type(category_names) == list:
  10. """If category_names is list"""
  11. for category_name in category_names:
  12. if os.path.exists(category_name):
  13. shutil.rmtree(category_name)
  14. os.makedirs(category_name)
  15. print(category_name)
  16. if (category_name != "Other") & (category_name != "Objection"):
  17. """Here except Other and Objection Category will be parsed and 100 pdfs of specific category are obtained"""
  18. for pg_no in range(2, 30):
  19. response1 = requests.get(
  20. f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(firm_name)}&page={pg_no}&category={str(category_name)}&format=json"
  21. )
  22. json_data = response1.json()
  23. q = json_data["_d_"]["hits"]["hits"]
  24. for i, data in enumerate(q):
  25. count = len(os.listdir(category_name))
  26. print("Files Count: ", count)
  27. if count < 100:
  28. idx, pdf_id, category, title = (
  29. i,
  30. data["_id"],
  31. data["_source"]["document_type"],
  32. data["_source"]["caption"],
  33. )
  34. if pdf_id == "253989":
  35. continue
  36. li = list(title.split(".pdf")[0])
  37. if li[-1] == ".":
  38. li[-1] = ""
  39. pdf_name = "".join(li)
  40. print(idx, category_name, pdf_id, category, pdf_name)
  41. response2 = requests.get(
  42. f"http://50.211.199.148:8000/local/{pdf_id}/",
  43. verify=False,
  44. ) # if condition which doesnt contains Objection and Other
  45. with open(
  46. f"{category_name}/{pdf_id}-{category}-{pdf_name}.pdf",
  47. "wb",
  48. ) as pdf_file:
  49. pdf_file.write(response2.content)
  50. elif count > 100:
  51. print("100 Pdfs in the directory")
  52. break
  53. if count == 100:
  54. break
  55. elif category_name == "Other":
  56. """Here Other Category will be parsed and 100 pdfs are obtained"""
  57. for pg_no in range(1, 30):
  58. response1 = requests.get(
  59. f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(firm_name)}&page={pg_no}&category={str(category_name)}&format=json"
  60. )
  61. json_data = response1.json()
  62. q = json_data["_d_"]["hits"]["hits"]
  63. for i, data in enumerate(q):
  64. count = len(os.listdir(category_name))
  65. print("Files Count: ", count)
  66. try:
  67. if count < 100:
  68. idx, pdf_id, category, title = (
  69. i,
  70. data["_id"],
  71. data["_source"]["document_type"],
  72. data["_source"]["caption"],
  73. )
  74. if category == "":
  75. continue
  76. li = list(title.split(".pdf")[0])
  77. if li[-1] == ".":
  78. li[-1] = ""
  79. pdf_name = "".join(li)
  80. # print(idx,category_name, pdf_id,category, pdf_name)
  81. pdf_name2 = pdf_name.replace("/", " ")
  82. print(pdf_name2)
  83. category_c = category.replace("/", " ")
  84. print(idx, pdf_id, category_c, pdf_name2)
  85. response2 = requests.get(
  86. f"http://50.211.199.148:8000/local/{pdf_id}/",
  87. verify=False,
  88. ) # elif condition with contains Other
  89. with open(
  90. f"{category_name}/{pdf_id}-{category_c}-{pdf_name2}.pdf",
  91. "wb",
  92. ) as pdf_file:
  93. pdf_file.write(response2.content)
  94. elif count > 100:
  95. print("100 Pdfs in the directory")
  96. break
  97. except OSError as exc:
  98. continue
  99. if count == 100:
  100. break
  101. elif category_name == "Objection":
  102. """Here Other Objection category will be parsed and 100 pdfs are obtained"""
  103. for pg_no in range(1, 30):
  104. response1 = requests.get(
  105. f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(firm_name)}&page={pg_no}&category={str(category_name)}&format=json"
  106. )
  107. json_data = response1.json()
  108. q = json_data["_d_"]["hits"]["hits"]
  109. for i, data in enumerate(q):
  110. count = len(os.listdir(category_name))
  111. print("Files Count: ", count)
  112. if count < 95:
  113. idx, pdf_id, category, title = (
  114. i,
  115. data["_id"],
  116. data["_source"]["document_type"],
  117. data["_source"]["caption"],
  118. )
  119. li = list(title.split(".pdf")[0])
  120. if li[-1] == ".":
  121. li[-1] = ""
  122. pdf_name = "".join(li)
  123. print(idx, pdf_id, category, pdf_name)
  124. response2 = requests.get(
  125. f"http://50.211.199.148:8000/local/{pdf_id}/",
  126. verify=False,
  127. )
  128. with open(
  129. f"{category_name}/{pdf_id}-{category}-{pdf_name}.pdf",
  130. "wb",
  131. ) as pdf_file:
  132. pdf_file.write(response2.content)
  133. elif count > 95:
  134. print("95 Pdfs in the directory")
  135. break
  136. if count == 95:
  137. break
  138. elif type(category_names) != list:
  139. """Only a specific category and category_names is not a list where 100 pdfs are obtained"""
  140. if os.path.exists(category_names):
  141. shutil.rmtree(category_names)
  142. os.makedirs(category_names)
  143. if category_names != "Objection":
  144. for pg_no in range(2, 30):
  145. response1 = requests.get(
  146. f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(category_names)}&page={pg_no}&category={str(category_names)}&format=json"
  147. )
  148. json_data = response1.json()
  149. q = json_data["_d_"]["hits"]["hits"]
  150. for i, data in enumerate(q):
  151. count = len(os.listdir(category_names))
  152. print("Files Count: ", count)
  153. if count < 100:
  154. idx, pdf_id, category, title = (
  155. i,
  156. data["_id"],
  157. data["_source"]["document_type"],
  158. data["_source"]["caption"],
  159. )
  160. if category == "":
  161. continue
  162. # if pdf_id == "253989":
  163. # continue
  164. print(title)
  165. li = list(title.split(".pdf")[0])
  166. if li[-1] == ".":
  167. li[-1] = ""
  168. pdf_name = "".join(li)
  169. print(idx, pdf_id, category, pdf_name)
  170. pdf_name2 = pdf_name.replace("/", " ")
  171. response2 = requests.get(
  172. f"http://50.211.199.148:8000/local/{pdf_id}/", verify=False
  173. ) # elif condition
  174. with open(
  175. f"{category_names}/{pdf_id}-{category}-{pdf_name}.pdf", "wb"
  176. ) as pdf_file:
  177. pdf_file.write(response2.content)
  178. elif count > 100:
  179. print("100 Pdfs in the directory")
  180. break
  181. if count == 100:
  182. break
  183. elif category_names == "Objection":
  184. for pg_no in range(1, 30):
  185. response1 = requests.get(
  186. f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(firm_name)}&page={pg_no}&category={str(category_names)}&format=json"
  187. )
  188. json_data = response1.json()
  189. q = json_data["_d_"]["hits"]["hits"]
  190. for i, data in enumerate(q):
  191. count = len(os.listdir(category_names))
  192. print("Files Count: ", count)
  193. if count < 95:
  194. idx, pdf_id, category, title = (
  195. i,
  196. data["_id"],
  197. data["_source"]["document_type"],
  198. data["_source"]["caption"],
  199. )
  200. li = list(title.split(".pdf")[0])
  201. if li[-1] == ".":
  202. li[-1] = ""
  203. pdf_name = "".join(li)
  204. print(idx, pdf_id, category, pdf_name)
  205. response2 = requests.get(
  206. f"http://50.211.199.148:8000/local/{pdf_id}/", verify=False
  207. )
  208. with open(
  209. f"{category_names}/{pdf_id}-{category}-{pdf_name}.pdf", "wb"
  210. ) as pdf_file:
  211. pdf_file.write(response2.content)
  212. elif count > 95:
  213. print("95 Pdfs in the directory")
  214. break
  215. if count == 95:
  216. break
  217. if __name__ == "__main__":
  218. categories = [
  219. "Other",
  220. "Exhibit",
  221. "Preliminary Response",
  222. "Petition",
  223. "Notice",
  224. "Motion",
  225. "Order",
  226. "Power Of Attorney",
  227. "Decision Affirmed",
  228. "Objection",
  229. ]
  230. category = ["Power Of Attorney"]
  231. get_pdfs2("Apple", category)