浏览代码

added server documents folder

Harsh Parikh 2 年之前
父节点
当前提交
5c62928def

+ 0 - 0
server_documents/Decision Affirmed/main.py


+ 47 - 0
server_documents/Decision Affirmed/parse_pdf_utils.py

@@ -0,0 +1,47 @@
+import re
+
+
+def extract_filing_date(content):
+    """
+    Extract the first date-like string found in a document's text.
+
+    Two shapes are recognised: a word (optionally followed by a period),
+    a one- or two-digit number, a comma and a four-digit number
+    (e.g. "Jan. 5, 2020" or "January 5, 2020"), or a fully numeric
+    "NN/NN/NNNN" date.
+
+    :param content: text of the document.
+    :return: the first matching substring, or the string "None" (not the
+        ``None`` object) when nothing matches or ``content`` is not text.
+    """
+    pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
+    try:
+        match = re.search(pattern, content)
+    except TypeError:
+        # Non-text input (e.g. None) is treated like "no date found", which
+        # is what callers of the previous catch-all handler already received.
+        return "None"
+    return match.group(0) if match else "None"
+
+
+def email_extraction(content):
+    """
+    Extract every e-mail address found in a document's text.
+
+    The local part may contain word characters, dots, plus signs and
+    hyphens; the domain may have several dot-separated labels and must end
+    in an alphabetic label of at least two letters. Matching is
+    case-insensitive and the original casing is returned.
+
+    :param content: text of the document.
+    :return: list of matched addresses in order of appearance (empty when
+        there are none).
+    """
+    # The earlier pattern allowed neither dots in the local part nor more
+    # than one domain label / three TLD letters, so an address such as
+    # "first.last@mail.example.com" was returned truncated.
+    regex = r"\w[\w.+-]*@(?:[a-z0-9-]+\.)+[a-z]{2,}"
+    emails = re.compile(regex, re.IGNORECASE)
+    return [email.group() for email in emails.finditer(content)]
+
+
+def telephone_number_extraction(content):
+    """
+    Extract telephone numbers from a document's text.
+
+    A number is three digits (optionally wrapped in parentheses), a hyphen
+    or whitespace character, three digits, another hyphen or whitespace
+    character, and four digits -- e.g. "(555) 123-4567" or "555-123-4567".
+
+    :param content: text of the document.
+    :return: list of matched numbers in order of appearance (empty when
+        there are none).
+    """
+    # Inside a character class "|" is a literal pipe, not alternation, so it
+    # is left out: "555|123|4567" is not a telephone number.
+    regex = r"\(?\d{3}\)?[\-\s]\d{3}[\-\s]\d{4}"
+    numbers = re.compile(regex)
+    return [number.group() for number in numbers.finditer(content)]
+
+
+def address_extraction(content):
+    """
+    Extract the first postal-address-like block from a document's text.
+
+    The pattern looks for three consecutive lines that each start with a
+    word character, where the third contains two to four word characters,
+    a whitespace character and five digits (e.g. "IL 62704").
+
+    :param content: text of the document.
+    :return: the matched three-line substring, or the string "None" (not
+        the ``None`` object) when nothing matches or ``content`` is not
+        text.
+    """
+    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
+    try:
+        match = re.search(regex_address, content)
+    except TypeError:
+        # Non-text input (e.g. None) is treated like "no address found",
+        # which is what callers of the previous catch-all handler received.
+        return "None"
+    return match.group(0) if match else "None"

+ 238 - 0
server_documents/api_get_data.py

@@ -0,0 +1,238 @@
+import requests
+import json
+import os
+import shutil
+
+
+def get_pdfs2(firm_name, category_names):
+    """Download PDFs of one or more document categories into per-category folders.
+
+    For each category a directory named after it is created in the current
+    working directory (an existing directory of that name is deleted
+    first). Search-result pages are then fetched from the documents API and
+    each hit's PDF is saved as ``<category>/<id>-<document_type>-<caption>.pdf``
+    until the directory holds the category's file limit (95 for
+    "Objection", 100 otherwise) or page 29 has been processed.
+
+    Categories mentioned by the original author: Exhibit, Other,
+    Preliminary Response, Petition, Notice, Motion, Power of Attorney,
+    Order, Decision Affirmed, Objection.
+
+    :param firm_name: search term sent as the ``q`` query parameter.
+    :param category_names: a list of category names, or a single category
+        name.
+    :return: None; the result is the files written to disk.
+    """
+    if isinstance(category_names, list):
+        for category_name in category_names:
+            _reset_directory(category_name)
+            print(category_name)
+            _download_category(
+                firm_name,
+                category_name,
+                **_category_options(category_name, from_list=True),
+            )
+        return
+
+    _reset_directory(category_names)
+    # NOTE(review): for a single non-"Objection" category the search term has
+    # always been the category name, not firm_name. That looks like a
+    # copy-paste slip but is preserved here -- confirm before changing.
+    query = firm_name if category_names == "Objection" else category_names
+    _download_category(
+        query,
+        category_names,
+        **_category_options(category_names, from_list=False),
+    )
+
+
+def _category_options(category_name, from_list):
+    """Return the paging, limit and skip rules used for one category."""
+    if category_name == "Objection":
+        return {"first_page": 1, "limit": 95}
+    if not from_list:
+        return {"first_page": 2, "limit": 100, "skip_untyped": True}
+    if category_name == "Other":
+        return {
+            "first_page": 1,
+            "limit": 100,
+            "skip_untyped": True,
+            "ignore_os_errors": True,
+        }
+    return {"first_page": 2, "limit": 100, "skip_ids": ("253989",)}
+
+
+def _reset_directory(path):
+    """Delete ``path`` if it exists, then create it empty."""
+    if os.path.exists(path):
+        shutil.rmtree(path)
+    os.makedirs(path)
+
+
+def _caption_to_stem(caption):
+    """Return the caption text before ".pdf", minus one trailing period."""
+    stem = caption.split(".pdf")[0]
+    # endswith() is safe on an empty stem, where indexing [-1] would raise.
+    if stem.endswith("."):
+        stem = stem[:-1]
+    return stem
+
+
+def _save_pdf(pdf_id, path):
+    """Download the PDF with the given id and write its bytes to ``path``."""
+    response2 = requests.get(
+        f"http://50.211.199.148:8000/local/{pdf_id}/",
+        verify=False,
+    )
+    with open(path, "wb") as pdf_file:
+        pdf_file.write(response2.content)
+
+
+def _download_category(
+    query,
+    category_name,
+    first_page,
+    limit,
+    skip_ids=(),
+    skip_untyped=False,
+    ignore_os_errors=False,
+):
+    """Save hits from pages ``first_page``..29 until the folder holds ``limit`` files.
+
+    :param query: value of the ``q`` search parameter.
+    :param category_name: category filter; also the target directory.
+    :param first_page: first result page to request.
+    :param limit: stop once the directory contains this many files.
+    :param skip_ids: document ids that are never downloaded.
+    :param skip_untyped: skip hits whose ``document_type`` is empty.
+    :param ignore_os_errors: skip a document instead of failing when the
+        download or the file write raises ``OSError``.
+    """
+    for pg_no in range(first_page, 30):
+        # Checked before each request so that no further page is fetched
+        # once the folder is full.
+        if len(os.listdir(category_name)) >= limit:
+            return
+        response1 = requests.get(
+            f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(query)}&page={pg_no}&category={str(category_name)}&format=json"
+        )
+        hits = response1.json()["_d_"]["hits"]["hits"]
+        for idx, data in enumerate(hits):
+            # Counting files on disk (rather than iterations) means skipped
+            # or failed documents do not count towards the limit.
+            count = len(os.listdir(category_name))
+            print("Files Count: ", count)
+            if count >= limit:
+                print(f"{limit} Pdfs in the directory")
+                return
+            pdf_id = data["_id"]
+            category = data["_source"]["document_type"]
+            title = data["_source"]["caption"]
+            if pdf_id in skip_ids or (skip_untyped and category == ""):
+                continue
+            # "/" would be read as a path separator and point into a
+            # directory that does not exist, so it is replaced by a space.
+            pdf_name = _caption_to_stem(title).replace("/", " ")
+            category = category.replace("/", " ")
+            print(idx, category_name, pdf_id, category, pdf_name)
+            try:
+                _save_pdf(
+                    pdf_id, f"{category_name}/{pdf_id}-{category}-{pdf_name}.pdf"
+                )
+            except OSError:
+                if not ignore_os_errors:
+                    raise
+
+
+if __name__ == "__main__":
+    # Category names kept for reference; only `category` below is passed to
+    # get_pdfs2 when the script runs.
+    categories = [
+        "Other",
+        "Exhibit",
+        "Preliminary Response",
+        "Petition",
+        "Notice",
+        "Motion",
+        "Order",
+        "Power Of Attorney",
+        "Decision Affirmed",
+        "Objection",
+    ]
+    # Download a single category for the search term "Apple".
+    category = ["Power Of Attorney"]
+    get_pdfs2(firm_name="Apple", category_names=category)

+ 17 - 0
server_documents/crawl.py

@@ -0,0 +1,17 @@
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+import time
+
+# Open the document-search page in Firefox, type "Apple" into the search box
+# and click the search button. The browser is left open afterwards.
+driver = webdriver.Firefox()
+driver.get("http://50.211.199.148:3002/#/search/documents")
+time.sleep(2)
+# The implicit wait stays in effect for every later find_element call on this
+# driver, so it is set once rather than before each lookup.
+driver.implicitly_wait(2)
+# send_keys() and click() return None, so their results are not kept.
+driver.find_element(
+    "xpath", "//input[@placeholder='Example: Apple, CUDA']"
+).send_keys("Apple")
+time.sleep(2)
+driver.find_element("xpath", "(//button[@class='btn btn-info'])[2]").click()