2 年之前 · 5c62928def
--- a/server_documents/Decision
+++ b/server_documents/Decision
--- a/Affirmed/parse_pdf_utils.py
+++ b/Affirmed/parse_pdf_utils.py
@@ -0,0 +1,47 @@
 
				+import re
			
 
				+
			
 
				+
			
 
				+def extract_filing_date(content):
			
 
				+    """
			
 
				+    extracts filing date from the documents.
			
 
				+    """
			
 
				+    pattern = r"\w+\.?\s\d{1,2}\,\s\d{4}|\d{2}\/\d{2}\/\d{4}"
			
 
				+    try:
			
 
				+        return re.search(pattern, content).group(0)
			
 
				+    except:
			
 
				+        return "None"
			
 
				+
			
 
				+
			
 
				+def email_extraction(content):
			
 
				+    """
			
 
				+    Extracts emails from a document.
			
 
				+    """
			
 
				+    regex = r"(\w+\-)?\w+@[a-z]+\.[a-z]{2,3}"
			
 
				+    result = []
			
 
				+    emails = re.compile(regex, re.IGNORECASE)
			
 
				+    for email in emails.finditer(content):
			
 
				+        result.append(email.group())
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def telephone_number_extraction(content):
			
 
				+    """
			
 
				+    Extracts telephone number[s?] from a document
			
 
				+    """
			
 
				+    regex = r"\(?\d{3}\)?[\-|\s]\d{3}[\s|\-]\d{4}"
			
 
				+    numbers = re.compile(regex, re.IGNORECASE)
			
 
				+    result = []
			
 
				+    for number in numbers.finditer(content):
			
 
				+        result.append(number.group())
			
 
				+    return result
			
 
				+
			
 
				+
			
 
				+def address_extraction(content):
			
 
				+    """
			
 
				+    extracts address from the documents.
			
 
				+    """
			
 
				+    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
			
 
				+    try:
			
 
				+        return re.search(regex_address, content).group(0)
			
 
				+    except:
			
 
				+        return "None"
			
--- a/server_documents/api_get_data.py
+++ b/server_documents/api_get_data.py
@@ -0,0 +1,238 @@
 
				+import requests
			
 
				+import json
			
 
				+import os
			
 
				+import shutil
			
 
				+
			
 
				+
			
 
				+def get_pdfs2(firm_name, category_names):
			
 
				+    """Gets 100 pdfs from a specific category ie(Exhibit, Other,
			
 
				+    Preliminary Response, Petition,Notice,Motion, Power of Attorney, Order,
			
 
				+    Decision Affirmed, Objection)"""
			
 
				+    if type(category_names) == list:
			
 
				+        """If category_names is list"""
			
 
				+        for category_name in category_names:
			
 
				+            if os.path.exists(category_name):
			
 
				+                shutil.rmtree(category_name)
			
 
				+            os.makedirs(category_name)
			
 
				+            print(category_name)
			
 
				+            if (category_name != "Other") & (category_name != "Objection"):
			
 
				+                """Here except Other and Objection Category will be parsed and 100 pdfs of specific category are obtained"""
			
 
				+                for pg_no in range(2, 30):
			
 
				+                    response1 = requests.get(
			
 
				+                        f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(firm_name)}&page={pg_no}&category={str(category_name)}&format=json"
			
 
				+                    )
			
 
				+                    json_data = response1.json()
			
 
				+                    q = json_data["_d_"]["hits"]["hits"]
			
 
				+
			
 
				+                    for i, data in enumerate(q):
			
 
				+                        count = len(os.listdir(category_name))
			
 
				+                        print("Files Count: ", count)
			
 
				+                        if count < 100:
			
 
				+                            idx, pdf_id, category, title = (
			
 
				+                                i,
			
 
				+                                data["_id"],
			
 
				+                                data["_source"]["document_type"],
			
 
				+                                data["_source"]["caption"],
			
 
				+                            )
			
 
				+                            if pdf_id == "253989":
			
 
				+                                continue
			
 
				+                            li = list(title.split(".pdf")[0])
			
 
				+                            if li[-1] == ".":
			
 
				+                                li[-1] = ""
			
 
				+                            pdf_name = "".join(li)
			
 
				+                            print(idx, category_name, pdf_id, category, pdf_name)
			
 
				+                            response2 = requests.get(
			
 
				+                                f"http://50.211.199.148:8000/local/{pdf_id}/",
			
 
				+                                verify=False,
			
 
				+                            )  # if condition which doesnt contains Objection and Other
			
 
				+                            with open(
			
 
				+                                f"{category_name}/{pdf_id}-{category}-{pdf_name}.pdf",
			
 
				+                                "wb",
			
 
				+                            ) as pdf_file:
			
 
				+                                pdf_file.write(response2.content)
			
 
				+                        elif count > 100:
			
 
				+                            print("100 Pdfs in the directory")
			
 
				+                            break
			
 
				+                    if count == 100:
			
 
				+                        break
			
 
				+            elif category_name == "Other":
			
 
				+                """Here Other Category will be parsed and 100 pdfs are obtained"""
			
 
				+                for pg_no in range(1, 30):
			
 
				+                    response1 = requests.get(
			
 
				+                        f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(firm_name)}&page={pg_no}&category={str(category_name)}&format=json"
			
 
				+                    )
			
 
				+                    json_data = response1.json()
			
 
				+                    q = json_data["_d_"]["hits"]["hits"]
			
 
				+                    for i, data in enumerate(q):
			
 
				+                        count = len(os.listdir(category_name))
			
 
				+                        print("Files Count: ", count)
			
 
				+                        try:
			
 
				+                            if count < 100:
			
 
				+
			
 
				+                                idx, pdf_id, category, title = (
			
 
				+                                    i,
			
 
				+                                    data["_id"],
			
 
				+                                    data["_source"]["document_type"],
			
 
				+                                    data["_source"]["caption"],
			
 
				+                                )
			
 
				+                                if category == "":
			
 
				+                                    continue
			
 
				+                                li = list(title.split(".pdf")[0])
			
 
				+                                if li[-1] == ".":
			
 
				+                                    li[-1] = ""
			
 
				+                                pdf_name = "".join(li)
			
 
				+                                # print(idx,category_name, pdf_id,category, pdf_name)
			
 
				+                                pdf_name2 = pdf_name.replace("/", " ")
			
 
				+                                print(pdf_name2)
			
 
				+                                category_c = category.replace("/", " ")
			
 
				+                                print(idx, pdf_id, category_c, pdf_name2)
			
 
				+                                response2 = requests.get(
			
 
				+                                    f"http://50.211.199.148:8000/local/{pdf_id}/",
			
 
				+                                    verify=False,
			
 
				+                                )  # elif condition with contains Other
			
 
				+                                with open(
			
 
				+                                    f"{category_name}/{pdf_id}-{category_c}-{pdf_name2}.pdf",
			
 
				+                                    "wb",
			
 
				+                                ) as pdf_file:
			
 
				+                                    pdf_file.write(response2.content)
			
 
				+                            elif count > 100:
			
 
				+                                print("100 Pdfs in the directory")
			
 
				+                                break
			
 
				+                        except OSError as exc:
			
 
				+                            continue
			
 
				+
			
 
				+                    if count == 100:
			
 
				+                        break
			
 
				+            elif category_name == "Objection":
			
 
				+                """Here Other Objection category will be parsed and 100 pdfs are obtained"""
			
 
				+                for pg_no in range(1, 30):
			
 
				+                    response1 = requests.get(
			
 
				+                        f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(firm_name)}&page={pg_no}&category={str(category_name)}&format=json"
			
 
				+                    )
			
 
				+                    json_data = response1.json()
			
 
				+                    q = json_data["_d_"]["hits"]["hits"]
			
 
				+                    for i, data in enumerate(q):
			
 
				+                        count = len(os.listdir(category_name))
			
 
				+                        print("Files Count: ", count)
			
 
				+                        if count < 95:
			
 
				+                            idx, pdf_id, category, title = (
			
 
				+                                i,
			
 
				+                                data["_id"],
			
 
				+                                data["_source"]["document_type"],
			
 
				+                                data["_source"]["caption"],
			
 
				+                            )
			
 
				+                            li = list(title.split(".pdf")[0])
			
 
				+                            if li[-1] == ".":
			
 
				+                                li[-1] = ""
			
 
				+                            pdf_name = "".join(li)
			
 
				+                            print(idx, pdf_id, category, pdf_name)
			
 
				+                            response2 = requests.get(
			
 
				+                                f"http://50.211.199.148:8000/local/{pdf_id}/",
			
 
				+                                verify=False,
			
 
				+                            )
			
 
				+                            with open(
			
 
				+                                f"{category_name}/{pdf_id}-{category}-{pdf_name}.pdf",
			
 
				+                                "wb",
			
 
				+                            ) as pdf_file:
			
 
				+                                pdf_file.write(response2.content)
			
 
				+                        elif count > 95:
			
 
				+                            print("95 Pdfs in the directory")
			
 
				+                            break
			
 
				+                    if count == 95:
			
 
				+                        break
			
 
				+    elif type(category_names) != list:
			
 
				+        """Only a specific category and category_names is not a list where 100 pdfs are obtained"""
			
 
				+        if os.path.exists(category_names):
			
 
				+            shutil.rmtree(category_names)
			
 
				+        os.makedirs(category_names)
			
 
				+        if category_names != "Objection":
			
 
				+            for pg_no in range(2, 30):
			
 
				+                response1 = requests.get(
			
 
				+                    f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(category_names)}&page={pg_no}&category={str(category_names)}&format=json"
			
 
				+                )
			
 
				+                json_data = response1.json()
			
 
				+                q = json_data["_d_"]["hits"]["hits"]
			
 
				+                for i, data in enumerate(q):
			
 
				+                    count = len(os.listdir(category_names))
			
 
				+                    print("Files Count: ", count)
			
 
				+                    if count < 100:
			
 
				+                        idx, pdf_id, category, title = (
			
 
				+                            i,
			
 
				+                            data["_id"],
			
 
				+                            data["_source"]["document_type"],
			
 
				+                            data["_source"]["caption"],
			
 
				+                        )
			
 
				+                        if category == "":
			
 
				+                            continue
			
 
				+                        # if pdf_id == "253989":
			
 
				+                        #     continue
			
 
				+                        print(title)
			
 
				+                        li = list(title.split(".pdf")[0])
			
 
				+                        if li[-1] == ".":
			
 
				+                            li[-1] = ""
			
 
				+                        pdf_name = "".join(li)
			
 
				+                        print(idx, pdf_id, category, pdf_name)
			
 
				+                        pdf_name2 = pdf_name.replace("/", " ")
			
 
				+                        response2 = requests.get(
			
 
				+                            f"http://50.211.199.148:8000/local/{pdf_id}/", verify=False
			
 
				+                        )  # elif condition
			
 
				+                        with open(
			
 
				+                            f"{category_names}/{pdf_id}-{category}-{pdf_name}.pdf", "wb"
			
 
				+                        ) as pdf_file:
			
 
				+                            pdf_file.write(response2.content)
			
 
				+                    elif count > 100:
			
 
				+                        print("100 Pdfs in the directory")
			
 
				+                        break
			
 
				+                if count == 100:
			
 
				+                    break
			
 
				+        elif category_names == "Objection":
			
 
				+            for pg_no in range(1, 30):
			
 
				+                response1 = requests.get(
			
 
				+                    f"http://50.211.199.148:8000/api/documents/?collection=Documents&q={str(firm_name)}&page={pg_no}&category={str(category_names)}&format=json"
			
 
				+                )
			
 
				+                json_data = response1.json()
			
 
				+                q = json_data["_d_"]["hits"]["hits"]
			
 
				+                for i, data in enumerate(q):
			
 
				+                    count = len(os.listdir(category_names))
			
 
				+                    print("Files Count: ", count)
			
 
				+                    if count < 95:
			
 
				+                        idx, pdf_id, category, title = (
			
 
				+                            i,
			
 
				+                            data["_id"],
			
 
				+                            data["_source"]["document_type"],
			
 
				+                            data["_source"]["caption"],
			
 
				+                        )
			
 
				+                        li = list(title.split(".pdf")[0])
			
 
				+                        if li[-1] == ".":
			
 
				+                            li[-1] = ""
			
 
				+                        pdf_name = "".join(li)
			
 
				+                        print(idx, pdf_id, category, pdf_name)
			
 
				+                        response2 = requests.get(
			
 
				+                            f"http://50.211.199.148:8000/local/{pdf_id}/", verify=False
			
 
				+                        )
			
 
				+                        with open(
			
 
				+                            f"{category_names}/{pdf_id}-{category}-{pdf_name}.pdf", "wb"
			
 
				+                        ) as pdf_file:
			
 
				+                            pdf_file.write(response2.content)
			
 
				+                    elif count > 95:
			
 
				+                        print("95 Pdfs in the directory")
			
 
				+                        break
			
 
				+                if count == 95:
			
 
				+                    break
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    categories = [
			
 
				+        "Other",
			
 
				+        "Exhibit",
			
 
				+        "Preliminary Response",
			
 
				+        "Petition",
			
 
				+        "Notice",
			
 
				+        "Motion",
			
 
				+        "Order",
			
 
				+        "Power Of Attorney",
			
 
				+        "Decision Affirmed",
			
 
				+        "Objection",
			
 
				+    ]
			
 
				+    category = ["Power Of Attorney"]
			
 
				+    get_pdfs2("Apple", category)
			
--- a/server_documents/crawl.py
+++ b/server_documents/crawl.py
@@ -0,0 +1,17 @@
 
				+from selenium import webdriver
			
 
				+from selenium.webdriver.common.keys import Keys
			
 
				+import time
			
 
				+
			
 
				+driver = webdriver.Firefox()
			
 
				+driver.get("http://50.211.199.148:3002/#/search/documents")
			
 
				+time.sleep(2)
			
 
				+driver.implicitly_wait(2)
			
 
				+# driver.find_element_by_id("nav-search").send_keys("Selenium")
			
 
				+enter_text = driver.find_element(
			
 
				+    "xpath", "//input[@placeholder='Example: Apple, CUDA']"
			
 
				+).send_keys("Apple")
			
 
				+time.sleep(2)
			
 
				+driver.implicitly_wait(2)
			
 
				+enter_button_search = driver.find_element(
			
 
				+    "xpath", "(//button[@class='btn btn-info'])[2]"
			
 
				+).click()