Ver código fonte

Added a script to collect documents from the PostgreSQL server

Harsh Parikh 2 anos atrás
pai
commit
92e83c5949

+ 16 - 0
document_download_from_server/db_utils.py

@@ -0,0 +1,16 @@
+from utils import Base, metadata_rc, session_rc
+from sqlalchemy import Table
+
+
+class Documents(Base):
+    """Declarative class mapped onto the ``core_document`` table.
+
+    The table object is obtained by name from ``metadata_rc``.
+    """
+
+    __table__ = Table(
+        "core_document",
+        metadata_rc,
+    )
+
+
+def get_documents():
+    """Yield each ``Documents`` row returned by a query on ``session_rc``.
+
+    The full result list is fetched with ``.all()`` before the first row
+    is yielded.
+    """
+    query = session_rc.query(Documents)
+    for row in query.all():
+        yield row
+
+
+if __name__ == "__main__":
+    # get_documents() is a generator function: calling it without iterating
+    # executes no query at all, so consume it and print each row.
+    for document in get_documents():
+        print(document)

+ 24 - 0
document_download_from_server/documents_download.py

@@ -0,0 +1,24 @@
+from asyncio.subprocess import PIPE
+import os
+import subprocess
+from subprocess import Popen
+from db_utils import get_documents
+
+
+def main(PATH, limit=1):
+    """Fetch the URL of each document with ``wget``.
+
+    Args:
+        PATH: Directory handed to ``wget -P`` as the download prefix.
+        limit: Maximum number of documents to process. Defaults to 1,
+            which matches the previous hard-coded cut-off; pass ``None``
+            to process every document.
+    """
+    from itertools import islice
+
+    for document in islice(get_documents(), limit):
+        url = document.url
+        if not url:
+            # Nothing to fetch; a None entry in the argument list would
+            # make subprocess raise TypeError.
+            continue
+        args = ["wget", "-r", "-l", "1", "-p", "-P", PATH, url]
+        # Wait for each wget to exit so no child is left unreaped, and
+        # discard stdout instead of opening a pipe that is never read.
+        # The exit status is ignored, as before (best-effort download).
+        subprocess.run(args, stdout=subprocess.DEVNULL, check=False)
+
+
+if __name__ == "__main__":
+    # Download prefix: the project directory below the user's home directory.
+    BASE_DIR = "Code/pdf_parser/document_download_from_server"
+    HOME_DIR = os.path.expanduser("~")
+    PATH = os.path.join(HOME_DIR, BASE_DIR)
+
+    main(PATH)

+ 14 - 0
document_download_from_server/utils.py

@@ -0,0 +1,14 @@
+import os
+
+from sqlalchemy import create_engine, MetaData, orm
+from sqlalchemy.ext.declarative import declarative_base
+
+# Connection URL: read from RSA_CRAWLING_DB_URL when set, so credentials do
+# not have to live in the source; falls back to the previous hard-coded URL.
+pengine_rc = create_engine(
+    os.environ.get(
+        "RSA_CRAWLING_DB_URL",
+        "postgresql://xpertconnect:123@localhost:5432/rsa_crawling",
+    )
+)
+
+Base = declarative_base()
+
+# Pass the engine to reflect() explicitly: binding it through the MetaData
+# constructor is deprecated in SQLAlchemy 1.4 and removed in 2.0.
+metadata_rc = MetaData()
+metadata_rc.reflect(bind=pengine_rc)
+
+Session_rc = orm.sessionmaker(bind=pengine_rc)
+
+# Module-level session shared by importers of this module.
+session_rc = Session_rc()