소스 검색

added a script to collect documents from the postgres server

Harsh Parikh 2 년 전
부모
커밋
92e83c5949
3개의 변경된 파일, 54개의 추가 그리고 0개의 삭제
  1. 16 0
      document_download_from_server/db_utils.py
  2. 24 0
      document_download_from_server/documents_download.py
  3. 14 0
      document_download_from_server/utils.py

+ 16 - 0
document_download_from_server/db_utils.py

@@ -0,0 +1,16 @@
+from utils import Base, metadata_rc, session_rc
+from sqlalchemy import Table
+
+
+class Documents(Base):
+    __table__ = Table("core_document", metadata_rc)
+
+
+def get_documents():
+    documents = session_rc.query(Documents).all()
+    for document in documents:
+        yield document
+
+
+if __name__ == "__main__":
+    get_documents()

+ 24 - 0
document_download_from_server/documents_download.py

@@ -0,0 +1,24 @@
+from asyncio.subprocess import PIPE
+import os
+import subprocess
+from subprocess import Popen
+from db_utils import get_documents
+
+
+def main(PATH):
+    itr = 0
+    for document in get_documents():
+        url = document.url
+        args = ["wget", "-r", "-l", "1", "-p", "-P", PATH, url]
+        Popen(args, stdout=PIPE)
+        itr += 1
+        if itr == 1:
+            break
+
+
+if __name__ == "__main__":
+    HOME_DIR = os.path.expanduser("~")
+    BASE_DIR = "Code/pdf_parser/document_download_from_server"
+
+    PATH = os.path.join(HOME_DIR, BASE_DIR)
+    main(PATH)

+ 14 - 0
document_download_from_server/utils.py

@@ -0,0 +1,14 @@
+import os
+
+from sqlalchemy import create_engine, MetaData, orm
+from sqlalchemy.ext.declarative import declarative_base
+
+pengine_rc = create_engine("postgresql://xpertconnect:123@localhost:5432/rsa_crawling")
+
+Base = declarative_base()
+metadata_rc = MetaData(pengine_rc)
+metadata_rc.reflect()
+
+Session_rc = orm.sessionmaker(pengine_rc)
+
+session_rc = Session_rc()