diff --git a/.github/workflows/build_docker_images.yml b/.github/workflows/build_docker_images.yml
index fa6c10b15..afcb1ad0d 100644
--- a/.github/workflows/build_docker_images.yml
+++ b/.github/workflows/build_docker_images.yml
@@ -13,7 +13,6 @@ on:
       - "atd-etl/cris_import/**"
       - "atd-etl/cr3_extract_diagram/**"
       - "atd-etl/socrata_export/**"
-      - "atd-etl/cr3_download/**"
   pull_request:
     branches:
       - master
@@ -24,7 +23,6 @@ on:
       - "atd-etl/cris_import/**"
       - "atd-etl/cr3_extract_diagram/**"
       - "atd-etl/socrata_export/**"
-      - "atd-etl/cr3_download/**"
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
@@ -55,8 +53,6 @@ jobs:
             - 'atd-etl/cr3_extract_diagram/**'
             socrata_export:
               - 'atd-etl/socrata_export/**'
-            cr3_download:
-              - 'atd-etl/cr3_download/**'
 
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v2
@@ -96,12 +92,3 @@ jobs:
           context: atd-etl/socrata_export
           push: true
           tags: atddocker/vz-socrata-export:${{ github.ref == 'refs/heads/master' && 'production' || 'latest' }}
-
-      - name: Build and push CR3 download image
-        if: steps.changes.outputs.cr3_download == 'true'
-        uses: docker/build-push-action@v4
-        with:
-          platforms: linux/amd64,linux/arm64
-          context: atd-etl/cr3_download
-          push: true
-          tags: atddocker/vz-cr3-download:${{ github.ref == 'refs/heads/master' && 'production' || 'latest' }}
diff --git a/atd-etl/cr3_download/Dockerfile b/atd-etl/cr3_download/Dockerfile
index 6da93ef9c..f184a87a9 100644
--- a/atd-etl/cr3_download/Dockerfile
+++ b/atd-etl/cr3_download/Dockerfile
@@ -9,3 +9,5 @@ WORKDIR /app
 COPY . /app
 
 RUN cd /app && pip install -r requirements.txt
+
+CMD python /app/cr3_download.py
diff --git a/atd-etl/cr3_download/README.md b/atd-etl/cr3_download/README.md
index db4f78321..e109a331b 100644
--- a/atd-etl/cr3_download/README.md
+++ b/atd-etl/cr3_download/README.md
@@ -3,14 +3,15 @@
 ### Invocation
 
 After creating an .env file using the variables listed in the env_template file, you can run this script with:
+
+If you are running this for the first time or developing this script, you will need to build and run:
 ```
-$ docker compose run -it cr3_download python cr3_download.py
+$ docker compose run --build cr3_download
 ```
-If you are developing, you may find that you need to run and build:
+Otherwise, you can run:
 ```
-$ docker compose run --build -it cr3_download python cr3_download.py
+$ docker compose run cr3_download
 ```
-In production, they will be run from a DAG which handles starting the containers with
-the needed environment and other parameters.
+The script will prompt for the cookie and then download any pending CR3s.
\ No newline at end of file
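With the new `CMD`, `docker compose run cr3_download` drops straight into the script, which now blocks on `input()` for the CRIS cookie. A possible hardening, sketched here and not part of this diff, is to fail fast when no TTY is attached:

```python
import sys

# Hypothetical guard (not in this PR): input() raises EOFError when no TTY is
# attached, so exiting early gives a clearer message than a traceback mid-run.
if not sys.stdin.isatty():
    sys.exit(
        "cr3_download.py prompts for a CRIS cookie; run it interactively, "
        "e.g. `docker compose run cr3_download`."
    )
```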
""" +import os import time import json -from process.config import ATD_ETL_CONFIG from process.helpers_cr3 import * +from onepasswordconnectsdk.client import Client, new_client +import onepasswordconnectsdk + # Start timer start = time.time() + +# Get 1Password secrets from environment +ONEPASSWORD_CONNECT_HOST = os.getenv("OP_CONNECT") +ONEPASSWORD_CONNECT_TOKEN = os.getenv("OP_API_TOKEN") +VAULT_ID = os.getenv("OP_VAULT_ID") + +# Setup 1Password server connection +one_password_client = new_client(ONEPASSWORD_CONNECT_HOST, ONEPASSWORD_CONNECT_TOKEN) + +# Get secrets from 1Password +REQUIRED_SECRETS = { + "HASURA_ENDPOINT": { + "opitem": "Vision Zero graphql-engine Endpoints", + "opfield": "production.GraphQL Endpoint", + "opvault": VAULT_ID, + }, + "HASURA_ADMIN_KEY": { + "opitem": "Vision Zero graphql-engine Endpoints", + "opfield": "production.Admin Key", + "opvault": VAULT_ID, + }, + "AWS_ACCESS_KEY_ID": { + "opitem": "CR3 Download IAM Access Key and Secret", + "opfield": "production.accessKeyId", + "opvault": VAULT_ID, + }, + "AWS_SECRET_ACCESS_KEY": { + "opitem": "CR3 Download IAM Access Key and Secret", + "opfield": "production.accessSecret", + "opvault": VAULT_ID, + }, + "AWS_DEFAULT_REGION": { + "opitem": "CR3 Download IAM Access Key and Secret", + "opfield": "production.awsDefaultRegion", + "opvault": VAULT_ID, + }, + "ATD_CRIS_CR3_URL": { + "opitem": "Vision Zero CRIS CR3 Download", + "opfield": "production.ATD_CRIS_CR3_URL", + "opvault": VAULT_ID, + }, + "AWS_CRIS_CR3_BUCKET_NAME": { + "opitem": "Vision Zero CRIS CR3 Download", + "opfield": "production.AWS_CRIS_CR3_BUCKET_NAME", + "opvault": VAULT_ID, + }, + "AWS_CRIS_CR3_BUCKET_PATH": { + "opitem": "Vision Zero CRIS CR3 Download", + "opfield": "production.AWS_CRIS_CR3_BUCKET_PATH", + "opvault": VAULT_ID, + }, +} + +env_vars = onepasswordconnectsdk.load_dict(one_password_client, REQUIRED_SECRETS) + +# Set secrets from 1Password in environment +for key, value in env_vars.items(): + os.environ[key] = value + # # We now need to request a list of N number of records # that do not have a CR3. 
diff --git a/atd-etl/cr3_download/env_template b/atd-etl/cr3_download/env_template
index 6c7065728..7650c06f4 100644
--- a/atd-etl/cr3_download/env_template
+++ b/atd-etl/cr3_download/env_template
@@ -1,12 +1,3 @@
-AWS_DEFAULT_REGION=
-AWS_ACCESS_KEY_ID=
-AWS_SECRET_ACCESS_KEY=
-# HASURA
-HASURA_ENDPOINT=
-HASURA_ADMIN_KEY=
-# CRIS
-CRIS_CR3_DOWNLOAD_COOKIE=
-# CR3
-ATD_CRIS_CR3_URL=
-AWS_CRIS_CR3_BUCKET_NAME=
-AWS_CRIS_CR3_BUCKET_PATH=
+OP_API_TOKEN=
+OP_CONNECT=
+OP_VAULT_ID=
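With env_template reduced to the three 1Password Connect variables, a missing value now surfaces as an opaque failure inside `new_client`. A fail-fast check, sketched under the assumption that all three variables are required, could run before the client is created:

```python
import os

# Sketch (not in this PR): verify the 1Password Connect settings from
# env_template before attempting to build the client.
missing = [
    name for name in ("OP_CONNECT", "OP_API_TOKEN", "OP_VAULT_ID")
    if not os.getenv(name)
]
if missing:
    raise EnvironmentError(
        "Missing 1Password Connect settings: %s (see env_template)"
        % ", ".join(missing)
    )
```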
-""" - -import os - -ATD_ETL_CONFIG = { - # AWS - "AWS_DEFAULT_REGION": os.getenv("AWS_DEFAULT_REGION", ""), - "AWS_ACCESS_KEY_ID": os.getenv("AWS_ACCESS_KEY_ID", ""), - "AWS_SECRET_ACCESS_KEY": os.getenv("AWS_SECRET_ACCESS_KEY", ""), - # HASURA - "HASURA_ENDPOINT": os.getenv("HASURA_ENDPOINT", ""), - "HASURA_ADMIN_KEY": os.getenv("HASURA_ADMIN_KEY", ""), - "MAX_THREADS": int(os.getenv("MAX_THREADS", "20")), - "MAX_ATTEMPTS": int(os.getenv("MAX_ATTEMPTS", "5")), - "RETRY_WAIT_TIME": int(os.getenv("RETRY_WAIT_TIME", "5")), - # CRIS - "ATD_CRIS_CR3_DOWNLOADS_PER_RUN": os.getenv("ATD_CRIS_DOWNLOADS_PER_RUN", "25"), - "CRIS_CR3_DOWNLOAD_COOKIE": os.getenv("CRIS_CR3_DOWNLOAD_COOKIE", ""), - # CR3 - "ATD_CRIS_CR3_URL": "https://cris.dot.state.tx.us/secure/ImageServices/DisplayImageServlet?target=", - "AWS_CRIS_CR3_BUCKET_NAME": os.getenv("AWS_CRIS_CR3_BUCKET_NAME", ""), - "AWS_CRIS_CR3_BUCKET_PATH": os.getenv( - "AWS_CRIS_CR3_BUCKET_PATH", "production/cris-cr3-files-unassigned" - ), -} diff --git a/atd-etl/cr3_download/process/helpers_cr3.py b/atd-etl/cr3_download/process/helpers_cr3.py index 88e6e70d6..2fd8e7c3f 100644 --- a/atd-etl/cr3_download/process/helpers_cr3.py +++ b/atd-etl/cr3_download/process/helpers_cr3.py @@ -10,6 +10,7 @@ https://pypi.org/project/requests/ """ +import os import requests import base64 import subprocess @@ -18,8 +19,7 @@ import magic -# We need to import our configuration, and the run_query method -from .config import ATD_ETL_CONFIG +# We need the run_query method from .request import run_query @@ -49,7 +49,7 @@ def download_cr3(crash_id, cookies): crash_id_encoded = base64.b64encode( str("CrashId=" + crash_id).encode("utf-8") ).decode("utf-8") - url = ATD_ETL_CONFIG["ATD_CRIS_CR3_URL"] + crash_id_encoded + url = os.getenv("ATD_CRIS_CR3_URL") + crash_id_encoded download_path = "/tmp/" + "%s.pdf" % crash_id print("Downloading (%s): '%s' from %s" % (crash_id, download_path, url)) @@ -66,8 +66,8 @@ def upload_cr3(crash_id): """ file = "/tmp/%s.pdf" % crash_id destination = "s3://%s/%s/%s.pdf" % ( - ATD_ETL_CONFIG["AWS_CRIS_CR3_BUCKET_NAME"], - ATD_ETL_CONFIG["AWS_CRIS_CR3_BUCKET_PATH"], + os.getenv("AWS_CRIS_CR3_BUCKET_NAME"), + os.getenv("AWS_CRIS_CR3_BUCKET_PATH"), crash_id, ) diff --git a/atd-etl/cr3_download/process/request.py b/atd-etl/cr3_download/process/request.py index 227426cc7..149cf0d79 100644 --- a/atd-etl/cr3_download/process/request.py +++ b/atd-etl/cr3_download/process/request.py @@ -1,12 +1,12 @@ # # Request Helper - Makes post requests to a Hasura/GraphQL endpoint. 
diff --git a/atd-etl/cr3_download/process/request.py b/atd-etl/cr3_download/process/request.py
index 227426cc7..149cf0d79 100644
--- a/atd-etl/cr3_download/process/request.py
+++ b/atd-etl/cr3_download/process/request.py
@@ -1,12 +1,12 @@
 #
 # Request Helper - Makes post requests to a Hasura/GraphQL endpoint.
 #
+import os
 import time
 import requests
-from .config import ATD_ETL_CONFIG
 
-MAX_ATTEMPTS = ATD_ETL_CONFIG["MAX_ATTEMPTS"]
-RETRY_WAIT_TIME = ATD_ETL_CONFIG["RETRY_WAIT_TIME"]
+MAX_ATTEMPTS = int(os.getenv("MAX_ATTEMPTS", "5"))
+RETRY_WAIT_TIME = int(os.getenv("RETRY_WAIT_TIME", "5"))
@@ -16,23 +16,21 @@ def run_query(query):
     """
     :return: object - A Json dictionary directly from Hasura
     """
     # Build Header with Admin Secret
-    headers = {
-        "x-hasura-admin-secret": ATD_ETL_CONFIG["HASURA_ADMIN_KEY"]
-    }
+    headers = {"x-hasura-admin-secret": os.getenv("HASURA_ADMIN_KEY")}
 
     # Try up to n times as defined by max_attempts
     for current_attempt in range(MAX_ATTEMPTS):
         # Try making the request via POST
         try:
-            return requests.post(ATD_ETL_CONFIG["HASURA_ENDPOINT"],
-                                 json={'query': query},
-                                 headers=headers).json()
+            return requests.post(
+                os.getenv("HASURA_ENDPOINT"), json={"query": query}, headers=headers
+            ).json()
         except Exception as e:
             print("Exception, could not insert: " + str(e))
             print("Query: '%s'" % query)
             response = {
                 "errors": "Exception, could not insert: " + str(e),
-                "query": query
+                "query": query,
             }
 
         # If the current attempt is equal to MAX_ATTEMPTS, then exit with failure
@@ -41,6 +39,6 @@
         # If less than 5, then wait 5 seconds and try again
         else:
-            print("Attempt (%s out of %s)" % (current_attempt+1, MAX_ATTEMPTS))
+            print("Attempt (%s out of %s)" % (current_attempt + 1, MAX_ATTEMPTS))
             print("Trying again in %s seconds..." % RETRY_WAIT_TIME)
             time.sleep(RETRY_WAIT_TIME)
diff --git a/atd-etl/cr3_download/requirements.txt b/atd-etl/cr3_download/requirements.txt
index 1e7db4094..d2ff96fb6 100644
--- a/atd-etl/cr3_download/requirements.txt
+++ b/atd-etl/cr3_download/requirements.txt
@@ -2,3 +2,4 @@ awscli==1.*
 boto3==1.*
 python-magic==0.*
 requests==2.*
+onepasswordconnectsdk==1.*
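`run_query` keeps its fixed `RETRY_WAIT_TIME` between attempts. If throttling from Hasura or CRIS ever becomes a problem, exponential backoff is a common refinement; a sketch of that variant, not what this PR implements:

```python
import os
import time

MAX_ATTEMPTS = int(os.getenv("MAX_ATTEMPTS", "5"))
BASE_WAIT = int(os.getenv("RETRY_WAIT_TIME", "5"))


# Sketch: retry a callable with exponentially growing waits (5s, 10s, 20s, ...)
# instead of the fixed wait used by run_query above.
def with_retries(make_request):
    for attempt in range(MAX_ATTEMPTS):
        try:
            return make_request()
        except Exception as exc:
            if attempt + 1 == MAX_ATTEMPTS:
                raise
            wait = BASE_WAIT * (2 ** attempt)
            print(
                "Attempt %s of %s failed (%s); retrying in %s seconds"
                % (attempt + 1, MAX_ATTEMPTS, exc, wait)
            )
            time.sleep(wait)
```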