Add lvk-alerts-to-storage module #290
base: develop
New file: Dockerfile (+21 lines)

```dockerfile
# Use the official lightweight Python image.
# https://hub.docker.com/_/python
FROM python:3.12-slim

# Allow statements and log messages to immediately appear in the Knative logs
ENV PYTHONUNBUFFERED True

# Copy local code to the container image.
ENV APP_HOME /app
WORKDIR $APP_HOME
COPY . ./

# Install production dependencies.
RUN pip install --no-cache-dir -r requirements.txt

# Run the web service on container startup. Here we use the gunicorn
# webserver, with one worker process and 8 threads.
# For environments with multiple CPU cores, increase the number of workers
# to be equal to the cores available.
# Timeout is set to 0 to disable the timeouts of the workers to allow Cloud Run to handle instance scaling.
CMD exec gunicorn --bind :$PORT --workers 1 --threads 8 --timeout 0 main:app
```
New file: cloudbuild.yaml (+29 lines)

```yaml
# https://cloud.google.com/build/docs/deploying-builds/deploy-cloud-run
# containerize the module and deploy it to Cloud Run
steps:
  # Build the image
  - name: 'gcr.io/cloud-builders/docker'
    args: ['build', '-t', '${_REGION}-docker.pkg.dev/${PROJECT_ID}/${_REPOSITORY}/${_MODULE_IMAGE_NAME}', '.']
  # Push the image to Artifact Registry
  - name: 'gcr.io/cloud-builders/docker'
    args: ['push', '${_REGION}-docker.pkg.dev/${PROJECT_ID}/${_REPOSITORY}/${_MODULE_IMAGE_NAME}']
  # Deploy image to Cloud Run
  - name: 'gcr.io/google.com/cloudsdktool/cloud-sdk'
    entrypoint: gcloud
    args: ['run', 'deploy', '${_MODULE_NAME}', '--image', '${_REGION}-docker.pkg.dev/${PROJECT_ID}/${_REPOSITORY}/${_MODULE_IMAGE_NAME}', '--region', '${_REGION}', '--set-env-vars', '${_ENV_VARS}']
images:
  - '${_REGION}-docker.pkg.dev/${PROJECT_ID}/${_REPOSITORY}/${_MODULE_IMAGE_NAME}'
substitutions:
  _SURVEY: 'lvk'
  _TESTID: 'testid'
  _MODULE_NAME: '${_SURVEY}-alerts-to-storage-${_TESTID}'
  _MODULE_IMAGE_NAME: 'gcr.io/${PROJECT_ID}/${_REPOSITORY}/${_MODULE_NAME}'
  _REPOSITORY: 'cloud-run-services'
  # cloud functions automatically sets the projectid env var using the name "GCP_PROJECT"
  # use the same name here for consistency
  # [TODO] PROJECT_ID is set in setup.sh. this is confusing and we should revisit the decision.
  # i (Raen) think i didn't make it a substitution because i didn't want to set a default for it.
  _ENV_VARS: 'GCP_PROJECT=${PROJECT_ID},SURVEY=${_SURVEY},TESTID=${_TESTID},VERSIONTAG=${_VERSIONTAG}'
  _REGION: 'us-central1'
options:
  dynamic_substitutions: true
```
New file: deployment script (+93 lines)

```bash
#! /bin/bash
# Deploys or deletes broker Cloud Run service
# This script will not delete Cloud Run services that are in production

# "False" uses production resources
# any other string will be appended to the names of all resources
testid="${1:-test}"
# "True" tears down/deletes resources, else setup
teardown="${2:-False}"
# name of the survey this broker instance will ingest
survey="${3:-lvk}"
region="${4:-us-central1}"
versiontag="${5:-v1_0}"
# get the environment variable
PROJECT_ID=$GOOGLE_CLOUD_PROJECT

MODULE_NAME="alerts-to-storage"  # lower case required by cloud run
ROUTE_RUN="/"  # url route that will trigger main.run()

define_GCP_resources() {
    local base_name="$1"
    local separator="${2:--}"
    local testid_suffix=""

    if [ "$testid" != "False" ] && [ -n "$testid" ]; then
        testid_suffix="${separator}${testid}"
    fi
    echo "${base_name}${testid_suffix}"
}

#--- GCP resources used in this script
artifact_registry_repo=$(define_GCP_resources "${survey}-cloud-run-services")
cr_module_name=$(define_GCP_resources "${survey}-${MODULE_NAME}")  # lower case required by cloud run
gcs_json_bucket=$(define_GCP_resources "${PROJECT_ID}-${survey}_alerts")
ps_deadletter_topic=$(define_GCP_resources "${survey}-deadletter")
ps_input_subscrip=$(define_GCP_resources "${survey}-alerts_raw")  # pub/sub subscription used to trigger cloud run module
ps_topic_alert_in_bucket=$(define_GCP_resources "projects/${PROJECT_ID}/topics/${survey}-alert_in_bucket")
ps_trigger_topic=$(define_GCP_resources "${survey}-alerts_raw")
runinvoker_svcact="cloud-run-invoker@${PROJECT_ID}.iam.gserviceaccount.com"

if [ "${teardown}" = "True" ]; then
    # ensure that we do not teardown production resources
    if [ "${testid}" != "False" ]; then
        echo
        echo "Deleting resources for ${MODULE_NAME} module..."
        gsutil rm -r "gs://${gcs_json_bucket}"
        gcloud pubsub subscriptions delete "${ps_input_subscrip}"
        gcloud pubsub topics delete "${ps_topic_alert_in_bucket}"
        gcloud run services delete "${cr_module_name}" --region "${region}"
    fi

else
    echo
    echo "Creating gcs_json_bucket and uploading files..."

    if ! gsutil ls -b "gs://${gcs_json_bucket}" >/dev/null 2>&1; then
        gsutil mb -b on -l "${region}" "gs://${gcs_json_bucket}"
        gsutil uniformbucketlevelaccess set on "gs://${gcs_json_bucket}"
        gsutil requesterpays set on "gs://${gcs_json_bucket}"
        gcloud storage buckets add-iam-policy-binding "gs://${gcs_json_bucket}" \
            --member="allUsers" \
            --role="roles/storage.objectViewer"
    else
        echo "${gcs_json_bucket} already exists."
    fi

    echo
    echo "Configuring Pub/Sub notifications on GCS bucket..."
    trigger_event=OBJECT_FINALIZE
    format=json  # json or none; if json, file metadata sent in message body
    gsutil notification create \
        -t "$ps_topic_alert_in_bucket" \
        -e "$trigger_event" \
        -f "$format" \
        "gs://${gcs_json_bucket}"

    #--- Deploy Cloud Run service
    echo
    echo "Creating container image for ${MODULE_NAME} module and deploying to Cloud Run..."
    moduledir="."  # assumes deploying what's in our current directory
    config="${moduledir}/cloudbuild.yaml"
    url=$(gcloud builds submit --config="${config}" \
        --substitutions="_SURVEY=${survey},_TESTID=${testid},_MODULE_NAME=${cr_module_name},_REPOSITORY=${artifact_registry_repo},_VERSIONTAG=${versiontag}" \
        "${moduledir}" | sed -n 's/^Step #2: Service URL: \(.*\)$/\1/p')

    echo
    echo "Creating trigger subscription for ${MODULE_NAME} Cloud Run service..."
    gcloud pubsub subscriptions create "${ps_input_subscrip}" \
        --topic "${ps_trigger_topic}" \
        --topic-project "${PROJECT_ID}" \
        --ack-deadline=600 \
        --push-endpoint="${url}${ROUTE_RUN}" \
        --push-auth-service-account="${runinvoker_svcact}" \
        --dead-letter-topic="${ps_deadletter_topic}" \
        --max-delivery-attempts=5
fi
```
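Since the trigger subscription pushes each Pub/Sub message to the service's HTTP endpoint, one way to exercise the module before wiring up Pub/Sub is to POST a hand-built push envelope at it. Below is a minimal sketch (not part of this PR), assuming the service is running locally on port 8080, that the `requests` library is available, and that a fabricated base64-encoded alert payload is enough for `pittgoogle.Alert.from_cloud_run` to parse; the envelope fields follow the standard Pub/Sub push format.

```python
import base64
import json

import requests  # assumed available in the test environment

# Fabricated LVK-like alert for illustration only.
fake_alert = json.dumps(
    {"superevent_id": "S240101ab", "alert_type": "PRELIMINARY", "time_created": "2024-01-01T00:00:00Z"}
).encode("utf-8")

# Standard Pub/Sub push envelope: the alert bytes go in "message.data" (base64),
# and "messageId" is what main.py records as file_origin_message_id.
envelope = {
    "message": {
        "data": base64.b64encode(fake_alert).decode("utf-8"),
        "messageId": "1234567890",
        "attributes": {},
    },
    "subscription": "projects/my-project/subscriptions/lvk-alerts_raw-mytestid",
}

# ROUTE_RUN is "/", so POST the envelope to the service root.
resp = requests.post("http://localhost:8080/", json=envelope)
print(resp.status_code)  # expect 204 on success, 400 if the envelope can't be parsed
```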
New file: main.py (+104 lines)

```python
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

"""This module stores LVK alert data as a JSON file in Cloud Storage."""

import os
import flask
import pittgoogle
from google.cloud import logging, storage
from google.cloud.exceptions import PreconditionFailed

# [FIXME] Make this helpful or else delete it.
# Connect the python logger to the google cloud logger.
# By default, this captures INFO level and above.
# pittgoogle uses the python logger.
# We don't currently use the python logger directly in this script, but we could.
logging.Client().setup_logging()

PROJECT_ID = os.getenv("GCP_PROJECT")
TESTID = os.getenv("TESTID")
SURVEY = os.getenv("SURVEY")
VERSIONTAG = os.getenv("VERSIONTAG")

# Variables for incoming data
# A url route is used in setup.sh when the trigger subscription is created.
# It is possible to define multiple routes in a single module and trigger them using different subscriptions.
ROUTE_RUN = "/"  # HTTP route that will trigger run(). Must match deploy.sh

# Variables for outgoing data
HTTP_204 = 204  # HTTP code: Success
HTTP_400 = 400  # HTTP code: Bad Request

# GCP resources used in this module
TOPIC_ALERTS = pittgoogle.Topic.from_cloud(
    "alerts", survey=SURVEY, testid=TESTID, projectid=PROJECT_ID
)
bucket_name = f"{PROJECT_ID}-{SURVEY}_alerts"
if TESTID != "False":
    bucket_name = f"{bucket_name}-{TESTID}"

client = storage.Client()
bucket = client.get_bucket(client.bucket(bucket_name, user_project=PROJECT_ID))

app = flask.Flask(__name__)


@app.route(ROUTE_RUN, methods=["POST"])
def run():
    """Uploads alert data to a GCS bucket. Publishes a de-duplicated JSON-serialized "alerts" stream
    (${survey}-alerts) containing the original alert bytes. A BigQuery subscription is used to write
    alert data to the appropriate BigQuery table.

    This module is intended to be deployed as a Cloud Run service. It will operate as an HTTP endpoint
    triggered by Pub/Sub messages. This function will be called once for every message sent to this route.
    It should accept the incoming HTTP request and return a response.

    Returns
    -------
    response : tuple(str, int)
        Tuple containing the response body (string) and HTTP status code (int). Flask will convert the
        tuple into a proper HTTP response. Note that the response is a status message for the web server.
    """
    # extract the envelope from the request that triggered the endpoint
    # this contains a single Pub/Sub message with the alert to be processed
    envelope = flask.request.get_json()
    try:
        alert = pittgoogle.Alert.from_cloud_run(envelope, "lvk")
    except pittgoogle.exceptions.BadRequest as exc:
        return str(exc), HTTP_400

    blob = bucket.blob(_name_in_bucket(alert))
    blob.metadata = _create_file_metadata(alert, event_id=envelope["message"]["messageId"])

    # raise a PreconditionFailed exception if filename already exists in the bucket using "if_generation_match=0"
    try:
        blob.upload_from_string(alert.msg.data, if_generation_match=0)
    except PreconditionFailed:
        # this alert is a duplicate. drop it.
        return "", HTTP_204

    # publish the same alert as JSON
    TOPIC_ALERTS.publish(alert)

    return "", HTTP_204


def _create_file_metadata(alert: pittgoogle.Alert, event_id: str) -> dict:
    """Return key/value pairs to be attached to the file as metadata."""
    # https://git.ligo.org/emfollow/igwn-gwalert-schema/-/blob/main/igwn.alerts.v1_0.Alert.schema.json
    metadata = {"file_origin_message_id": event_id}
    metadata["_".join("time_created")] = alert.dict["time_created"]
    metadata["_".join("alert_type")] = alert.dict["alert_type"]
    metadata["_".join("id")] = alert.dict["superevent_id"]
```

(The remainder of main.py, including the `_name_in_bucket()` helper, is collapsed in this diff view.)
Suggested change (from review):

```diff
-    metadata["_".join("time_created")] = alert.dict["time_created"]
-    metadata["_".join("alert_type")] = alert.dict["alert_type"]
-    metadata["_".join("id")] = alert.dict["superevent_id"]
+    metadata["time_created"] = alert.dict["time_created"]
+    metadata["alert_type"] = alert.dict["alert_type"]
+    metadata["superevent_id"] = alert.dict["superevent_id"]
```
"_".join("time_created") == "t_i_m_e___c_r_e_a_t_e_d", obviously not what's intended.- In general, use the survey's field name. In particular, "id" is way too vague for a name.
In terms of other metadata would could add, the only other option I see is their "urls" field and I am not inclined to add that here. All of their other fields depend on the type of alert so we can't rely on them being available for this. I suppose we could add a try/except to deal with that but I think it's not worth the effort right now.
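For anyone puzzled by the bug, a quick interpreter check (not part of the PR) shows what `str.join` does when handed a bare string — it iterates over the characters:

```python
>>> "_".join("time_created")  # join iterates over the string's characters
't_i_m_e___c_r_e_a_t_e_d'
>>> # the intended behavior is simply to use the field name itself as the key
>>> metadata = {}
>>> metadata["time_created"] = "2024-01-01T00:00:00Z"  # hypothetical value for illustration
```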
Thanks for pointing this out. What we decided on zoom was:

`f"{VERSIONTAG}/{_id}/{_alert_type}-{_date}.json"`

BUT, I went to patch our client `Alert.name_in_bucket` and realized that would be impossible. These alert packets do not contain the version, so there is no way to construct the name_in_bucket using only the alert data. I don't know the best solution here. Let's think and discuss. Options off the top of my head are:

1. Our pipeline knows the schema version, so this module could simply add it to the alert packet. We'd need to propagate that all the way through in pubsub messages, bucket objects, and bigquery schemas. LVK alerts from us would be different than from LVK itself or any other broker. That may occasionally be a pain but not a huge problem, I think. I haven't fully thought things through but don't see any other problems with this at the moment. So this is seeming like the best option.
2. Maybe there's a way to get the consumer VM to add the schema version to the metadata (it does not open the alert packets, so it cannot add to the actual data). A downside is that it's relatively easy for data and metadata to get disconnected. Also, I am very reluctant to mess with the consumer. It performs so well. I don't want to interfere.
3. If our pipeline never drops fields from LVK alerts (so, no lite module, etc.), maybe it's not crucial for the client to be able to construct the name_in_bucket. I can still think of edge cases where we'd regret that and users would hate us for it.
4. We could drop VERSIONTAG from the name in bucket. I really don't like the idea of making superevent_id the top-level folder, so maybe we use the LVK run for that, like "O4". The problem is that bigquery tables MUST be separated by schema version, and there's no guarantee of a clean mapping between run and schema version, so this would make it really hard to regenerate bigquery tables from the bucket.
Also, this reminds me that it would be much better to remove this function altogether and just use Alert.name_in_bucket so that we don't have to keep the two in sync.
We decided yesterday on zoom to implement option 1. I'll wait for that before reviewing again.
@troyraen I've implemented option 1! Ready for your review.
```python
def run():
    ...
    # the schema version is not defined in the schema
    # add it manually using the environment variable defined in this script
    alert.attributes["schema_version"] = VERSIONTAG

    # publish the same alert as JSON
    TOPIC_ALERTS.publish(alert)

    return "", HTTP_204


def _create_file_metadata(alert: pittgoogle.Alert, event_id: str) -> dict:
    ...
    metadata["schema_version"] = VERSIONTAG
    return metadata


def _name_in_bucket(alert: pittgoogle.Alert) -> str:
    ...
    _id = alert.sourceid
    return f"{VERSIONTAG}/{_id}/{_alert_type}-{_date}.json"
```
The schema version is already propagated for BigQuery schemas
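For readers following the thread, here is a minimal sketch (not from the PR) of what the elided pieces of `_name_in_bucket()` could look like under option 1. The handling of `_alert_type` and `_date` is an assumption for illustration, taking them from the packet's `alert_type` and `time_created` fields; the actual helper in this PR may differ.

```python
def _name_in_bucket(alert: pittgoogle.Alert) -> str:
    """Return the bucket object name, shaped like "<versiontag>/<superevent_id>/<alert_type>-<date>.json"."""
    _id = alert.sourceid  # assumed to map to superevent_id for LVK alerts
    _alert_type = alert.dict["alert_type"]            # assumption: taken directly from the packet
    _date = alert.dict["time_created"].split("T")[0]  # assumption: date portion of the ISO timestamp
    return f"{VERSIONTAG}/{_id}/{_alert_type}-{_date}.json"
```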
New file: requirements.txt (+14 lines)

```text
# As explained here
# https://cloud.google.com/functions/docs/writing/specifying-dependencies-python
# dependencies for a Cloud Function must be specified in a `requirements.txt`
# file (or packaged with the function) in the same directory as `main.py`

google-cloud-logging
google-cloud-storage
pittgoogle-client>=0.3.15

# for Cloud Run
# https://cloud.google.com/run/docs/quickstarts/build-and-deploy/deploy-python-service
Flask
gunicorn
Werkzeug
```