
Commit aea85f0

Merge pull request #45 from mwvgroup/development

Development

2 parents 58af663 + 888feee, commit aea85f0

23 files changed: +399 -473 lines

.gitignore

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,11 @@
 # ZTF alert data
 broker/ztf_archive/data/
 
+# Authentication keys
+GCPauth.json
+krb5.conf
+pitt-reader.user.keytab
+
 # OS files
 *.DS_Store
 .AppleDouble

broker/alert_ingestion/consume.py

Lines changed: 27 additions & 14 deletions
@@ -51,6 +51,7 @@
 """
 
 import logging
+import google.cloud.logging
 import os
 import re
 from pathlib import Path
@@ -67,7 +68,10 @@
 from google.cloud import pubsub, storage
 from google.cloud.pubsub_v1.publisher.futures import Future
 
-log = logging.getLogger(__name__)
+log = logging.getLogger(__name__)  # python root logger
+client = google.cloud.logging.Client()  # google cloud logger
+client.get_default_handler()
+client.setup_logging()  # this connects cloud log to root log
 
 DEFAULT_ZTF_CONFIG = {
     'bootstrap.servers': 'public2.alerts.ztf.uw.edu:9094',
@@ -111,7 +115,7 @@ def seekable(self):  # necessary so that fastavro can write to the file
 
 
 def _set_config_defaults(kafka_config: dict) -> dict:
-    """Set default values for a Kafka configuration dictionary
+    """Set default values for a Kafka configuration dictionaryk
 
     Default values:
         enable.auto.commit: False,
@@ -168,7 +172,7 @@ def __init__(
         self.pubsub_alert_data_topic = pubsub_alert_data_topic
         self.pubsub_in_GCS_topic = pubsub_in_GCS_topic
         self.kafka_server = kafka_config["bootstrap.servers"]
-        log.info(f'Initializing consumer: {self.__repr__()}')
+        log.debug(f'Initializing consumer: {self.__repr__()}')
 
         # Connect to Kafka stream
         # Enforce NO auto commit, correct log handling
@@ -178,12 +182,12 @@
         # Connect to Google Cloud Storage
         self.storage_client = storage.Client()
         self.bucket = self.storage_client.get_bucket(bucket_name)
-        log.info(f'Connected to bucket: {self.bucket.name}')
+        log.debug(f'Connected to bucket: {self.bucket.name}')
 
     def close(self) -> None:
         """Close down and terminate the Kafka Consumer"""
 
-        log.info(f'Closing consumer: {self.__repr__()}')
+        log.debug(f'Closing consumer: {self.__repr__()}')
         super().close()
 
     @staticmethod
@@ -219,9 +223,9 @@ def fix_schema(temp_file: TempAlertFile, survey: str, version: str) -> None:
         temp_file.truncate()  # removes leftover data
         temp_file.seek(0)
 
-        log.debug(f'Schema header reformatted for {survey} version {version}')
+        log.info(f'Schema header reformatted for {survey} version {version}')
 
-    def upload_bytes_to_bucket(self, data: bytes, destination_name: str) -> None:
+    def upload_bytes_to_bucket(self, data: bytes, destination_name: str) -> bytes:
         """Uploads bytes data to a GCP storage bucket. Prior to storage,
         corrects the schema header to be compliant with BigQuery's strict
         validation standards if the alert is from a survey version with an
@@ -230,15 +234,19 @@ def upload_bytes_to_bucket(self, data: bytes, destination_name: str) -> None:
         Args:
             data: Data to upload
             destination_name: Name of the file to be created
+
+        Returns:
+            data with a corrected schema header (if one is necessary)
         """
 
-        log.debug(f'Uploading {destination_name} to {self.bucket.name}')
+        log.info(f'Uploading {destination_name} to {self.bucket.name}')
         blob = self.bucket.blob(destination_name)
 
         # Get the survey name and version
         survey = guess_schema_survey(data)
         version = guess_schema_version(data)
 
+        # Correct the message schema, upload to GCS, and return it
         # By default, spool data in memory to avoid IO unless data is too big
         # LSST alerts are anticipated at 80 kB, so 150 kB should be plenty
        max_alert_packet_size = 150000
@@ -247,15 +255,19 @@
             temp_file.seek(0)
             self.fix_schema(temp_file, survey, version)
             blob.upload_from_file(temp_file)
+            temp_file.seek(0)
+            return temp_file.read()
 
     def run(self) -> None:
         """Ingest kafka Messages to GCS and PubSub"""
 
-        log.info('Starting consumer.run ...')
+        log.debug('Starting consumer.run ...')
         try:
             while True:
-                msg = self.consume(num_messages=1, timeout=5)[0]
+                # msg = self.consume(num_messages=1, timeout=1)
+                msg = self.poll(timeout=1)
                 if msg is None:
+                    log.info('msg is None')
                     continue
 
                 if msg.error():
@@ -266,12 +278,13 @@ def run(self) -> None:
 
                 else:
                     timestamp_kind, timestamp = msg.timestamp()
-                    file_name = f'{timestamp}.avro'
+                    file_name = f'{msg.topic()}_{timestamp}.avro'
 
-                    log.debug(f'Ingesting {file_name}')
+                    log.info(f'Ingesting {file_name}')
+                    msg_schema_fixed = self.upload_bytes_to_bucket(msg.value(), file_name)
+                    # returns msg.value() bytes object with schema corrected
+                    publish_pubsub(self.pubsub_alert_data_topic, msg_schema_fixed)
                     publish_pubsub(self.pubsub_in_GCS_topic, file_name.encode('UTF-8'))
-                    publish_pubsub(self.pubsub_alert_data_topic, msg.value())
-                    self.upload_bytes_to_bucket(msg.value(), file_name)
 
                 if not self._debug:
                     self.commit()
broker/deploy_cloudfnc.sh renamed to broker/cloud_functions/GCS_to_BQ.sh

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@
 ###
 
 # NOT SURE OF THE RIGHT WAY TO GET INTO THIS DIRECTORY:
-cd broker/alert_ingestion/GCS_to_BQ
+cd broker/cloud_functions/GCS_to_BQ
 
 # deploy stream_GCS_to_BQ() to listen to the ztf_alert_avro_bucket
 bucket="${GOOGLE_CLOUD_PROJECT}_ztf_alert_avro_bucket"

broker/cloud_functions/README.md

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Cloud Functions
+
+This directory contains cloud functions used by the Pitt-Google Broker.
+Source code for each function is stored in a dedicated directory
+and is accompanied by a bash script that deploys the cloud function
+to the Google Cloud Platform.
+
+For more information on cloud functions, see: https://cloud.google.com/functions
+
+| Function | Description |
+|---|---|
+| `GCS_to_BQ` | Load the contents of avro files from Google Cloud Storage (GCS) into Big Query (BQ) |
+| `scheduleinstance` | Deploys and schedules the execution of functions for launching virtual machines that ingest ZTF data into BQ |
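
Deployment of GCS_to_BQ is handled by the renamed GCS_to_BQ.sh script earlier in this commit. For orientation only, a hedged sketch of what a GCS-triggered Avro loader of this kind can look like; the dataset and table names below are placeholders and this is not the repository's implementation of stream_GCS_to_BQ:

    # Illustrative only: a background Cloud Function triggered when an Avro file
    # lands in a GCS bucket, appending its rows to a BigQuery table.
    # The 'my_dataset.ztf_alerts' table id is a placeholder.
    from google.cloud import bigquery

    def stream_GCS_to_BQ(data, context):
        client = bigquery.Client()
        uri = f"gs://{data['bucket']}/{data['name']}"  # event payload names the new file

        job_config = bigquery.LoadJobConfig(
            source_format=bigquery.SourceFormat.AVRO,
            write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
        )

        load_job = client.load_table_from_uri(uri, 'my_dataset.ztf_alerts', job_config=job_config)
        load_job.result()  # block until the load job finishes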
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+#!/bin/sh
+
+echo "WARNING: Make sure you have updated the values of [IMAGE NAME] and [VERSION] with the values of the current Docker image using this script.\n"
+# version number should be most recent commit used to build the image
+# git log -1 --format=format:"%H"
+
+# Configure gcloud as a Docker credential helper
+gcloud auth configure-docker gcr.io/ardent-cycling-243415/consumeztf
+
+# Create 2 instances of the consumer
+gcloud compute instances create-with-container consume-ztf-1 --zone=us-central1-a --machine-type=f1-micro --image-project=cos-cloud --container-image=gcr.io/ardent-cycling-243415/consumeztf:7019f8aa86ffe16dcb36fa791dc2fb7e56bb687f --labels=env=consume-ztf-1 --image=cos-stable-81-12871-1190-0 --service-account=591409139500-compute@developer.gserviceaccount.com --scopes=cloud-platform
+gcloud compute instances create-with-container consume-ztf-2 --zone=us-central1-a --machine-type=f1-micro --image-project=cos-cloud --container-image=gcr.io/ardent-cycling-243415/consumeztf:7019f8aa86ffe16dcb36fa791dc2fb7e56bb687f --labels=env=consume-ztf-2 --image=cos-stable-81-12871-1190-0 --service-account=591409139500-compute@developer.gserviceaccount.com --scopes=cloud-platform
+
+
+# Create the Pub/Sub topics to trigger starting and stopping the instance
+gcloud pubsub topics create start-instance-event
+gcloud pubsub topics create stop-instance-event
+
+
+# Create the cloud functions to publish to PubSub
+
+cd scheduleinstance/
+
+gcloud functions deploy startInstancePubSub --trigger-topic start-instance-event --runtime nodejs8
+
+gcloud functions deploy stopInstancePubSub --trigger-topic stop-instance-event --runtime nodejs8
+
+# Finally, schedule the PubSub messages that trigger the cloud functions.
+
+# Reset consume-ztf-1 on odd days
+gcloud scheduler jobs create pubsub stop-consume-ztf-1 --schedule '0 9 1-31/2 * *' --topic stop-instance-event --message-body '{"zone":"us-west1-b", "label":"env=consume-ztf-1"}' --time-zone 'America/Los_Angeles'
+
+gcloud scheduler jobs create pubsub start-consume-ztf-1 --schedule '0 17 1-31/2 * *' --topic start-instance-event --message-body '{"zone":"us-west1-b", "label":"env=consume-ztf-1"}' --time-zone 'America/Los_Angeles'
+
+# Reset consume-ztf-2 on even days
+gcloud scheduler jobs create pubsub stop-consume-ztf-2 --schedule '0 0 2-30/2 * *' --topic stop-instance-event --message-body '{"zone":"us-west1-b", "label":"env=consume-ztf-2"}' --time-zone 'America/Los_Angeles'
+
+gcloud scheduler jobs create pubsub start-consume-ztf-2 --schedule '0 0 2-30/2 * *' --topic start-instance-event --message-body '{"zone":"us-west1-b", "label":"env=consume-ztf-2"}' --time-zone 'America/Los_Angeles'
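
The startInstancePubSub/stopInstancePubSub functions deployed above are Node.js 8 functions; their only contract with this script is the JSON body of the Cloud Scheduler messages. A hypothetical Python rendering of that payload handling, shown only to document the message format (the helper name below is not part of the repository):

    # Hypothetical sketch: decode a Cloud Scheduler -> Pub/Sub message like the
    # ones created by the gcloud scheduler jobs above.
    import base64
    import json

    def parse_scheduler_payload(event):
        payload = json.loads(base64.b64decode(event['data']).decode('utf-8'))
        zone = payload['zone']                                # e.g. 'us-west1-b'
        label_key, label_value = payload['label'].split('=')  # e.g. 'env=consume-ztf-1'
        return zone, label_key, label_value

    # Example: the body sent by the 'stop-consume-ztf-1' scheduler job
    event = {'data': base64.b64encode(b'{"zone":"us-west1-b", "label":"env=consume-ztf-1"}')}
    print(parse_scheduler_payload(event))  # ('us-west1-b', 'env', 'consume-ztf-1')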

broker/pub_sub_client/message_service.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def publish_pubsub(topic_name, message):
 
     topic_path = publisher.topic_path(project_id, topic_name)
 
-    topic = publisher.get_topic(topic_path)
+    # topic = publisher.get_topic(topic_path)
     log.info(f'Connected to PubSub: {topic_path}')
 
     future = publisher.publish(topic_path, data=message)
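
Commenting out get_topic() removes an extra Pub/Sub API round-trip before each publish; publisher.publish() does not require it. A minimal usage sketch, assuming the module is importable as broker.pub_sub_client.message_service and using the topic names configured in docker_files/consume_ztf.py; the payload and file name below are placeholders:

    # Usage sketch; the import path is assumed from this file's location.
    from broker.pub_sub_client.message_service import publish_pubsub

    alert_bytes = b'<avro bytes with corrected schema header>'  # placeholder payload
    publish_pubsub('ztf_alert_data', alert_bytes)

    # Announce the name of the Avro file that was written to the GCS bucket
    publish_pubsub('ztf_alert_avro_in_bucket', 'ztf_1234567890.avro'.encode('UTF-8'))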
Lines changed: 38 additions & 20 deletions
@@ -1,23 +1,41 @@
-FROM python:3.7
+# Slim used to reduce image size
+FROM python:3.7-slim
+
+# Configure Environment variables
+ENV PYTHONPATH "Pitt-Google-Broker/:${PYTHONPATH}"
+ENV GOOGLE_CLOUD_PROJECT "ardent-cycling-243415"
+ENV ztf_server "public2.alerts.ztf.uw.edu:9094"
+ENV ztf_principle "[email protected]"
+ENV ztf_keytab_path "pitt-reader.user.keytab"
+ENV PATH="/root/miniconda3/bin:${PATH}"
+ARG PATH="/root/miniconda3/bin:${PATH}"
+
+# Copy credentials and runtime files
+COPY docker_files/consume_ztf.py docker_files/consume_ztf.py
+COPY krb5.conf /etc/krb5.conf
+COPY pitt-reader.user.keytab pitt-reader.user.keytab
+
+# Install utils for fetching remote source code
+RUN apt-get update && \
+    apt-get install -y git wget python-dev gcc krb5-user && \
+    rm -rf /var/lib/apt/lists/* && \
+    apt-get clean
+
+RUN wget \
+    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+    && mkdir /root/.conda \
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
+    && rm -f Miniconda3-latest-Linux-x86_64.sh
+
+RUN conda install -c conda-forge kafka-python -y
+RUN conda install -c conda-forge python-confluent-kafka -y
+RUN conda install -c stuarteberg -c conda-forge librdkafka -y
+
+# Get broker source code and install dependencies
+RUN git clone --single-branch --branch master --depth 1 https://github.com/mwvgroup/Pitt-Google-Broker && \
+    rm -rf Pitt-Google-Broker/.git
 
-MAINTAINER Daniel Perrefort "[email protected]"
-
-COPY consume_ztf.py consume_ztf.py
-
-# Install git
-RUN apt-get update
-RUN apt-get install -y git
-
-# Get broker source code and add to path
-RUN git clone https://github.com/mwvgroup/Pitt-Google-Broker
-
-# Install dependencies
-# Some dependency installs may fail without numpy, so we install it first
-RUN pip install numpy
 RUN pip install -r Pitt-Google-Broker/requirements.txt
 
-# Configure Python Environment
-ENV PYTHONPATH="Pitt-Google-Broker/:${PYTHONPATH}"
-
-
-CMD [ "python", "./consume_ztf.py" ]
+# Launch the ZTF consumer
+CMD [ "python", "docker_files/consume_ztf.py" ]

docker_files/consume_ztf.py

Lines changed: 5 additions & 4 deletions
@@ -6,14 +6,14 @@
 import os
 from datetime import datetime
 
-from broker.consumer import GCSKafkaConsumer
+from broker.alert_ingestion.consume import GCSKafkaConsumer
 
 # Define connection configuration using default values as a starting point
 config = {
     'bootstrap.servers': os.environ['ztf_server'],
     'group.id': 'group',
     'session.timeout.ms': 6000,
-    'enable.auto.commit': 'FALSE',
+    'enable.auto.commit': 'False',
     'sasl.kerberos.kinit.cmd': 'kinit -t "%{sasl.kerberos.keytab}" -k %{sasl.kerberos.principal}',
     'sasl.kerberos.service.name': 'kafka',
     'security.protocol': 'SASL_PLAINTEXT',
@@ -30,9 +30,10 @@
 # Create a consumer
 c = GCSKafkaConsumer(
     kafka_config=config,
-    bucket_name='ardent-cycling-243415-ztf-avro-files',
+    bucket_name='ardent-cycling-243415_ztf_alert_avro_bucket',
     kafka_topic=ztf_topic,
-    pubsub_topic='ztf-avro-status'
+    pubsub_alert_data_topic='ztf_alert_data',
+    pubsub_in_GCS_topic='ztf_alert_avro_in_bucket'
 )
 
 if __name__ == '__main__':
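
The config dict shown here is truncated by the diff; the remaining SASL/Kerberos entries presumably line up with the ENV values baked into the Docker image above. A hedged sketch of what those keys typically look like for librdkafka; these lines are an assumption for illustration, not a quote from the file:

    # Assumed completion of the Kerberos-related config, based on the ENV values
    # set in the Dockerfile; key names are standard librdkafka SASL/GSSAPI options.
    import os

    kerberos_config = {
        'sasl.mechanisms': 'GSSAPI',
        'sasl.kerberos.principal': os.environ['ztf_principle'],  # ENV ztf_principle
        'sasl.kerberos.keytab': os.environ['ztf_keytab_path'],   # ENV ztf_keytab_path
    }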
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+Cloud Configuration
+===================
+
+Before deploying a broker instance to the cloud, you will need to create and
+authenticate a new cloud project. This project, and its unique Id, will be
+used to organize the various resources used by the deployed system. For
+information on creating a new GCP project, see:
+`https://cloud.google.com/resource-manager/docs/creating-managing-projects <https://cloud.google.com/resource-manager/docs/creating-managing-projects>`_.
+
+Once your project has been created, take note of the unique project Id as it
+will be required at multiple points throughout the deployment process.
+
+Authenticate CLI Tools
+----------------------
+
+You will need to authenticate the ``gcloud`` command line tools
+so that they can access your Google Cloud project. This is accomplished using
+the project Id noted earlier:
+
+.. code-block:: bash
+
+   gcloud auth login                      # Login to GCP
+   gcloud config set project [PROJECT-ID] # Configure the project ID
+   gcloud auth configure-docker           # Allow access for deploying docker images
+
+Setting up GCP
+--------------
+
+You will need to set up a handful of tools in GCP. This includes enabling
+various APIs for use in your GCP project:
+
+.. code-block:: bash
+
+   gcloud services enable containerregistry.googleapis.com
+
+With the APIs enabled, the broker package provides
+an automated setup tool that creates the various resources required
+for the broker to run.
+
+.. code-block:: python
+   :linenos:
+
+   from broker.gcp_setup import auto_setup
+
+   # See a list of changes that will be made to your GCP project
+   help(auto_setup)
+
+   # Setup your GCP project
+   auto_setup()
+
+Deploying the ``stream_GCS_to_BQ`` Cloud Function
+-------------------------------------------------
+
+The ``stream_GCS_to_BQ`` function must be deployed from the command line as a
+Google Cloud Function so that it listens to the appropriate bucket(s) for new
+alert Avro files and appends the data to a BigQuery table. The Google Cloud SDK
+must be installed first (see :ref:`gcloud`). The following script automates the
+deployment. Note that it may take a couple of minutes to complete.
+
+.. code-block:: bash
+   :linenos:
+
+   ./broker/cloud_functions/GCS_to_BQ.sh
