Commit ba86a3c: Add support for Spark iceberg
1 parent f594dbd

22 files changed: +874 -64 lines

.github/workflows/pr_tests.yml (+46 -12)
@@ -59,9 +59,10 @@ jobs:
         # Run tests from integration_tests sub dir
         working-directory: ./integration_tests
     strategy:
+      fail-fast: false
       matrix:
         dbt_version: ["1.*"]
-        warehouse: ["postgres", "bigquery", "snowflake", "databricks", "redshift"] # TODO: Add RS self-hosted runner
+        warehouse: ["postgres", "bigquery", "snowflake", "databricks", "redshift", "spark_iceberg"] # TODO: Add RS self-hosted runner
     services:
       postgres:
         image: postgres:latest
@@ -82,7 +83,26 @@ jobs:
     steps:
       - name: Check out
        uses: actions/checkout@v3
-
+      - name: Configure Docker credentials
+        uses: docker/login-action@v2
+        with:
+          username: ${{ secrets.DOCKERHUB_SNOWPLOWCI_READ_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_SNOWPLOWCI_READ_PASSWORD }}
+      - name: Configure AWS credentials
+        uses: aws-actions/configure-aws-credentials@v1
+        with:
+          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          aws-region: eu-west-1
+      - name: Set warehouse variables
+        id: set_warehouse
+        run: |
+          WAREHOUSE_PLATFORM=$(echo ${{ matrix.warehouse }} | cut -d'_' -f1)
+          WAREHOUSE_SPECIFIC=$(echo ${{ matrix.warehouse }} | cut -s -d'_' -f2)
+          echo "WAREHOUSE_PLATFORM=${WAREHOUSE_PLATFORM}" >> $GITHUB_ENV
+          echo "WAREHOUSE_SPECIFIC=${WAREHOUSE_SPECIFIC}" >> $GITHUB_ENV
+          echo "warehouse_platform=${WAREHOUSE_PLATFORM}" >> $GITHUB_OUTPUT
+          echo "warehouse_specific=${WAREHOUSE_SPECIFIC}" >> $GITHUB_OUTPUT
       # Remove '*' and replace '.' with '_' in DBT_VERSION & set as SCHEMA_SUFFIX.
       # SCHEMA_SUFFIX allows us to run multiple versions of dbt in parallel without overwriting the output tables
       - name: Set SCHEMA_SUFFIX env
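
Note on the new "Set warehouse variables" step: cut -d'_' -f1 yields the platform (used to pick the dbt adapter and the pip cache key), while cut -s -d'_' -f2 yields the warehouse-specific suffix; the -s flag suppresses output when there is no underscore, so single-word warehouses get an empty WAREHOUSE_SPECIFIC. A minimal sketch of that behaviour in plain bash, outside the workflow:

#!/bin/bash
# Mirror the "Set warehouse variables" step for two matrix values.
for warehouse in spark_iceberg postgres; do
  WAREHOUSE_PLATFORM=$(echo $warehouse | cut -d'_' -f1)
  WAREHOUSE_SPECIFIC=$(echo $warehouse | cut -s -d'_' -f2)
  echo "warehouse=$warehouse platform=$WAREHOUSE_PLATFORM specific=$WAREHOUSE_SPECIFIC"
done
# Expected output:
# warehouse=spark_iceberg platform=spark specific=iceberg
# warehouse=postgres platform=postgres specific=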
@@ -92,7 +112,7 @@ jobs:

       - name: Set DEFAULT_TARGET env
         run: |
-          echo "DEFAULT_TARGET=${{ matrix.warehouse }}" >> $GITHUB_ENV
+          echo "DEFAULT_TARGET=${{matrix.warehouse}}" >> $GITHUB_ENV

       - name: Python setup
         uses: actions/setup-python@v4
@@ -103,32 +123,46 @@ jobs:
        uses: actions/cache@v3
        with:
          path: ~/.cache/pip
-         key: ${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{ matrix.warehouse }}
+         key: ${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{env.WAREHOUSE_PLATFORM}}
          restore-keys: |
-           ${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{ matrix.warehouse }}
+           ${{ runner.os }}-pip-${{ matrix.dbt_version }}-${{env.WAREHOUSE_PLATFORM}}

      # Install latest patch version. Upgrade if cache contains old patch version.
      - name: Install dependencies
        run: |
          pip install wheel setuptools
-         pip install -Iv dbt-${{ matrix.warehouse }}==${{ matrix.dbt_version }} --upgrade
+         pip install -Iv dbt-${{env.WAREHOUSE_PLATFORM}}==${{ matrix.dbt_version }} --upgrade
          dbt deps
-       if: ${{matrix.warehouse != 'spark'}}
+       if: ${{env.WAREHOUSE_PLATFORM != 'spark'}}

      - name: Install spark dependencies
        run: |
          pip install --upgrade pip wheel setuptools
-         pip install -Iv "dbt-${{ matrix.warehouse }}[ODBC]"==${{ matrix.dbt_version }} --upgrade
+         pip install -Iv "dbt-${{ env.WAREHOUSE_PLATFORM }}[PyHive]"==${{ matrix.dbt_version }} --upgrade
          dbt deps
-       if: ${{matrix.warehouse == 'spark'}}
+       if: ${{env.WAREHOUSE_PLATFORM == 'spark'}}
+
+     - name: Install Docker Compose
+       run: |
+         sudo curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o /usr/local/bin/docker-compose
+         sudo chmod +x /usr/local/bin/docker-compose

+
+     - name: Build and start Spark cluster
+       working-directory: .github/workflows/spark_deployment
+       run: |
+         docker-compose up -d
+         echo "Waiting for Spark services to start..."
+         sleep 90
+       if: ${{env.WAREHOUSE_PLATFORM == 'spark'}}
+
      - name: "Pre-test: Drop ci schemas"
        run: |
-         dbt run-operation post_ci_cleanup --target ${{ matrix.warehouse }}
+         dbt run-operation post_ci_cleanup --target ${{matrix.warehouse}}

      - name: Run tests
-       run: ./.scripts/integration_tests.sh -d ${{ matrix.warehouse }}
+       run: ./.scripts/integration_tests.sh -d ${{matrix.warehouse}}

      - name: "Post-test: Drop ci schemas"
        run: |
-         dbt run-operation post_ci_cleanup --target ${{ matrix.warehouse }}
+         dbt run-operation post_ci_cleanup --target ${{matrix.warehouse}}
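
Taken together, the new matrix entry installs dbt-spark with the PyHive extra, brings up the Dockerised Spark cluster defined under .github/workflows/spark_deployment (new files below), waits for the Thrift server, then runs the usual integration-test script. A rough local approximation, assuming Docker and AWS credentials are already configured; this is a sketch, not the workflow itself:

#!/bin/bash
set -euo pipefail

# Install dbt plus the Spark adapter with the PyHive extra, as the workflow does.
pip install --upgrade pip wheel setuptools
pip install "dbt-spark[PyHive]==1.*" --upgrade

# Bring up the Spark master, worker and Thrift server.
(cd .github/workflows/spark_deployment && docker-compose up -d)
echo "Waiting for Spark services to start..."
sleep 90   # crude wait, mirroring the workflow

# Run the integration tests against the new target.
cd integration_tests
dbt deps
./.scripts/integration_tests.sh -d spark_iceberg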
New file (+34): Dockerfile for the snowplow/spark-s3-iceberg image

FROM openjdk:11-jre-slim

# Set environment variables
ENV SPARK_VERSION=3.5.1
ENV HADOOP_VERSION=3.3.4
ENV ICEBERG_VERSION=1.4.2
ENV AWS_SDK_VERSION=1.12.581

# Install necessary tools
RUN apt-get update && apt-get install -y curl wget procps rsync ssh

# Download and install Spark
RUN wget https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    tar -xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz && \
    mv spark-${SPARK_VERSION}-bin-hadoop3 /spark && \
    rm spark-${SPARK_VERSION}-bin-hadoop3.tgz

# Set Spark environment variables
ENV SPARK_HOME=/spark
ENV PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin

# Download necessary JARs
RUN mkdir -p /spark/jars && \
    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/${ICEBERG_VERSION}/iceberg-spark-runtime-3.5_2.12-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-spark-runtime.jar && \
    wget https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -O /spark/jars/iceberg-aws-bundle.jar && \
    wget https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_VERSION}/hadoop-aws-${HADOOP_VERSION}.jar -O /spark/jars/hadoop-aws.jar && \
    wget https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/${AWS_SDK_VERSION}/aws-java-sdk-bundle-${AWS_SDK_VERSION}.jar -O /spark/jars/aws-java-sdk-bundle.jar

# Create directory for Spark events
RUN mkdir -p /tmp/spark-events

WORKDIR /spark

CMD ["bash"]
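
The image pins Spark 3.5.1 with the Iceberg 1.4.2 runtime, hadoop-aws and the AWS SDK bundle under /spark/jars. A quick local sanity check of a build might look like this (the dev tag is illustrative):

# Build the image locally and confirm the Spark version and bundled JARs.
docker build --platform linux/amd64 -t spark-s3-iceberg:dev .
docker run --rm spark-s3-iceberg:dev spark-submit --version
docker run --rm spark-s3-iceberg:dev ls /spark/jars | grep -E 'iceberg|aws'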
New file (+20): build-and-push script for the snowplow/spark-s3-iceberg image

#!/bin/bash

# Set variables
DOCKER_HUB_ORG="snowplow"
IMAGE_NAME="spark-s3-iceberg"
TAG="latest"

# Build the image
echo "Building Docker image..."
docker build --platform linux/amd64 -t $DOCKER_HUB_ORG/$IMAGE_NAME:$TAG .

# Log in to Docker Hub
echo "Logging in to Docker Hub..."
docker login

# Push the image to Docker Hub
echo "Pushing image to Docker Hub..."
docker push $DOCKER_HUB_ORG/$IMAGE_NAME:$TAG

echo "Image successfully built and pushed to Docker Hub"
New file (+66): Docker Compose definition for the Spark master, worker and Thrift server

version: '3'

networks:
  spark-network:
    driver: bridge

services:
  spark-master:
    image: snowplow/spark-s3-iceberg:latest
    command: ["/bin/bash", "-c", "/spark/sbin/start-master.sh -h spark-master --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.master.Master-1-*.out"]
    hostname: spark-master
    ports:
      - '8080:8080'
      - '7077:7077'
    environment:
      - SPARK_LOCAL_IP=spark-master
      - SPARK_MASTER_HOST=spark-master
      - SPARK_MASTER_PORT=7077
      - SPARK_MASTER_OPTS="-Dspark.driver.memory=2g"
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network

  spark-worker:
    image: snowplow/spark-s3-iceberg:latest
    command: ["/bin/bash", "-c", "sleep 10 && /spark/sbin/start-worker.sh spark://spark-master:7077 --properties-file /spark/conf/spark-defaults.conf && tail -f /spark/logs/spark--org.apache.spark.deploy.worker.Worker-*.out"]
    depends_on:
      - spark-master
    environment:
      - SPARK_WORKER_CORES=2
      - SPARK_WORKER_MEMORY=4G
      - SPARK_EXECUTOR_MEMORY=3G
      - SPARK_LOCAL_IP=spark-worker
      - SPARK_MASTER=spark://spark-master:7077
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network

  thrift-server:
    image: snowplow/spark-s3-iceberg:latest
    command: ["/bin/bash", "-c", "sleep 30 && /spark/sbin/start-thriftserver.sh --master spark://spark-master:7077 --driver-memory 2g --executor-memory 3g --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.server2.thrift.bind.host=0.0.0.0 --conf spark.sql.hive.thriftServer.async=true --conf spark.sql.hive.thriftServer.workerQueue.size=2000 --conf spark.sql.hive.thriftServer.maxWorkerThreads=100 --conf spark.sql.hive.thriftServer.minWorkerThreads=50 && tail -f /spark/logs/spark--org.apache.spark.sql.hive.thriftserver.HiveThriftServer2-*.out"]
    ports:
      - '10000:10000'
    depends_on:
      - spark-master
      - spark-worker
    environment:
      - SPARK_LOCAL_IP=thrift-server
      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
      - AWS_REGION=eu-west-1
      - AWS_DEFAULT_REGION=eu-west-1
    volumes:
      - ./spark-defaults.conf:/spark/conf/spark-defaults.conf
    networks:
      - spark-network
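
The Thrift server on port 10000 is what dbt connects to; the master UI is exposed on 8080. A minimal smoke test after bringing the stack up, using the beeline client bundled with the Spark distribution (the sleep length is arbitrary):

# Start the cluster and open a JDBC session against the Thrift server.
docker-compose up -d
sleep 90
docker-compose exec thrift-server /spark/bin/beeline \
  -u jdbc:hive2://localhost:10000 \
  -e 'SHOW DATABASES;'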
New file (+44): spark-defaults.conf shared by all three services

spark.master                            spark://spark-master:7077

spark.sql.warehouse.dir                 s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue                  org.apache.iceberg.spark.SparkCatalog
spark.sql.catalog.glue.catalog-impl     org.apache.iceberg.aws.glue.GlueCatalog
spark.sql.catalog.glue.warehouse        s3a://dbt-spark-iceberg/github-integration-testing
spark.sql.catalog.glue.io-impl          org.apache.iceberg.aws.s3.S3FileIO
spark.sql.defaultCatalog                glue
spark.sql.catalog.glue.database         dbt-spark-iceberg

spark.hadoop.fs.s3a.impl                org.apache.hadoop.fs.s3a.S3AFileSystem
spark.hadoop.fs.s3a.access.key          <AWS_ACCESS_KEY_ID>
spark.hadoop.fs.s3a.secret.key          <AWS_SECRET_ACCESS_KEY>
spark.hadoop.fs.s3a.endpoint            s3.eu-west-1.amazonaws.com
spark.hadoop.fs.s3a.path.style.access   true
spark.hadoop.fs.s3a.region              eu-west-1
spark.hadoop.fs.s3a.aws.region          eu-west-1

# Enabling AWS SDK V4 signing (required for regions launched after January 2014)
spark.hadoop.com.amazonaws.services.s3.enableV4   true
spark.hadoop.fs.s3a.aws.credentials.provider      org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider

# Hive Metastore Configuration (using AWS Glue)
spark.hadoop.hive.metastore.client.factory.class  com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory

# Thrift Server Configuration for better performance in concurrent environments
spark.sql.hive.thriftServer.singleSession   false
spark.sql.hive.thriftServer.async           true
# spark.sql.hive.thriftServer.maxWorkerThreads  100
# spark.sql.hive.thriftServer.minWorkerThreads  50
# spark.sql.hive.thriftServer.workerQueue.size  2000

# Memory and Performance Tuning
# spark.driver.memory    2g
# spark.executor.memory  3g
# spark.worker.memory    4g
spark.network.timeout        600s
spark.sql.broadcastTimeout   600s
spark.sql.adaptive.enabled   true
spark.serializer             org.apache.spark.serializer.KryoSerializer

# Logging and Debugging
spark.eventLog.enabled   true
spark.eventLog.dir       /tmp/spark-events
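
Because spark.sql.defaultCatalog points at the Glue-backed Iceberg catalog, unqualified tables created through this cluster are written to the S3 warehouse as Iceberg tables. A hedged example using spark-sql inside the master container; the test_db schema and smoke_test table names are made up for illustration:

# Create, populate and read back an Iceberg table through the Glue catalog.
docker-compose exec spark-master /spark/bin/spark-sql \
  --properties-file /spark/conf/spark-defaults.conf \
  -e "CREATE DATABASE IF NOT EXISTS test_db;
      CREATE TABLE IF NOT EXISTS test_db.smoke_test (id INT) USING iceberg;
      INSERT INTO test_db.smoke_test VALUES (1);
      SELECT * FROM test_db.smoke_test;"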

integration_tests/.scripts/integration_tests.sh (+1 -1)

@@ -10,7 +10,7 @@ do
   esac
 done

-declare -a SUPPORTED_DATABASES=("bigquery" "databricks" "postgres" "redshift" "snowflake")
+declare -a SUPPORTED_DATABASES=("bigquery" "databricks" "postgres" "redshift" "snowflake" "spark_iceberg")

 # set to lower case
 DATABASE="$(echo $DATABASE | tr '[:upper:]' '[:lower:]')"
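
With spark_iceberg added to SUPPORTED_DATABASES, the script accepts the new target like any other (the argument is lower-cased by the script, so case does not matter):

cd integration_tests
./.scripts/integration_tests.sh -d spark_iceberg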

integration_tests/ci/profiles.yml (+8 -9)

@@ -47,7 +47,6 @@ integration_tests:
       token_uri: "{{ env_var('BIGQUERY_SERVICE_TOKEN_URI') }}"
       auth_provider_x509_cert_url: "{{ env_var('BIGQUERY_SERVICE_AUTH_PROVIDER_X509_CERT_URL') }}"
       client_x509_cert_url: "{{ env_var('BIGQUERY_SERVICE_CLIENT_X509_CERT_URL') }}"
-
     snowflake:
       type: snowflake
       account: "{{ env_var('SNOWFLAKE_TEST_ACCOUNT') }}"
@@ -58,20 +57,20 @@ integration_tests:
       warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}"
       schema: "gh_sp_utils_dbt_{{ env_var('SCHEMA_SUFFIX') }}"
       threads: 4
-
     databricks:
       type: databricks
       schema: "gh_sp_utils_dbt_{{ env_var('SCHEMA_SUFFIX') }}"
       host: "{{ env_var('DATABRICKS_TEST_HOST') }}"
       http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}"
       token: "{{ env_var('DATABRICKS_TEST_TOKEN') }}"
       threads: 4
-
-    spark:
+    spark_iceberg:
       type: spark
+      method: thrift
+      host: "{{ env_var('SPARK_MASTER_HOST', 'localhost') }}"
+      port: 10000
+      user: "{{ env_var('SPARK_USER', 'spark') }}"
       schema: "gh_sp_utils_dbt_{{ env_var('SCHEMA_SUFFIX') }}"
-      host: "{{ env_var('DATABRICKS_TEST_HOST') }}"
-      http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}"
-      token: "{{ env_var('DATABRICKS_TEST_TOKEN') }}"
-      endpoint: "{{ env_var('DATABRICKS_TEST_ENDPOINT') }}"
-      threads: 4
+      connect_retries: 5
+      connect_timeout: 60
+      threads: 1
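
The new spark_iceberg target connects to the Compose cluster over the thrift method, defaulting to localhost and the spark user when SPARK_MASTER_HOST and SPARK_USER are unset; the retries, longer timeout and single thread keep the small local Thrift server from being overwhelmed. A quick connection check from the integration_tests directory, with placeholder values for the exported variables:

# ./ci is the directory that holds this profiles.yml.
export SCHEMA_SUFFIX=local_test        # placeholder value
export SPARK_MASTER_HOST=localhost
export SPARK_USER=spark
dbt debug --profiles-dir ./ci --target spark_iceberg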
