Merge pull request #16 from guimou/dev
Spark notebook
Showing 11 changed files with 411 additions and 11 deletions.
@@ -0,0 +1,13 @@
```makefile
.PHONY: default ubi9-py39

TAG ?= $(shell git describe --tags --always --dirty)

default:
	@echo "Options are:"
	@echo "ubi9-py39 : builds an image based on UBI9 with Python 3.9"
	@echo "---"
	@echo "Please specify the base image with BASE_IMAGE=..."
	@echo "Please specify the image tag with TAG=..."

ubi9-py39:
	cd container && podman build --build-arg=BASE_IMAGE=${BASE_IMAGE} --build-arg=UBI_VERSION=ubi9 --build-arg=PYTHON_VERSION=py39 --build-arg=PYTHON_VERSION_LONG="Python 3.9" -t s2i-spark-notebook-ubi9-py39:${TAG} .
```
@@ -0,0 +1,13 @@

# S2I Spark image

Notebook based on the Data Science notebook, adding:

* [Spark](https://spark.apache.org/) ([GitHub repo](https://github.com/apache/spark))

Apache Spark™ is a multi-language engine for executing data engineering, data science, and machine learning on single-node machines or clusters.

Build example from UBI9 with Python 3.9:

```bash
make ubi9-py39 BASE_IMAGE=localhost/s2i-datascience-notebook-ubi9-py39:0.0.1 TAG=0.0.1
```
notebook-controller-images/s2i-spark/container/Dockerfile (118 additions, 0 deletions)
@@ -0,0 +1,118 @@ | ||
ARG BASE_IMAGE | ||
FROM ${BASE_IMAGE} as builder | ||
|
||
# Build options | ||
ARG SPARK_VERSION=3.3.1 | ||
ARG HADOOP_VERSION=3.3.4 | ||
ARG JMX_PROMETHEUS_JAVAAGENT_VERSION=0.17.0 | ||
# Spark's Guava version to match with Hadoop's | ||
ARG GUAVA_VERSION=27.0-jre | ||
|
||
USER 0 | ||
|
||
WORKDIR / | ||
|
||
# Install gzip to extract archives | ||
RUN dnf install -y gzip && \ | ||
dnf clean all | ||
|
||
# Download Spark | ||
ADD https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-without-hadoop.tgz . | ||
# Unzip Spark | ||
RUN tar -xvzf spark-${SPARK_VERSION}-bin-without-hadoop.tgz --no-same-owner | ||
RUN mv spark-${SPARK_VERSION}-bin-without-hadoop spark | ||
|
||
# Download Hadoop | ||
ADD https://archive.apache.org/dist/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz . | ||
# Unzip Hadoop | ||
RUN tar -xvzf hadoop-${HADOOP_VERSION}.tar.gz --no-same-owner | ||
RUN mv hadoop-${HADOOP_VERSION} hadoop | ||
# Delete unnecessary hadoop documentation | ||
RUN rm -rf hadoop/share/doc | ||
|
||
# Download JMX Prometheus javaagent jar | ||
ADD https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/${JMX_PROMETHEUS_JAVAAGENT_VERSION}/jmx_prometheus_javaagent-${JMX_PROMETHEUS_JAVAAGENT_VERSION}.jar /prometheus/ | ||
RUN chmod 0644 prometheus/jmx_prometheus_javaagent*.jar | ||
|
||
# Add updated Guava | ||
WORKDIR /spark/jars | ||
RUN rm -f guava-*.jar | ||
ADD https://repo1.maven.org/maven2/com/google/guava/guava/${GUAVA_VERSION}/guava-${GUAVA_VERSION}.jar . | ||
|
||
# Add Spark Hadoop Cloud to interact with cloud infrastructures | ||
ADD https://repo1.maven.org/maven2/org/apache/spark/spark-hadoop-cloud_2.12/${SPARK_VERSION}/spark-hadoop-cloud_2.12-${SPARK_VERSION}.jar . | ||
|
||
### Build final image | ||
FROM ${BASE_IMAGE} | ||
|
||
ARG UBI_VERSION | ||
ARG PYTHON_VERSION | ||
ARG PYTHON_VERSION_LONG | ||
ARG JAVA_VERSION=1.8.0 | ||
ARG PKG_ROOT=/opt/app-root | ||
ARG SPARK_VERSION=3.3.1 | ||
ARG HADOOP_VERSION=3.3.4 | ||
ARG JMX_PROMETHEUS_JAVAAGENT_VERSION=0.17.0 | ||
|
||
LABEL name="s2i-aprk-${UBI_VERSION}-${PYTHON_VERSION}:latest" \ | ||
summary="S2I Spark with ${PYTHON_VERSION_LONG} image based on ${UBI_VERSION}" \ | ||
description="Spark notebook with ${PYTHON_VERSION_LONG}, Source-to-Image from ${UBI_VERSION}" \ | ||
io.k8s.description="Spark notebook with ${PYTHON_VERSION_LONG}, Source-to-Image from ${UBI_VERSION}" \ | ||
io.k8s.display-name="S2I Spark notebook with ${PYTHON_VERSION_LONG} ${UBI_VERSION} image" \ | ||
authoritative-source-url="https://github.com/guimou/custom-notebooks" \ | ||
io.openshift.s2i.build.commit.ref="main" \ | ||
io.openshift.s2i.build.source-location="https://github.com/guimou/custom-notebooks/notebook-controller-images/s2i-spark" \ | ||
io.openshift.s2i.build.image="https://quay.io/guimou/s2i-spark-${UBI_VERSION}-${PYTHON_VERSION}" | ||
|
||
USER 0 | ||
|
||
WORKDIR ${PKG_ROOT}/${SPARK_VERSION} | ||
|
||
#################### | ||
# OpenJDK # | ||
#################### | ||
|
||
# Fix for https://issues.redhat.com/browse/OPENJDK-335 | ||
ENV NSS_WRAPPER_PASSWD= | ||
ENV NSS_WRAPPER_GROUP= | ||
|
||
RUN yum -y install java-$JAVA_VERSION-openjdk maven &&\ | ||
yum clean all | ||
|
||
#################### | ||
# Spark # | ||
#################### | ||
|
||
# Copy Spark from builder stage | ||
COPY --from=builder /spark ${PKG_ROOT}/spark-${SPARK_VERSION} | ||
COPY --from=builder /spark/kubernetes/dockerfiles/spark/entrypoint.sh /opt/app-root/bin | ||
|
||
# Copy Hadoop from builder stage | ||
COPY --from=builder /hadoop ${PKG_ROOT}/hadoop-${HADOOP_VERSION} | ||
|
||
# Copy Prometheus jars from builder stage | ||
COPY --from=builder /prometheus ${PKG_ROOT}/prometheus | ||
|
||
RUN chown -R 1001:0 ${PKG_ROOT} | ||
|
||
# Setup required env vars for spark and hadoop | ||
ENV JAVA_HOME=/usr/lib/jvm/jre | ||
ENV SPARK_HOME=${PKG_ROOT}/spark-${SPARK_VERSION} | ||
ENV HADOOP_HOME ${PKG_ROOT}/hadoop-${HADOOP_VERSION} | ||
|
||
ENV SPARK_DIST_CLASSPATH="$HADOOP_HOME/etc/hadoop:$HADOOP_HOME/share/hadoop/common/lib/*:$HADOOP_HOME/share/hadoop/common/*:$HADOOP_HOME/share/hadoop/hdfs:$HADOOP_HOME/share/hadoop/hdfs/lib/*:$HADOOP_HOME/share/hadoop/hdfs/*:$HADOOP_HOME/share/hadoop/yarn:$HADOOP_HOME/share/hadoop/yarn/lib/*:$HADOOP_HOME/share/hadoop/yarn/*:$HADOOP_HOME/share/hadoop/mapreduce/lib/*:$HADOOP_HOME/share/hadoop/mapreduce/*:/contrib/capacity-scheduler/*.jar:$HADOOP_HOME/share/hadoop/tools/lib/*" | ||
|
||
ENV SPARK_EXTRA_CLASSPATH="$SPARK_DIST_CLASSPATH" | ||
|
||
ENV LD_LIBRARY_PATH /lib64 | ||
|
||
ENV PATH="${PATH}:${PKG_ROOT}/spark-${SPARK_VERSION}/bin" | ||
|
||
WORKDIR /opt/app-root/src | ||
USER 1001 | ||
|
||
RUN pip install pyspark==3.3.1 | ||
|
||
RUN fix-permissions ${PKG_ROOT} | ||
|
||
CMD /opt/app-root/bin/start-singleuser.sh |
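To sanity-check that the pinned versions and environment variables line up, a quick check you could run from a Python session inside the built image might look like the sketch below; the expected values simply restate what this Dockerfile sets, and nothing else is assumed.

```python
import os
import pyspark

# The pip-installed PySpark is pinned to the same version as the Spark
# distribution copied from the builder stage (3.3.1 in this Dockerfile).
print(pyspark.__version__)             # expected: 3.3.1
print(os.environ.get("SPARK_HOME"))    # expected: /opt/app-root/spark-3.3.1
print(os.environ.get("HADOOP_HOME"))   # expected: /opt/app-root/hadoop-3.3.4
print(os.environ.get("JAVA_HOME"))     # expected: /usr/lib/jvm/jre
```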
@@ -0,0 +1,18 @@

# Spark/PySpark image

This is the Data Science notebook, enhanced with Spark capabilities.
## Standard (local) use

You can use it as a standard notebook and run Spark/PySpark code with a local Spark instance, as in the sketch below.
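A minimal sketch of the local case: the app name and the toy DataFrame are purely illustrative, and the only assumption is that `pyspark` is importable, which this image provides.

```python
from pyspark.sql import SparkSession

# Local mode: the driver and the executors all run inside the notebook
# container, using as many worker threads as there are available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("local-example")  # illustrative name only
    .getOrCreate()
)

# Tiny illustrative DataFrame; replace with your own data.
df = spark.createDataFrame(
    [("alpha", 1), ("beta", 2), ("gamma", 3)],
    ["label", "value"],
)
df.groupBy().sum("value").show()

spark.stop()  # release the local Spark resources when done
```

In local mode nothing is submitted to Kubernetes; everything runs inside the notebook pod.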
## Spark-submit to Kubernetes

Spark has built-in Kubernetes support, so you can launch a Spark job or a Spark cluster directly from your notebook. This only requires setting up specific rights for the ServiceAccount that will be used to launch the driver and/or executor pods.
In the `example` folder you will find:

* `rbac.yaml`: defines a Role that you can create in the namespace where your notebook is running (default `rhods-notebooks`), and a RoleBinding that applies this Role to the ServiceAccount used to launch the driver and the executors. To enable this feature, you only have to replace the `serviceaccount_name` entries with the name of the ServiceAccount used to launch your notebook. With recent versions of ODH or RHODS, this will be something like `jupyter-nb-username`, where the username part is the name you authenticated with. You can also apply the Role to groups if you want to enable the feature for multiple users at once.
* Two notebooks with standard Spark (jars) and PySpark examples. They launch spark-submit commands in client mode. As the driver runs directly inside the container image, the executors are automatically shut down when the notebook (or the kernel) is stopped, which cleans up the cluster even if you forget to shut down your Spark instance.
Note: as the Role can be applied on a per-user or per-group basis, you can control who is allowed to launch Spark jobs this way. You can also enable it only in specific namespaces (the Workspaces from the Data Science Projects feature), and then set quotas on those namespaces to prevent users from consuming too many resources.
notebook-controller-images/s2i-spark/example/pyspark-submit-example.ipynb (131 additions, 0 deletions)
@@ -0,0 +1,131 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "fd4144d6-58cd-46af-abba-8d6e6e62a7f6", | ||
"metadata": {}, | ||
"source": [ | ||
"#### We must first initialize some variables for the current environment" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d5eba711-079b-4fcd-8e6a-4b82557a34d1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"import socket\n", | ||
"hostname=socket.gethostname()\n", | ||
"IPAddr=socket.gethostbyname(hostname)\n", | ||
"\n", | ||
"with open('/var/run/secrets/kubernetes.io/serviceaccount/namespace', 'r') as f:\n", | ||
" current_namespace = f.readline()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "be8cf6b2-4a71-4668-801a-19204a370e78", | ||
"metadata": {}, | ||
"source": [ | ||
"#### We can then launch a Spark job directly from the notebook" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "9b2882d4-0e37-4834-81e0-56ff5421ad14", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pyspark import SparkContext, SparkConf\n", | ||
"from pyspark.sql import SparkSession\n", | ||
"\n", | ||
"# Create Spark config for our Kubernetes based cluster manager\n", | ||
"sparkConf = SparkConf()\n", | ||
"sparkConf.setMaster(\"k8s://https://\" + os.environ[\"KUBERNETES_SERVICE_HOST\"] + \":443\")\n", | ||
"sparkConf.set(\"spark.submit.deployMode\",\"client\")\n", | ||
"sparkConf.set(\"spark.kubernetes.container.image\", \"quay.io/opendatahub-contrib/pyspark:s3.3.1-h3.3.4_v0.1.1\")\n", | ||
"sparkConf.set(\"spark.kubernetes.namespace\", current_namespace)\n", | ||
"sparkConf.set(\"spark.driver.host\", IPAddr)\n", | ||
"sparkConf.set(\"spark.executor.instances\", \"3\")\n", | ||
"sparkConf.set(\"spark.executor.memory\", \"512m\")\n", | ||
"sparkConf.set(\"spark.executor.cores\", \"1\")\n", | ||
"sparkConf.set(\"spark.kubernetes.pyspark.pythonVersion\", \"3\")\n", | ||
"# Initialize our Spark cluster, this will actually\n", | ||
"# generate the worker nodes.\n", | ||
"spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()\n", | ||
"sc = spark.sparkContext" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "206476f5-7fb2-4be1-8da9-53f00619f57a", | ||
"metadata": {}, | ||
"source": [ | ||
"#### You can now launch jobs directly from your notebook" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d50346bb-ac3f-4e79-9b5f-e20253bfc678", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from random import random\n", | ||
"from operator import add\n", | ||
"\n", | ||
"partitions = 7\n", | ||
"n = 10000000 * partitions\n", | ||
"def f(_):\n", | ||
" x = random() * 2 - 1\n", | ||
" y = random() * 2 - 1\n", | ||
" \n", | ||
" return 1 if x ** 2 + y ** 2 <= 1 else 0\n", | ||
"count = sc.parallelize(range(1, n + 1), partitions).map(f).reduce(add)\n", | ||
"print(\"Pi is roughly %f\" % (4.0 * count / n))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "1a32946f-2e88-429b-a5f1-b5f38ee5ec50", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Don't forget to shut down your cluster!" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "2a5f9cdc-f02e-4745-945c-af70375549a5", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"sc.stop()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3.9.10", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |