From 0e60b2f5cd8215602f645102c1c9c5d0102cc2a2 Mon Sep 17 00:00:00 2001
From: Ricardo Martinelli de Oliveira
Date: Wed, 17 Nov 2021 15:25:14 -0300
Subject: [PATCH] Upgrade to Spark 3.2.0

---
 image.yaml                                    |   3 +-
 modules/spark/module.yaml                     |   4 +-
 openshift-spark-build/Dockerfile              |  55 +-----
 .../modules/s2i/added/assemble                | 158 ------------------
 .../modules/s2i/added/s2i-env-vars            |   3 -
 openshift-spark-build/modules/s2i/added/usage |  79 ---------
 openshift-spark-build/modules/s2i/install     |   9 -
 openshift-spark-build/modules/s2i/module.yaml |  30 ----
 .../modules/spark/module.yaml                 |   9 +-
 ...p3.2.tgz => spark-3.2.0-bin-hadoop3.2.tgz} |   0
 10 files changed, 15 insertions(+), 335 deletions(-)
 delete mode 100755 openshift-spark-build/modules/s2i/added/assemble
 delete mode 100644 openshift-spark-build/modules/s2i/added/s2i-env-vars
 delete mode 100755 openshift-spark-build/modules/s2i/added/usage
 delete mode 100644 openshift-spark-build/modules/s2i/install
 delete mode 100644 openshift-spark-build/modules/s2i/module.yaml
 rename openshift-spark-build/{spark-3.0.1-bin-hadoop3.2.tgz => spark-3.2.0-bin-hadoop3.2.tgz} (100%)

diff --git a/image.yaml b/image.yaml
index bd4e89f..5b1c29d 100644
--- a/image.yaml
+++ b/image.yaml
@@ -12,7 +12,7 @@ labels:
   - name: maintainer
     value: Trevor McKay
   - name: sparkversion
-    value: 3.0.1
+    value: 3.2.0
   - name: "io.openshift.s2i.scripts-url"
     value: "image:///usr/libexec/s2i"
 
@@ -29,7 +29,6 @@ modules:
     - name: common
     - name: metrics
     - name: spark
-    - name: s2i
 run:
   user: 185
   entrypoint:
diff --git a/modules/spark/module.yaml b/modules/spark/module.yaml
index fe0dc38..e9cfbf2 100644
--- a/modules/spark/module.yaml
+++ b/modules/spark/module.yaml
@@ -12,7 +12,7 @@ packages:
   install:
     - wget
 artifacts:
-  - url: https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop3.2.tgz
-    md5: 31e019e35e75a4c55c7efa4464641bf1
+  - url: https://archive.apache.org/dist/spark/spark-3.2.0/spark-3.2.0-bin-hadoop3.2.tgz
+    md5: 2f28abfe7b8838f01b010fc22ccad155
 execute:
   - script: install
diff --git a/openshift-spark-build/Dockerfile b/openshift-spark-build/Dockerfile
index 1bec161..ac783cf 100644
--- a/openshift-spark-build/Dockerfile
+++ b/openshift-spark-build/Dockerfile
@@ -1,19 +1,4 @@
-# Copyright 2019 Red Hat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# ------------------------------------------------------------------------
-#
+
 # This is a Dockerfile for the radanalyticsio/openshift-spark:3.0 image.
 
 
@@ -21,6 +6,7 @@
 ## \
     FROM centos:8
 
+    USER root
 
 ###### START module 'common:1.0'
@@ -53,9 +39,9 @@
 
 ###### START module 'spark:1.0'
 ###### \
-    # Copy 'spark' module general artifacts
+    # Copy 'spark' module general artifacts to '/tmp/artifacts/' destination
     COPY \
-        spark-3.0.1-bin-hadoop3.2.tgz \
+        spark-3.2.0-bin-hadoop3.2.tgz \
         /tmp/artifacts/
     # Copy 'spark' module content
     COPY modules/spark /tmp/scripts/spark
@@ -75,27 +61,6 @@
 ###### /
 ###### END module 'spark:1.0'
 
-###### START module 's2i:1.0'
-###### \
-    # Copy 's2i' module content
-    COPY modules/s2i /tmp/scripts/s2i
-    # Switch to 'root' user to install 's2i' module defined packages
-    USER root
-    # Install packages defined in the 's2i' module
-    RUN yum --setopt=tsflags=nodocs install -y wget \
-        && rpm -q wget
-    # Set 's2i' module defined environment variables
-    ENV \
-        PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/spark/bin" \
-        SPARK_HOME="/opt/spark" \
-        SPARK_INSTALL="/opt/spark-distro" \
-        STI_SCRIPTS_PATH="/usr/libexec/s2i"
-    # Custom scripts from 's2i' module
-    USER root
-    RUN [ "sh", "-x", "/tmp/scripts/s2i/install" ]
-###### /
-###### END module 's2i:1.0'
-
 ###### START image 'radanalyticsio/openshift-spark:3.0'
 ###### \
     # Switch to 'root' user to install 'radanalyticsio/openshift-spark' image defined packages
@@ -103,27 +68,21 @@
     # Install packages defined in the 'radanalyticsio/openshift-spark' image
     RUN yum --setopt=tsflags=nodocs install -y java-11-openjdk python3-numpy \
         && rpm -q java-11-openjdk python3-numpy
-    # Set 'radanalyticsio/openshift-spark' image defined environment variables
-    ENV \
-        JBOSS_IMAGE_NAME="radanalyticsio/openshift-spark" \
-        JBOSS_IMAGE_VERSION="3.0"
     # Set 'radanalyticsio/openshift-spark' image defined labels
     LABEL \
-          io.cekit.version="3.6.0" \
+          io.cekit.version="3.12.0" \
           io.openshift.s2i.scripts-url="image:///usr/libexec/s2i" \
           maintainer="Trevor McKay " \
-          name="radanalyticsio/openshift-spark" \
-          sparkversion="3.0.1" \
-          version="3.0"
+          sparkversion="3.2.0"
 
 ###### /
 ###### END image 'radanalyticsio/openshift-spark:3.0'
+
 
     # Switch to 'root' user and remove artifacts and modules
     USER root
     RUN [ ! -d /tmp/scripts ] || rm -rf /tmp/scripts
     RUN [ ! -d /tmp/artifacts ] || rm -rf /tmp/artifacts
-
     # Clear package manager metadata
     RUN yum clean all && [ ! -d /var/cache/yum ] || rm -rf /var/cache/yum
diff --git a/openshift-spark-build/modules/s2i/added/assemble b/openshift-spark-build/modules/s2i/added/assemble
deleted file mode 100755
index 5567cea..0000000
--- a/openshift-spark-build/modules/s2i/added/assemble
+++ /dev/null
@@ -1,158 +0,0 @@
-#!/bin/bash
-
-source $STI_SCRIPTS_PATH/s2i-env-vars
-
-# Just a word about the directory structure
-# SPARK_HOME == /opt/spark
-# SPARK_INSTALL == /opt/spark-distro
-
-# Extra things like default configuration files and additional
-# boot scripts may be stored in SPARK_INSTALL
-
-# At runtime, /opt/spark is a symlink to /opt/spark-distro/distro
-# but /opt/spark-distro/distro does not actually exist yet
-
-# The Spark tarball will be expanded in /opt/spark-distro using
-# it's original name, for example /opt/spark-distro/spark-2.3.0-bin-hadoop2.7,
-# as a dev aid to tracking and version checking
-
-# Ultimately, /opt/spark-distro/distro is created as a symlink to the Spark root
-# directory. This double-hop from /opt/spark to the Spark root through symlinks
-# allows the Spark installation to be staged in the base image but completed in
-# the S2I build without expanding permissions
-
-function match_sum {
-    local sumfile=$1
-    local delim=$2
-    local sha512=$3
-    local initial=$(cat $sumfile | tr -d [:space:] | cut -d"$delim" -f1 | tr [:upper:] [:lower:])
-    local rest=$(cat $sumfile | tr -d [:space:] | cut -d"$delim" --complement -f1 | tr [:upper:] [:lower:])
-    if [ "$sha512" == "$initial" ] || [ "$sha512" == "$rest" ]; then
-        return 0
-    fi
-    return 1
-}
-
-if [ -f $SPARK_HOME/bin/spark-submit ]; then
-    echo "Spark is installed, nothing to do"
-    exit 1
-else
-    echo "Attempting to install Spark"
-    # If a url has been specfified for spark use it
-    if [ -n "$SPARK_URL" ]; then
-        echo Downloading $SPARK_URL
-        wget $SPARK_URL -P $S2I_SOURCE_DIR
-    fi
-    if [ -n "$SPARK_SHA512_URL" ]; then
-        echo Downloading $SPARK_SHA512_URL
-        wget $SPARK_SHA512_URL -P $S2I_SOURCE_DIR
-    fi
-
-    for spark in $(ls "$S2I_SOURCE_DIR"); do
-
-        spark=$S2I_SOURCE_DIR/$spark
-        echo Found $spark
-        echo Checking for valid Spark archive
-
-        # Is the file a directory? If it contains spark-submit, move it
-        if [ -d "$spark" ]; then
-            if ! [ -f $spark/bin/spark-submit ]; then
-                echo Ignoring directory $spark, no spark-submit
-                continue
-            fi
-            echo Installing from directory $spark
-            sparkdir=$SPARK_INSTALL/$(basename $spark)
-            mv $spark $SPARK_INSTALL
-        else
-            # If we can get the table of contents, it's a tar archive, otherwise ignore
-            tar -tf $spark &> /dev/null
-            if [ "$?" -ne 0 ]; then
-                echo Ignoring $spark, not a tar archive
-                continue
-            fi
-            echo Validating tar archive $spark
-
-            # Does the tarball contain a spark-submit?
-            name=$(tar -tzf $spark | grep -e "^[^/]*/bin/spark-submit$")
-            if [ "$?" -ne 0 ]; then
-                echo Ignoring tarball $spark, no spark-submit
-                continue
-            else
-                echo Found valid tar archive, matching checksums
-                # See if we have an sha512 file to match against
-                if [ -f "$spark".sha512 ]; then
-                    calcvalue=$(sha512sum "$spark" | cut -d\  -f1)
-                    # split the sha512 file using a colon
-                    match_sum "$spark".sha512 \: $calcvalue
-                    matched="$?"
-                    if [ "$matched" -ne 0 ]; then
-                        # split the sha512 file using equals sign in case it's BSD
-                        match_sum "$spark".sha512 \= $calcvalue
-                        matched="$?"
-                    fi
-                    if [ "$matched" -ne 0 ]; then
-                        echo Ignoring tarball $spark, sha512sum did not match
-                        continue
-                    fi
-                fi
-
-                # dname will be the intial directory from the path of spark-submit
-                # we found in the tarball, ie the dir created by tar
-                echo Installing from tarball $spark
-                dname=$(dirname $name | cut -d/ -f 1)
-                sparkdir=$SPARK_INSTALL/$dname
-                tar -xzf $spark -C $SPARK_INSTALL
-            fi
-        fi
-
-        ln -s $sparkdir $SPARK_INSTALL/distro
-
-        # Search for the spark entrypoint file and copy it to $SPARK_INSTALL
-        entry=$(find $sparkdir/kubernetes -name entrypoint.sh)
-        if [ -n "$entry" ]; then
-            echo Installing spark native entrypoint for use with spark-on-k8s commands
-            cp $entry $SPARK_INSTALL
-
-            # We want to get rid of the tini invocation
-            sed -i "s@exec .*/tini -s --@exec@" $SPARK_INSTALL/entrypoint.sh
-        else
-            echo No spark native entrypoint found for use with spark-on-k8s commands
-        fi
-
-        # Include the default spark configuration files
-        mv --no-clobber "$SPARK_INSTALL"/conf/* "$SPARK_HOME"/conf/
-
-        # If someone included mods in a parallel directory, install them with rsync
-        # Don't try to preserve permisions, owner, or group because we don't have
-        # any control over how s2i uploaded the files, so there's no use preserving.
-        if [ -x /usr/bin/rsync ] && [ -d "$S2I_SOURCE_DIR/modify-spark" ]; then
-            echo Found a modify-spark directory, running rsync to install changes
-            rsync -vrltD "$S2I_SOURCE_DIR/modify-spark/" $SPARK_HOME
-        fi
-
-        # Spark workers need to write to the spark directory to track apps
-        chmod -R g+rwX $sparkdir
-
-        # Can we run spark-submit?
-        $SPARK_HOME/bin/spark-submit --version
-        if [ "$?" -eq 0 ]; then
-            echo Spark installed successfully
-            exit 0
-        else
-            echo Cannot run spark-submit, Spark install failed
-        fi
-
-        # Just in case there is more than one tarball, clean up
-        rm -rf $sparkdir
-    done
-
-    echo no valid Spark distribution found
-
-    if [ -n "$DEBUG_ASSEMBLE" ]; then
-        echo Looping forever so you can \'oc rsh\'
-        while true; do
-            sleep 5
-        done
-    fi
-    exit 1
-fi
diff --git a/openshift-spark-build/modules/s2i/added/s2i-env-vars b/openshift-spark-build/modules/s2i/added/s2i-env-vars
deleted file mode 100644
index 9bdd8ca..0000000
--- a/openshift-spark-build/modules/s2i/added/s2i-env-vars
+++ /dev/null
@@ -1,3 +0,0 @@
-# Local vars setup with defaults
-S2I_DESTINATION=${S2I_DESTINATION:-/tmp}
-S2I_SOURCE_DIR="${S2I_DESTINATION}/src"
diff --git a/openshift-spark-build/modules/s2i/added/usage b/openshift-spark-build/modules/s2i/added/usage
deleted file mode 100755
index 4eab57f..0000000
--- a/openshift-spark-build/modules/s2i/added/usage
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/bin/bash
-if [ -f "$SPARK_HOME"/bin/spark-submit ]; then
-cat <