Merge pull request #593 from NVIDIA/branch-24.02

release 24.02 [skip ci]
NVIDIA · Mar 12, 2024 · e0f644d · e0f644d
2 parents e8d138b + 4030ccb
commit e0f644d
Show file tree

Hide file tree

Showing 63 changed files with 1,706 additions and 193 deletions.
diff --git a/.github/workflows/auto-merge.yml b/.github/workflows/auto-merge.yml
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@ name: auto-merge HEAD to BASE
 on:
   pull_request_target:
     branches:
-    - branch-23.12
+    - branch-24.02
     types: [closed]
 
 jobs:
@@ -29,14 +29,14 @@ jobs:
     steps:
       - uses: actions/checkout@v3
         with:
-          ref: branch-23.12 # force to fetch from latest upstream instead of PR ref
+          ref: branch-24.02 # force to fetch from latest upstream instead of PR ref
 
       - name: auto-merge job
         uses: ./.github/workflows/auto-merge
         env:
           OWNER: NVIDIA
           REPO_NAME: spark-rapids-ml
-          HEAD: branch-23.12
-          BASE: branch-24.02
+          HEAD: branch-24.02
+          BASE: branch-24.04
           AUTOMERGE_TOKEN: ${{ secrets.AUTOMERGE_TOKEN }} # use to merge PR
 
diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
@@ -42,11 +42,10 @@ jobs:
       rongou,\
       wjxiz1992,\
       GaryShen2008,\
-      pxLi,\
       NvTimLiu,\
       YanxuanLiu,\
       zhanga5,\
-      rishic3,\
+      Er1cCheng,\
       ', format('{0},', github.actor)) && github.event.comment.body == 'build'
     steps:
       - name: Check if comment is issued by authorized person

diff --git a/ci/Dockerfile b/ci/Dockerfile
@@ -37,6 +37,6 @@ RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86
     && conda config --set solver libmamba
 
 # install cuML
-ARG CUML_VER=23.12
+ARG CUML_VER=24.02
 RUN conda install -y -c rapidsai -c conda-forge -c nvidia cuml=$CUML_VER python=3.9 cuda-version=11.8 \
     && conda clean --all -f -y
diff --git a/ci/deploy.sh b/ci/deploy.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/ci/docs.sh b/ci/docs.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/ci/test.sh b/ci/test.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 #
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/docker/Dockerfile.pip b/docker/Dockerfile.pip
@@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.8.0
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
 
 ARG PYSPARK_VERSION=3.3.1
-ARG RAPIDS_VERSION=23.12.0
+ARG RAPIDS_VERSION=24.2.0
 ARG ARCH=amd64
 #ARG ARCH=arm64
 # Install packages to build spark-rapids-ml

diff --git a/docker/Dockerfile.python b/docker/Dockerfile.python
@@ -17,7 +17,7 @@
 ARG CUDA_VERSION=11.8.0
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
 
-ARG CUML_VERSION=23.12
+ARG CUML_VERSION=24.02
 
 # Install packages to build spark-rapids-ml
 RUN apt update -y \

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -7,9 +7,9 @@
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
 project = 'spark-rapids-ml'
-copyright = '2023, NVIDIA'
+copyright = '2024, NVIDIA'
 author = 'NVIDIA'
-release = '23.12.0'
+release = '24.02.0'
 
 # -- General configuration ---------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

diff --git a/jvm/README.md b/jvm/README.md
@@ -74,7 +74,7 @@ the _project root path_ with:
 cd jvm
 mvn clean package
 ```
-Then `rapids-4-spark-ml_2.12-23.12.0-SNAPSHOT.jar` will be generated under `target` folder.
+Then `rapids-4-spark-ml_2.12-24.02.0-SNAPSHOT.jar` will be generated under `target` folder.
 
 Users can also use the _release_ version of the spark-rapids plugin as the dependency if it has already been
 released to public maven repositories, see [rapids-4-spark maven repository](https://mvnrepository.com/artifact/com.nvidia/rapids-4-spark)
@@ -94,8 +94,8 @@ repository, usually in your `~/.m2/repository`.
 
 Add the artifact jar to the Spark, for example:
 ```bash
-ML_JAR="target/rapids-4-spark-ml_2.12-23.12.0-SNAPSHOT.jar"
-PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/23.12.0/rapids-4-spark_2.12-23.12.1.jar"
+ML_JAR="target/rapids-4-spark-ml_2.12-24.02.0-SNAPSHOT.jar"
+PLUGIN_JAR="~/.m2/repository/com/nvidia/rapids-4-spark_2.12/24.02.0/rapids-4-spark_2.12-24.02.0.jar"
 
 $SPARK_HOME/bin/spark-shell --master $SPARK_MASTER \
  --driver-memory 20G \

diff --git a/notebooks/aws-emr/init-bootstrap-action.sh b/notebooks/aws-emr/init-bootstrap-action.sh
@@ -8,7 +8,7 @@ sudo chmod a+rwx -R /sys/fs/cgroup/devices
 sudo yum install -y gcc openssl-devel bzip2-devel libffi-devel tar gzip wget make mysql-devel
 sudo bash -c "wget https://www.python.org/ftp/python/3.9.9/Python-3.9.9.tgz && tar xzf Python-3.9.9.tgz && cd Python-3.9.9 && ./configure --enable-optimizations && make altinstall"
 
-RAPIDS_VERSION=23.12.0
+RAPIDS_VERSION=24.2.0
 
 # install scikit-learn 
 sudo /usr/local/bin/pip3.9 install scikit-learn

diff --git a/notebooks/databricks/README.md b/notebooks/databricks/README.md
@@ -44,7 +44,7 @@ If you already have a Databricks account, you can run the example notebooks on a
       spark.task.resource.gpu.amount 1
       spark.databricks.delta.preview.enabled true
       spark.python.worker.reuse true
-      spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-23.12.1.jar:/databricks/spark/python
+      spark.executorEnv.PYTHONPATH /databricks/jars/rapids-4-spark_2.12-24.02.0.jar:/databricks/spark/python
       spark.sql.execution.arrow.maxRecordsPerBatch 100000
       spark.rapids.memory.gpu.minAllocFraction 0.0001
       spark.plugins com.nvidia.spark.SQLPlugin

diff --git a/notebooks/databricks/init-pip-cuda-11.8.sh b/notebooks/databricks/init-pip-cuda-11.8.sh
@@ -4,8 +4,8 @@ SPARK_RAPIDS_ML_ZIP=/dbfs/path/to/zip/file
 # IMPORTANT: specify RAPIDS_VERSION in full, e.g. 23.10.0 and not 23.10
 # Also, in general, RAPIDS_VERSION (python) should omit any leading 0 in the month/minor field (e.g. 23.8.0 and not 23.08.0),
 # while SPARK_RAPIDS_VERSION (jar) should keep the leading 0 in the month/minor field (e.g. 23.08.2 and not 23.8.2)
-RAPIDS_VERSION=23.12.0
-SPARK_RAPIDS_VERSION=23.12.1
+RAPIDS_VERSION=24.2.0
+SPARK_RAPIDS_VERSION=24.02.0
 
 curl -L https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/${SPARK_RAPIDS_VERSION}/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}-cuda11.jar -o /databricks/jars/rapids-4-spark_2.12-${SPARK_RAPIDS_VERSION}.jar
 

diff --git a/notebooks/dataproc/README.md b/notebooks/dataproc/README.md
@@ -29,7 +29,7 @@ If you already have a Dataproc account, you can run the example notebooks on a D
 - Create a cluster with at least two single-gpu workers.  **Note**: in addition to the initialization script from above, this also uses the standard [initialization actions](https://github.com/GoogleCloudDataproc/initialization-actions) for installing the GPU drivers and RAPIDS:
   ```
   export CUDA_VERSION=11.8
-  export RAPIDS_VERSION=23.12.0
+  export RAPIDS_VERSION=24.2.0
 
   gcloud dataproc clusters create $USER-spark-rapids-ml \
   --image-version=2.1-ubuntu \

diff --git a/notebooks/dataproc/spark_rapids_ml.sh b/notebooks/dataproc/spark_rapids_ml.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-RAPIDS_VERSION=23.12.0
+RAPIDS_VERSION=24.2.0
 
 # patch existing packages
 mamba install "llvmlite<0.40,>=0.39.0dev0" "numba>=0.56.2"

diff --git a/notebooks/logistic-regression.ipynb b/notebooks/logistic-regression.ipynb
@@ -647,6 +647,124 @@
     "else:\n",
     "    print(f\"logLoss: {evaluator.evaluate(cpu_transformed_df)}\")"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Sparse Vectors"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: `standardization` must be set to `False` when fitting on sparse vectors for now. This will be fixed in 24.02."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# prepare dataframe\n",
+    "\n",
+    "from pyspark.ml.feature import CountVectorizer, RegexTokenizer\n",
+    "from pyspark.sql import Row\n",
+    "from sklearn.datasets import fetch_20newsgroups\n",
+    "\n",
+    "try:\n",
+    "    twenty_train = fetch_20newsgroups(subset=\"train\", shuffle=True, random_state=42)\n",
+    "except:\n",
+    "    print(\"Error fetching 20 newsgroup dataset\")\n",
+    "X = twenty_train.data\n",
+    "y = twenty_train.target.tolist()\n",
+    "\n",
+    "data = [\n",
+    "    Row(\n",
+    "        label=y[i],\n",
+    "        weight=1.0,\n",
+    "        text=X[i],\n",
+    "    )\n",
+    "    for i in range(len(X))\n",
+    "]\n",
+    "\n",
+    "df = spark.createDataFrame(data)\n",
+    "\n",
+    "# convert text to sparse vector\n",
+    "tokenizer = RegexTokenizer(inputCol=\"text\", outputCol=\"tokens\")\n",
+    "df = tokenizer.transform(df)\n",
+    "cv = CountVectorizer(inputCol=\"tokens\", outputCol=\"features\")\n",
+    "cv_model = cv.fit(df)\n",
+    "df = cv_model.transform(df)\n",
+    "\n",
+    "df_train, df_test = df.randomSplit([0.8, 0.2], seed=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def sparse_vectors_compat(EstimatorClass):\n",
+    "    from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
+    "\n",
+    "    lr = EstimatorClass(\n",
+    "        regParam=0.01,\n",
+    "        maxIter=100,\n",
+    "        fitIntercept=True,\n",
+    "        standardization=False,\n",
+    "        featuresCol=\"features\",\n",
+    "        labelCol=\"label\",\n",
+    "    )\n",
+    "\n",
+    "    # fit and transform\n",
+    "    start_time = time.time()\n",
+    "    model = lr.fit(df_train)\n",
+    "    fit_time = time.time() - start_time\n",
+    "\n",
+    "    trainsformed_df_test = model.transform(df_test)\n",
+    "\n",
+    "    # evaluate\n",
+    "    evaluator = (\n",
+    "        MulticlassClassificationEvaluator()\n",
+    "        .setPredictionCol(model.getPredictionCol())\n",
+    "        .setProbabilityCol(model.getProbabilityCol())\n",
+    "        .setLabelCol(model.getLabelCol())\n",
+    "    )\n",
+    "    \n",
+    "    evaluator.setMetricName(\"logLoss\")\n",
+    "    test_logLoss = evaluator.evaluate(trainsformed_df_test)\n",
+    "    return (lr, model, fit_time, test_logLoss)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from spark_rapids_ml.classification import LogisticRegression as GPULR\n",
+    "gpu_lr, gpu_model, gpu_fit_time, gpu_test_logLoss = sparse_vectors_compat(GPULR)\n",
+    "print(f\"GPU fit took: {gpu_fit_time} sec\")\n",
+    "print(f\"GPU training objective: {gpu_model.objective}\")\n",
+    "print(f\"GPU test logLoss: {gpu_test_logLoss}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pyspark.ml.classification import LogisticRegression as CPULR\n",
+    "cpu_lr, cpu_model, cpu_fit_time, cpu_test_logLoss = sparse_vectors_compat(CPULR)\n",
+    "print(f\"CPU fit took: {cpu_fit_time} sec\")\n",
+    "print(f\"CPU training objective: {cpu_model.summary.objectiveHistory[-1]}\")\n",
+    "print(f\"CPU test logLoss: {cpu_test_logLoss}\")"
+   ]
   }
  ],
  "metadata": {

diff --git a/python/README.md b/python/README.md
@@ -8,9 +8,9 @@ For simplicity, the following instructions just use Spark local mode, assuming a
 
 First, install RAPIDS cuML per [these instructions](https://rapids.ai/start.html).   Example for CUDA Toolkit 11.8:
 ```bash
-conda create -n rapids-23.12 \
+conda create -n rapids-24.02 \
     -c rapidsai -c conda-forge -c nvidia \
-    cuml=23.12 python=3.9 cuda-version=11.8
+    cuml=24.02 python=3.9 cuda-version=11.8
 ```
 
 **Note**: while testing, we recommend using conda or docker to simplify installation and isolate your environment while experimenting.  Once you have a working environment, you can then try installing directly, if necessary.
@@ -19,7 +19,7 @@ conda create -n rapids-23.12 \
 
 Once you have the conda environment, activate it and install the required packages.
 ```bash
-conda activate rapids-23.12
+conda activate rapids-24.02
 
 ## for development access to notebooks, tests, and benchmarks
 git clone --branch main https://github.com/NVIDIA/spark-rapids-ml.git

diff --git a/python/benchmark/benchmark/base.py b/python/benchmark/benchmark/base.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/benchmark/benchmark/bench_kmeans.py b/python/benchmark/benchmark/bench_kmeans.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/benchmark/benchmark/bench_linear_regression.py b/python/benchmark/benchmark/bench_linear_regression.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/benchmark/benchmark/bench_logistic_regression.py b/python/benchmark/benchmark/bench_logistic_regression.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/benchmark/benchmark/bench_nearest_neighbors.py b/python/benchmark/benchmark/bench_nearest_neighbors.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/benchmark/benchmark/bench_pca.py b/python/benchmark/benchmark/bench_pca.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/benchmark/benchmark/bench_random_forest.py b/python/benchmark/benchmark/bench_random_forest.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/benchmark/benchmark/bench_umap.py b/python/benchmark/benchmark/bench_umap.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/python/benchmark/benchmark_runner.py b/python/benchmark/benchmark_runner.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.