
Commit 4dc2291

Merge branch 'master' into e2eautoscaler-deflaky-dead-actor-resources
2 parents: 331d0a9 + 05b77e1


54 files changed: +7704 -6706 lines

.buildkite/test-e2e.yml

Lines changed: 24 additions & 4 deletions
````diff
--- a/.buildkite/test-e2e.yml
+++ b/.buildkite/test-e2e.yml
@@ -38,7 +38,7 @@
     - KUBERAY_TEST_TIMEOUT_SHORT=1m KUBERAY_TEST_TIMEOUT_MEDIUM=5m KUBERAY_TEST_TIMEOUT_LONG=10m go test -timeout 30m -v ./test/e2erayservice 2>&1 | awk -f ../.buildkite/format.awk | tee $$KUBERAY_TEST_OUTPUT_DIR/gotest.log || (kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee $$KUBERAY_TEST_OUTPUT_DIR/kuberay-operator.log && cd $$KUBERAY_TEST_OUTPUT_DIR && find . -name "*.log" | tar -cf /artifact-mount/e2e-rayservice-log.tar -T - && exit 1)
     - echo "--- END:e2e rayservice (nightly operator) tests finished"
 
-- label: 'Test Autoscaler E2E (nightly operator)'
+- label: 'Test Autoscaler E2E Part 1 (nightly operator)'
   instance_size: large
   image: golang:1.24
   commands:
@@ -50,13 +50,33 @@
     - bash ../.buildkite/build-start-operator.sh
     - kubectl wait --timeout=90s --for=condition=Available=true deployment kuberay-operator
     # Run e2e tests and print KubeRay operator logs if tests fail
-    - echo "--- START:Running Autoscaler e2e (nightly operator) tests"
+    - echo "--- START:Running Autoscaler E2E Part 1 (nightly operator) tests"
     - if [ -n "${KUBERAY_TEST_RAY_IMAGE}"]; then echo "Using Ray Image ${KUBERAY_TEST_RAY_IMAGE}"; fi
     - set -o pipefail
     - mkdir -p "$(pwd)/tmp" && export KUBERAY_TEST_OUTPUT_DIR=$(pwd)/tmp
     - echo "KUBERAY_TEST_OUTPUT_DIR=$$KUBERAY_TEST_OUTPUT_DIR"
-    - KUBERAY_TEST_TIMEOUT_SHORT=1m KUBERAY_TEST_TIMEOUT_MEDIUM=5m KUBERAY_TEST_TIMEOUT_LONG=10m go test -timeout 60m -v ./test/e2eautoscaler 2>&1 | awk -f ../.buildkite/format.awk | tee $$KUBERAY_TEST_OUTPUT_DIR/gotest.log || (kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee $$KUBERAY_TEST_OUTPUT_DIR/kuberay-operator.log && cd $$KUBERAY_TEST_OUTPUT_DIR && find . -name "*.log" | tar -cf /artifact-mount/e2e-autoscaler-log.tar -T - && exit 1)
-    - echo "--- END:Autoscaler e2e (nightly operator) tests finished"
+    - KUBERAY_TEST_TIMEOUT_SHORT=1m KUBERAY_TEST_TIMEOUT_MEDIUM=5m KUBERAY_TEST_TIMEOUT_LONG=10m go test -timeout 60m -v ./test/e2eautoscaler/raycluster_autoscaler_test.go ./test/e2eautoscaler/support.go 2>&1 | awk -f ../.buildkite/format.awk | tee $$KUBERAY_TEST_OUTPUT_DIR/gotest.log || (kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee $$KUBERAY_TEST_OUTPUT_DIR/kuberay-operator.log && cd $$KUBERAY_TEST_OUTPUT_DIR && find . -name "*.log" | tar -cf /artifact-mount/e2e-autoscaler-log.tar -T - && exit 1)
+    - echo "--- END:Autoscaler E2E Part 1 (nightly operator) tests finished"
+
+- label: 'Test Autoscaler E2E Part 2 (nightly operator)'
+  instance_size: large
+  image: golang:1.24
+  commands:
+    - source .buildkite/setup-env.sh
+    - kind create cluster --wait 900s --config ./ci/kind-config-buildkite.yml
+    - kubectl config set clusters.kind-kind.server https://docker:6443
+    # Build nightly KubeRay operator image
+    - pushd ray-operator
+    - bash ../.buildkite/build-start-operator.sh
+    - kubectl wait --timeout=90s --for=condition=Available=true deployment kuberay-operator
+    # Run e2e tests and print KubeRay operator logs if tests fail
+    - echo "--- START:Running Autoscaler E2E Part 2 (nightly operator) tests"
+    - if [ -n "${KUBERAY_TEST_RAY_IMAGE}"]; then echo "Using Ray Image ${KUBERAY_TEST_RAY_IMAGE}"; fi
+    - set -o pipefail
+    - mkdir -p "$(pwd)/tmp" && export KUBERAY_TEST_OUTPUT_DIR=$(pwd)/tmp
+    - echo "KUBERAY_TEST_OUTPUT_DIR=$$KUBERAY_TEST_OUTPUT_DIR"
+    - KUBERAY_TEST_TIMEOUT_SHORT=1m KUBERAY_TEST_TIMEOUT_MEDIUM=5m KUBERAY_TEST_TIMEOUT_LONG=10m go test -timeout 60m -v ./test/e2eautoscaler/raycluster_autoscaler_part2_test.go ./test/e2eautoscaler/support.go 2>&1 | awk -f ../.buildkite/format.awk | tee $$KUBERAY_TEST_OUTPUT_DIR/gotest.log || (kubectl logs --tail -1 -l app.kubernetes.io/name=kuberay | tee $$KUBERAY_TEST_OUTPUT_DIR/kuberay-operator.log && cd $$KUBERAY_TEST_OUTPUT_DIR && find . -name "*.log" | tar -cf /artifact-mount/e2e-autoscaler-log.tar -T - && exit 1)
+    - echo "--- END:Autoscaler E2E Part 2 (nightly operator) tests finished"
 
 - label: 'Test E2E Operator Version Upgrade (v1.3.0)'
   instance_size: large
````
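The split relies on a property of `go test` worth noting: when it is given explicit file arguments rather than a package path, it compiles only the listed files, which is why `support.go` (the shared test helpers) is passed alongside each part's test file. A minimal sketch of reproducing the two invocations locally, assuming a kind cluster with the nightly operator already running as the CI steps above arrange:

```sh
# Sketch: run the two autoscaler e2e parts locally, mirroring the CI split.
# `go test` with file arguments compiles only those files, so support.go
# must accompany each part. Paths are relative to the ray-operator directory.
cd ray-operator

# Part 1
go test -timeout 60m -v \
  ./test/e2eautoscaler/raycluster_autoscaler_test.go \
  ./test/e2eautoscaler/support.go

# Part 2
go test -timeout 60m -v \
  ./test/e2eautoscaler/raycluster_autoscaler_part2_test.go \
  ./test/e2eautoscaler/support.go
```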

.github/workflows/image-release.yaml

Lines changed: 2 additions & 1 deletion
````diff
--- a/.github/workflows/image-release.yaml
+++ b/.github/workflows/image-release.yaml
@@ -213,7 +213,8 @@ jobs:
         run: echo "::set-output name=sha_short::$(git rev-parse --short HEAD)"
 
       - name: Set up Docker
-        uses: docker-practice/actions-setup-docker@master
+        uses: docker/setup-docker-action@v4
+
 
       - name: Log in to Quay.io
         uses: docker/login-action@v2
````

apiserver/Autoscaling.md

Lines changed: 103 additions & 112 deletions
````diff
--- a/apiserver/Autoscaling.md
+++ b/apiserver/Autoscaling.md
@@ -1,180 +1,171 @@
-# Creating Autoscaling clusters using API server
+# Creating Autoscaling clusters using APIServer
 
-One of the fundamental features of Ray is autoscaling. This [document] describes how to set up
-autoscaling using Ray operator. Here we will describe how to set it up using API server.
+One of Ray's key features is autoscaling. This [document] explains how to set up autoscaling
+with the Ray operator. Here, we demonstrate how to configure it using the APIServer and
+run an example.
 
-## Deploy KubeRay operator and API server
+## Setup
 
-Refer to [readme](README.md) for setting up KubRay operator and API server.
+Refer to the [README](README.md) for setting up the KubeRay operator and APIServer.
 
-```shell
-make operator-image cluster load-operator-image deploy-operator
+## Example
+
+This example walks through how to trigger scale-up and scale-down for RayCluster.
+
+Before proceeding with the example, remove any running RayClusters to ensure a successful
+execution of the steps below.
+
+```sh
+kubectl delete raycluster --all
 ```
 
-Alternatively, you could build and deploy the Operator and API server from local repo for
-development purpose.
+> [!IMPORTANT]
+> All the following guidance requires you to switch your working directory to the KubeRay `apiserver`
+
+### Install ConfigMap
 
-```shell
-make operator-image cluster load-operator-image deploy-operator docker-image load-image deploy
+Install this [ConfigMap], which contains the code for our example. Simply download
+the file and run:
+
+```sh
+kubectl apply -f test/cluster/cluster/detachedactor.yaml
 ```
 
-Additionally install this [ConfigMap] containing code that we will use for testing.
+Check if the ConfigMap is successfully created. You should see `ray-example` in the list:
+
+```sh
+kubectl get configmaps
+# NAME          DATA   AGE
+# ray-example   2      8s
+```
 
-## Deploy Ray cluster
+### Deploy RayCluster
 
-Once they are set up, you first need to create a Ray cluster using the following commands:
+Before running the example, deploy a RayCluster with the following command:
 
-```shell
+```sh
+# Create compute template
 curl -X POST 'localhost:31888/apis/v1/namespaces/default/compute_templates' \
   --header 'Content-Type: application/json' \
-  --data '{
-    "name": "default-template",
-    "namespace": "default",
-    "cpu": 2,
-    "memory": 4
-  }'
+  --data @docs/api-example/compute_template.json
+
+# Create RayCluster
 curl -X POST 'localhost:31888/apis/v1/namespaces/default/clusters' \
   --header 'Content-Type: application/json' \
-  --data '{
-    "name": "test-cluster",
-    "namespace": "default",
-    "user": "boris",
-    "clusterSpec": {
-      "enableInTreeAutoscaling": true,
-      "autoscalerOptions": {
-        "upscalingMode": "Default",
-        "idleTimeoutSeconds": 30,
-        "cpu": "500m",
-        "memory": "512Mi"
-      },
-      "headGroupSpec": {
-        "computeTemplate": "default-template",
-        "image": "rayproject/ray:2.9.0-py310",
-        "serviceType": "NodePort",
-        "rayStartParams": {
-          "dashboard-host": "0.0.0.0",
-          "metrics-export-port": "8080",
-          "num-cpus": "0"
-        },
-        "volumes": [
-          {
-            "name": "code-sample",
-            "mountPath": "/home/ray/samples",
-            "volumeType": "CONFIGMAP",
-            "source": "ray-example",
-            "items": {
-              "detached_actor.py": "detached_actor.py",
-              "terminate_detached_actor.py": "terminate_detached_actor.py"
-            }
-          }
-        ]
-      },
-      "workerGroupSpec": [
-        {
-          "groupName": "small-wg",
-          "computeTemplate": "default-template",
-          "image": "rayproject/ray:2.9.0-py310",
-          "replicas": 0,
-          "minReplicas": 0,
-          "maxReplicas": 5,
-          "rayStartParams": {
-            "node-ip-address": "$MY_POD_IP"
-          },
-          "volumes": [
-            {
-              "name": "code-sample",
-              "mountPath": "/home/ray/samples",
-              "volumeType": "CONFIGMAP",
-              "source": "ray-example",
-              "items": {
-                "detached_actor.py": "detached_actor.py",
-                "terminate_detached_actor.py": "terminate_detached_actor.py"
-              }
-            }
-          ]
-        }
-      ]
-    }
-  }'
+  --data @docs/api-example/autoscaling_clusters.json
 ```
 
-## Validate that Ray cluster is deployed correctly
+This command performs two main operations:
 
-Run:
+1. Creates a compute template `default-template` that specifies resources to use during
+   scale-up (2 CPUs and 4 GiB memory).
 
-```shell
-kubectl get pods
-```
+2. Deploys a RayCluster (test-cluster) with:
+   - A head pod that manages the cluster
+   - A worker group configured to scale between 0 and 5 replicas
+
+The worker group uses the following autoscalerOptions to control scaling behavior:
+
+- **`upscalingMode: "Default"`**: Default scaling behavior. Ray will scale up only as
+  needed.
+- **`idleTimeoutSeconds: 30`**: If a worker pod remains idle (i.e., not running any tasks)
+  for 30 seconds, it will be automatically removed.
+- **`cpu: "500m"`, `memory: "512Mi"`**: Defines the **minimum resource unit** Ray uses to
+  assess scaling needs. If no worker pod has at least this much free capacity, Ray will
+  trigger a scale-up and launch a new worker pod.
+
+> **Note:** These values **do not determine the actual size** of the worker pod. The
+> pod size comes from the `computeTemplate` (in this case, 2 CPUs and 4 GiB memory).
 
-You should get something like this:
+### Validate that RayCluster is deployed correctly
 
-```shell
-test-cluster-head-pr25j 2/2 Running 0 2m49s
+Run the following command to get a list of pods running. You should see something like below:
+
+```sh
+kubectl get pods
+# NAME                                READY   STATUS    RESTARTS   AGE
+# kuberay-operator-545586d46c-f9grr   1/1     Running   0          49m
+# test-cluster-head                   2/2     Running   0          3m1s
 ```
 
-Note that only head pod is running and it has 2 containers
+Note that there is no worker for `test-cluster` as we set its initial replicas to 0. You
+will only see head pod with 2 containers for `test-cluster`.
 
-## Trigger RayCluster scale-up
+### Trigger RayCluster scale-up
 
-Create a detached actor:
+Create a detached actor to trigger scale-up with the following command:
 
 ```sh
 curl -X POST 'localhost:31888/apis/v1/namespaces/default/jobs' \
   --header 'Content-Type: application/json' \
   --data '{
     "name": "create-actor",
     "namespace": "default",
-    "user": "boris",
+    "user": "kuberay",
     "entrypoint": "python /home/ray/samples/detached_actor.py actor1",
     "clusterSelector": {
       "ray.io/cluster": "test-cluster"
     }
   }'
 ```
 
-Because we have specified `num_cpu: 0` for head node, this will cause creation of a worker node. Run:
+The `detached_actor.py` file is defined in the [ConfigMap] we installed earlier and
+mounted to the head node, which requires `num_cpus=1`. Recall that initially there is no
+worker pod exists, RayCluster needs to scale up a worker for running this actor.
 
-```shell
-kubectl get pods
-```
+Check if a worker is created. You should see a worker `test-cluster-small-wg-worker` spin
+up.
 
-You should get something like this:
+```sh
+kubectl get pods
 
-```shell
-test-cluster-head-pr25j 2/2 Running 0 15m
-test-cluster-worker-small-wg-qrjfm 1/1 Running 0 2m48s
+# NAME                                 READY   STATUS      RESTARTS   AGE
+# create-actor-tsvfc                   0/1     Completed   0          99s
+# kuberay-operator-545586d46c-f9grr    1/1     Running     0          55m
+# test-cluster-head                    2/2     Running     0          9m37s
+# test-cluster-small-wg-worker-j54xf   1/1     Running     0          88s
 ```
 
-You can see that a worker node have been created.
+### Trigger RayCluster scale-down
 
-## Trigger RayCluster scale-down
-
-Run:
+Run the following command to delete the actor we created earlier:
 
 ```sh
 curl -X POST 'localhost:31888/apis/v1/namespaces/default/jobs' \
   --header 'Content-Type: application/json' \
   --data '{
     "name": "delete-actor",
     "namespace": "default",
-    "user": "boris",
+    "user": "kuberay",
     "entrypoint": "python /home/ray/samples/terminate_detached_actor.py actor1",
     "clusterSelector": {
       "ray.io/cluster": "test-cluster"
     }
   }'
 ```
 
-A worker Pod will be deleted after `idleTimeoutSeconds` (default 60s, we specified 30) seconds. Run:
+Once the actor is deleted, the worker is no longer needed. The worker pod will be deleted
+after `idleTimeoutSeconds` (default 60; we specified 30) seconds.
+
+List all pods to verify that the worker pod is deleted:
 
-```shell
+```sh
 kubectl get pods
+
+# NAME                                READY   STATUS      RESTARTS   AGE
+# create-actor-tsvfc                  0/1     Completed   0          6m37s
+# delete-actor-89z8c                  0/1     Completed   0          83s
+# kuberay-operator-545586d46c-f9grr   1/1     Running     0          60m
+# test-cluster-head                   2/2     Running     0          14m
+
 ```
 
-And you should see only head node (worker node is deleted)
+### Clean up
 
-```shell
-test-cluster-head-pr25j 2/2 Running 0 27m
+```sh
+make clean-cluster
+# Remove apiserver from helm
+helm uninstall kuberay-apiserver
 ```
 
 [document]: https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/configuring-autoscaling.html
````
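The diff above moves the inline request payloads into files under `docs/api-example/`. Assuming those files simply mirror the inline JSON deleted in this commit (not verified against the repository), the file-based requests are equivalent to the following sketch; values are taken from the removed lines, and the cluster payload is abridged:

```sh
# Sketch: inline equivalents of the new file-based requests, assuming
# compute_template.json and autoscaling_clusters.json mirror the payloads
# removed in this diff.

# Compute template: the actual pod size used when the autoscaler scales up.
curl -X POST 'localhost:31888/apis/v1/namespaces/default/compute_templates' \
  --header 'Content-Type: application/json' \
  --data '{
    "name": "default-template",
    "namespace": "default",
    "cpu": 2,
    "memory": 4
  }'

# RayCluster: autoscaling enabled, head contributes no CPUs ("num-cpus": "0"),
# worker group scales between 0 and 5 replicas. Abridged: the volumes mounting
# the ray-example ConfigMap are omitted here but present in the removed payload.
curl -X POST 'localhost:31888/apis/v1/namespaces/default/clusters' \
  --header 'Content-Type: application/json' \
  --data '{
    "name": "test-cluster",
    "namespace": "default",
    "user": "boris",
    "clusterSpec": {
      "enableInTreeAutoscaling": true,
      "autoscalerOptions": {
        "upscalingMode": "Default",
        "idleTimeoutSeconds": 30,
        "cpu": "500m",
        "memory": "512Mi"
      },
      "headGroupSpec": {
        "computeTemplate": "default-template",
        "image": "rayproject/ray:2.9.0-py310",
        "serviceType": "NodePort",
        "rayStartParams": {
          "dashboard-host": "0.0.0.0",
          "metrics-export-port": "8080",
          "num-cpus": "0"
        }
      },
      "workerGroupSpec": [
        {
          "groupName": "small-wg",
          "computeTemplate": "default-template",
          "image": "rayproject/ray:2.9.0-py310",
          "replicas": 0,
          "minReplicas": 0,
          "maxReplicas": 5,
          "rayStartParams": {
            "node-ip-address": "$MY_POD_IP"
          }
        }
      ]
    }
  }'
```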
