@@ -772,16 +772,24 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
 		return err
 	}
 
-	// Delete all workers if worker group is suspended and skip reconcile
+	// Delete all workers if worker group is suspended and skip reconcile if enableInTreeAutoscaling is not enabled.
+	enableInTreeAutoscaling := (instance.Spec.EnableInTreeAutoscaling != nil) && (*instance.Spec.EnableInTreeAutoscaling)
 	if worker.Suspend != nil && *worker.Suspend {
-		if _, err := r.deleteAllPods(ctx, common.RayClusterGroupPodsAssociationOptions(instance, worker.GroupName)); err != nil {
-			r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.FailedToDeleteWorkerPodCollection),
-				"Failed deleting worker Pods for suspended group %s in RayCluster %s/%s, %v", worker.GroupName, instance.Namespace, instance.Name, err)
-			return errstd.Join(utils.ErrFailedDeleteWorkerPod, err)
+		if enableInTreeAutoscaling {
+			// TODO: This can be supported in future Ray. We should check the RayVersion on the CR once we know the future version.
+			r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.InvalidRayClusterStatus),
+				"Suspending the worker group %s is not supported in RayCluster %s/%s because its Autoscaler is enabled", worker.GroupName, instance.Namespace, instance.Name)
+			continue
+		} else {
+			if _, err := r.deleteAllPods(ctx, common.RayClusterGroupPodsAssociationOptions(instance, worker.GroupName)); err != nil {
+				r.Recorder.Eventf(instance, corev1.EventTypeWarning, string(utils.FailedToDeleteWorkerPodCollection),
+					"Failed deleting worker Pods for suspended group %s in RayCluster %s/%s, %v", worker.GroupName, instance.Namespace, instance.Name, err)
+				return errstd.Join(utils.ErrFailedDeleteWorkerPod, err)
+			}
+			r.Recorder.Eventf(instance, corev1.EventTypeNormal, string(utils.DeletedWorkerPod),
+				"Deleted all pods for suspended worker group %s in RayCluster %s/%s", worker.GroupName, instance.Namespace, instance.Name)
+			continue
 		}
-		r.Recorder.Eventf(instance, corev1.EventTypeNormal, string(utils.DeletedWorkerPod),
-			"Deleted all pods for suspended worker group %s in RayCluster %s/%s", worker.GroupName, instance.Namespace, instance.Name)
-		continue
 	}
 
 	// Delete unhealthy worker Pods.
@@ -869,8 +877,6 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
 	// diff < 0 indicates the need to delete some Pods to match the desired number of replicas. However,
 	// randomly deleting Pods is certainly not ideal. So, if autoscaling is enabled for the cluster, we
 	// will disable random Pod deletion, making Autoscaler the sole decision-maker for Pod deletions.
-	enableInTreeAutoscaling := (instance.Spec.EnableInTreeAutoscaling != nil) && (*instance.Spec.EnableInTreeAutoscaling)
-
 	// TODO (kevin85421): `enableRandomPodDelete` is a feature flag for KubeRay v0.6.0. If users want to use
 	// the old behavior, they can set the environment variable `ENABLE_RANDOM_POD_DELETE` to `true`. When the
 	// default behavior is stable enough, we can remove this feature flag.
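In short, the commit hoists `enableInTreeAutoscaling` up so that suspending a worker group is gated on the Autoscaler: Pods are deleted only when the in-tree Autoscaler is disabled, otherwise the group is skipped with a warning event. A minimal, self-contained sketch of that gating — the `WorkerGroup`, `ClusterSpec`, and `reconcileSuspend` names below are simplified stand-ins for illustration, not the actual rayv1 API types or reconciler:

```go
// Sketch of the suspend-vs-autoscaler gating introduced by this commit.
// The types here are hypothetical stand-ins, not the rayv1 API.
package main

import "fmt"

type WorkerGroup struct {
	GroupName string
	Suspend   *bool
}

type ClusterSpec struct {
	EnableInTreeAutoscaling *bool
	WorkerGroups            []WorkerGroup
}

// reconcileSuspend mirrors the per-group decision: warn and skip when the
// Autoscaler is enabled, delete the group's Pods when it is not, and fall
// through to normal reconciliation for groups that are not suspended.
func reconcileSuspend(spec ClusterSpec) {
	enableInTreeAutoscaling := spec.EnableInTreeAutoscaling != nil && *spec.EnableInTreeAutoscaling
	for _, worker := range spec.WorkerGroups {
		if worker.Suspend != nil && *worker.Suspend {
			if enableInTreeAutoscaling {
				fmt.Printf("warn: suspending group %s is unsupported while the Autoscaler is enabled\n", worker.GroupName)
				continue
			}
			fmt.Printf("deleting all Pods for suspended group %s\n", worker.GroupName)
			continue
		}
		fmt.Printf("reconciling group %s normally\n", worker.GroupName)
	}
}

func main() {
	t := true
	reconcileSuspend(ClusterSpec{
		EnableInTreeAutoscaling: &t,
		WorkerGroups:            []WorkerGroup{{GroupName: "gpu-group", Suspend: &t}},
	})
}
```

The second hunk is just the other half of the hoist: the same `enableInTreeAutoscaling` computation is removed from the later `diff < 0` deletion path, which now reuses the variable computed before the suspend check.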