-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Description
Description
When using Volcano hierarchical queues, SparkApplications submitted by the Spark Operator cannot reclaim resources across sibling queues.
I would like to confirm whether this is a configuration problem on my side or a bug.
I need help, any suggestions would be greatly appreciated.
Thank you very much.
Steps to reproduce the issue
- The Volcano scheduler ConfigMap is:
kubectl get configmaps -n volcano-system volcano-scheduler-configmap -o yaml
apiVersion: v1
data:
volcano-scheduler.conf: |
actions: "enqueue, allocate, preempt, reclaim, backfill"
tiers:
- plugins:
- name: priority
- name: gang
enablePreemptable: false
- name: conformance
- plugins:
- name: overcommit
- name: drf
enablePreemptable: false
- name: predicates
- name: capacity
enabledAllocatable: true
enablePreemptive: true
enableReclaimable: true
enableQueueOrder: true
enableHierarchy: true
- name: nodeorder
- name: binpack
kind: ConfigMap
metadata:
annotations:
meta.helm.sh/release-name: volcano
meta.helm.sh/release-namespace: volcano-system
creationTimestamp: "2025-07-22T07:47:05Z"
labels:
app.kubernetes.io/managed-by: Helm
name: volcano-scheduler-configmap
namespace: volcano-system
resourceVersion: "51322036"
uid: b77c408c-5873-4858-bcca-955b093bf750
- queue
Create 3 queues: queue-my-root, child-queue-a, child-queue-b
kubectl get queue queue-my-root -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"scheduling.volcano.sh/v1beta1","kind":"Queue","metadata":{"annotations":{},"name":"queue-my-root"},"spec":{"capability":{"cpu":"30","memory":"30Gi"},"deserved":{"cpu":30,"memory":"30Gi"},"parent":"root","reclaimable":true}}
creationTimestamp: "2025-09-25T10:14:21Z"
generation: 1
name: queue-my-root
resourceVersion: "50912420"
uid: 8e0e5d8f-0dd9-4fab-8b9b-527c08c21802
spec:
capability:
cpu: "30"
memory: 30Gi
deserved:
cpu: 30
memory: 30Gi
parent: root
reclaimable: true
weight: 1
status:
allocated:
cpu: "0"
memory: "0"
reservation: {}
state: Open
kubectl get queue child-queue-a -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"scheduling.volcano.sh/v1beta1","kind":"Queue","metadata":{"annotations":{},"name":"child-queue-a"},"spec":{"capability":{"cpu":"20","memory":"20Gi"},"deserved":{"cpu":12,"memory":"12Gi"},"parent":"queue-my-root","reclaimable":true}}
creationTimestamp: "2025-09-25T10:14:21Z"
generation: 1
name: child-queue-a
resourceVersion: "50912308"
uid: 7f2bcfd4-2786-4382-86c4-885125500a29
spec:
capability:
cpu: "20"
memory: 20Gi
deserved:
cpu: 12
memory: 12Gi
parent: queue-my-root
reclaimable: true
weight: 1
status:
allocated:
cpu: "0"
memory: "0"
reservation: {}
state: Open
kubectl get queue child-queue-b -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"scheduling.volcano.sh/v1beta1","kind":"Queue","metadata":{"annotations":{},"name":"child-queue-b"},"spec":{"capability":{"cpu":"30","memory":"30Gi"},"deserved":{"cpu":18,"memory":"18Gi"},"parent":"queue-my-root","reclaimable":true}}
creationTimestamp: "2025-09-25T10:14:21Z"
generation: 1
name: child-queue-b
resourceVersion: "50912418"
uid: ac8d9f53-dbc8-4dbd-90ad-ff745465f705
spec:
capability:
cpu: "30"
memory: 30Gi
deserved:
cpu: 18
memory: 18Gi
parent: queue-my-root
reclaimable: true
weight: 1
status:
allocated:
cpu: "0"
memory: "0"
reservation: {}
state: Open
- SparkApplication
3.1 Submit the first SparkApplication to the queue child-queue-a
kubectl apply -f app1.yaml
cat app1.yaml
apiVersion: sparkoperator.k8s.io/v1beta2
kind: SparkApplication
metadata:
name: spark-pi-1-demo
namespace: my-spark-operator-ha-apps-ns
spec:
type: Java
mode: cluster
image: harbor.demo-registry.cn/library/apache/spark:v3.4.1_arm_4
imagePullPolicy: IfNotPresent
mainClass: org.apache.spark.examples.SparkPi
mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.4.1.jar
batchScheduler: volcano
batchSchedulerOptions:
queue: child-queue-a
resources:
cpu: "16"
memory: "16Gi"
sparkConf:
spark.jars.ivy: "/tmp/.ivy"
spark.app.name: "spark-pi-1-demo"
spark.kubernetes.executor.podNamePrefix: "spark-pi-1-demo"
arguments:
- "500000"
sparkVersion: 3.4.1
driver:
labels:
version: 3.4.1
cores: 2
memory: 1664m
serviceAccount: my-spark-operator-ha-release1-spark
executor:
labels:
version: 3.4.1
instances: 7
cores: 2
memory: 1664m
3.2 Submit the second SparkApplication to the queue child-queue-b
kubectl apply -f app3.yaml
cat app3.yaml
apiVersion: sparkoperator.k8s.io/v1beta2
kind: SparkApplication
metadata:
name: spark-pi-3-demo
namespace: my-spark-operator-ha-apps-ns
spec:
type: Java
mode: cluster
image: harbor.demo-registry.cn/library/apache/spark:v3.4.1_arm_4
imagePullPolicy: IfNotPresent
mainClass: org.apache.spark.examples.SparkPi
mainApplicationFile: local:///opt/spark/examples/jars/spark-examples_2.12-3.4.1.jar
batchScheduler: volcano
batchSchedulerOptions:
queue: child-queue-b
resources:
cpu: "16"
memory: "16Gi"
sparkConf:
spark.jars.ivy: "/tmp/.ivy"
spark.app.name: "spark-pi-3-demo"
spark.kubernetes.executor.podNamePrefix: "spark-pi-3-demo"
arguments:
- "500000"
sparkVersion: 3.4.1
driver:
labels:
version: 3.4.1
cores: 2
memory: 1664m
serviceAccount: my-spark-operator-ha-release1-spark
executor:
labels:
version: 3.4.1
instances: 7
cores: 2
memory: 1664m
Describe the results you received and expected
- Received result
1.1 After submitting the first SparkApplication
The first SparkApplication is submitted to the queue child-queue-a successfully, and all of its pods are running.
kubectl get all -n my-spark-operator-ha-apps-ns -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
pod/spark-pi-1-demo-driver 1/1 Running 0 20s 10.xxx.xxx.24 218 <none> <none>
pod/spark-pi-1-demo-exec-1 1/1 Running 0 12s 10.xxx.xxx.178 225 <none> <none>
pod/spark-pi-1-demo-exec-2 1/1 Running 0 12s 10.xxx.xxx.145 219 <none> <none>
pod/spark-pi-1-demo-exec-3 1/1 Running 0 11s 10.xxx.xxx.25 218 <none> <none>
pod/spark-pi-1-demo-exec-4 1/1 Running 0 11s 10.xxx.xxx.143 224 <none> <none>
pod/spark-pi-1-demo-exec-5 1/1 Running 0 11s 10.xxx.xxx.179 225 <none> <none>
pod/spark-pi-1-demo-exec-6 1/1 Running 0 10s 10.xxx.xxx.146 219 <none> <none>
pod/spark-pi-1-demo-exec-7 1/1 Running 0 10s 10.xxx.xxx.26 218 <none> <none>
kubectl get queue child-queue-a -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"scheduling.volcano.sh/v1beta1","kind":"Queue","metadata":{"annotations":{},"name":"child-queue-a"},"spec":{"capability":{"cpu":"20","memory":"20Gi"},"deserved":{"cpu":12,"memory":"12Gi"},"parent":"queue-my-root","reclaimable":true}}
creationTimestamp: "2025-09-25T10:14:21Z"
generation: 1
name: child-queue-a
resourceVersion: "51338783"
uid: 7f2bcfd4-2786-4382-86c4-885125500a29
spec:
capability:
cpu: "20"
memory: 20Gi
deserved:
cpu: 12
memory: 12Gi
parent: queue-my-root
reclaimable: true
weight: 1
status:
allocated:
cpu: "16"
memory: 16Gi
pods: "8"
reservation: {}
state: Open
kubectl get queue child-queue-b -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"scheduling.volcano.sh/v1beta1","kind":"Queue","metadata":{"annotations":{},"name":"child-queue-b"},"spec":{"capability":{"cpu":"30","memory":"30Gi"},"deserved":{"cpu":18,"memory":"18Gi"},"parent":"queue-my-root","reclaimable":true}}
creationTimestamp: "2025-09-25T10:14:21Z"
generation: 1
name: child-queue-b
resourceVersion: "51337693"
uid: ac8d9f53-dbc8-4dbd-90ad-ff745465f705
spec:
capability:
cpu: "30"
memory: 30Gi
deserved:
cpu: 18
memory: 18Gi
parent: queue-my-root
reclaimable: true
weight: 1
status:
allocated:
cpu: "0"
memory: "0"
reservation: {}
state: Open
kubectl get podgroup -n my-spark-operator-ha-apps-ns spark-spark-pi-1-demo-pg -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: PodGroup
metadata:
creationTimestamp: "2025-10-10T10:06:58Z"
generation: 8
name: spark-spark-pi-1-demo-pg
namespace: my-spark-operator-ha-apps-ns
ownerReferences:
- apiVersion: sparkoperator.k8s.io/v1beta2
blockOwnerDeletion: true
controller: true
kind: SparkApplication
name: spark-pi-1-demo
uid: 44f44722-00c1-47e9-b0a8-9bd6445be656
resourceVersion: "51338871"
uid: 3d4c61e2-c11b-4f17-ba58-82beaf095ce2
spec:
minMember: 1
minResources:
cpu: "16"
memory: 16Gi
queue: child-queue-a
status:
conditions:
- lastTransitionTime: "2025-10-10T10:07:20Z"
reason: tasks in gang are ready to be scheduled
status: "True"
transitionID: d6e7b2a1-c330-4426-84be-dc279869dca7
type: Scheduled
phase: Running
running: 8
1.2 After submitting the second SparkApplication
However, the second SparkApplication stays pending and cannot reclaim resources from the queue child-queue-a.
kubectl get all -n my-spark-operator-ha-apps-ns -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
pod/spark-pi-1-demo-driver 1/1 Running 0 73s 10.xxx.xxx.24 218 <none> <none>
pod/spark-pi-1-demo-exec-1 1/1 Running 0 65s 10.xxx.xxx.178 225 <none> <none>
pod/spark-pi-1-demo-exec-2 1/1 Running 0 65s 10.xxx.xxx.145 219 <none> <none>
pod/spark-pi-1-demo-exec-3 1/1 Running 0 64s 10.xxx.xxx.25 218 <none> <none>
pod/spark-pi-1-demo-exec-4 1/1 Running 0 64s 10.xxx.xxx.143 224 <none> <none>
pod/spark-pi-1-demo-exec-5 1/1 Running 0 64s 10.xxx.xxx.179 225 <none> <none>
pod/spark-pi-1-demo-exec-6 1/1 Running 0 63s 10.xxx.xxx.146 219 <none> <none>
pod/spark-pi-1-demo-exec-7 1/1 Running 0 63s 10.xxx.xxx.26 218 <none> <none>
pod/spark-pi-3-demo-driver 0/1 Pending 0 3s <none> <none> <none> <none>
kubectl get queue child-queue-a -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"scheduling.volcano.sh/v1beta1","kind":"Queue","metadata":{"annotations":{},"name":"child-queue-a"},"spec":{"capability":{"cpu":"20","memory":"20Gi"},"deserved":{"cpu":12,"memory":"12Gi"},"parent":"queue-my-root","reclaimable":true}}
creationTimestamp: "2025-09-25T10:14:21Z"
generation: 1
name: child-queue-a
resourceVersion: "51338783"
uid: 7f2bcfd4-2786-4382-86c4-885125500a29
spec:
capability:
cpu: "20"
memory: 20Gi
deserved:
cpu: 12
memory: 12Gi
parent: queue-my-root
reclaimable: true
weight: 1
status:
allocated:
cpu: "16"
memory: 16Gi
pods: "8"
reservation: {}
state: Open
kubectl get queue child-queue-b -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: Queue
metadata:
annotations:
kubectl.kubernetes.io/last-applied-configuration: |
{"apiVersion":"scheduling.volcano.sh/v1beta1","kind":"Queue","metadata":{"annotations":{},"name":"child-queue-b"},"spec":{"capability":{"cpu":"30","memory":"30Gi"},"deserved":{"cpu":18,"memory":"18Gi"},"parent":"queue-my-root","reclaimable":true}}
creationTimestamp: "2025-09-25T10:14:21Z"
generation: 1
name: child-queue-b
resourceVersion: "51337693"
uid: ac8d9f53-dbc8-4dbd-90ad-ff745465f705
spec:
capability:
cpu: "30"
memory: 30Gi
deserved:
cpu: 18
memory: 18Gi
parent: queue-my-root
reclaimable: true
weight: 1
status:
allocated:
cpu: "0"
memory: "0"
reservation: {}
state: Open
kubectl get podgroup -n my-spark-operator-ha-apps-ns spark-spark-pi-1-demo-pg -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: PodGroup
metadata:
creationTimestamp: "2025-10-10T10:06:58Z"
generation: 8
name: spark-spark-pi-1-demo-pg
namespace: my-spark-operator-ha-apps-ns
ownerReferences:
- apiVersion: sparkoperator.k8s.io/v1beta2
blockOwnerDeletion: true
controller: true
kind: SparkApplication
name: spark-pi-1-demo
uid: 44f44722-00c1-47e9-b0a8-9bd6445be656
resourceVersion: "51338871"
uid: 3d4c61e2-c11b-4f17-ba58-82beaf095ce2
spec:
minMember: 1
minResources:
cpu: "16"
memory: 16Gi
queue: child-queue-a
status:
conditions:
- lastTransitionTime: "2025-10-10T10:07:20Z"
reason: tasks in gang are ready to be scheduled
status: "True"
transitionID: d6e7b2a1-c330-4426-84be-dc279869dca7
type: Scheduled
phase: Running
running: 8
kubectl get podgroup -n my-spark-operator-ha-apps-ns spark-spark-pi-3-demo-pg -o yaml
apiVersion: scheduling.volcano.sh/v1beta1
kind: PodGroup
metadata:
creationTimestamp: "2025-10-10T10:08:07Z"
generation: 2
name: spark-spark-pi-3-demo-pg
namespace: my-spark-operator-ha-apps-ns
ownerReferences:
- apiVersion: sparkoperator.k8s.io/v1beta2
blockOwnerDeletion: true
controller: true
kind: SparkApplication
name: spark-pi-3-demo
uid: 4b2bc8ef-42f6-4427-b8fb-6699e777ac98
resourceVersion: "51339167"
uid: 58058948-b87f-4ed5-bcb2-dc68b66a3b59
spec:
minMember: 1
minResources:
cpu: "16"
memory: 16Gi
queue: child-queue-b
status:
conditions:
- lastTransitionTime: "2025-10-10T10:08:13Z"
message: '1/1 tasks in gang unschedulable: pod group is not ready, 1 Pending,
1 minAvailable; Pending: 1 Unschedulable'
reason: NotEnoughResources
status: "True"
transitionID: 41d4926a-8f8f-4321-b405-0c2b799560b1
type: Unschedulable
phase: Pending
- Expected result
After submitting the second SparkApplication,
the second SparkApplication should be able to reclaim resources from the queue child-queue-a.
In the queue child-queue-a, there will be 1 driver running, 6 executors running, and 1 executor pending.
In the queue child-queue-b, there will be 1 driver running and 7 executors running.
What version of Volcano are you using?
1.12.1
Any other relevant information
Environment :
k8s 1.24.10
spark operator 2.2.0
spark 3.4.1
volcano 1.12.1
arch : arm