diff --git a/setup.RHOAI-v2.11/mlbatch-subscription.yaml b/setup.RHOAI-v2.11/mlbatch-subscription.yaml index 87d36b2..8625a11 100644 --- a/setup.RHOAI-v2.11/mlbatch-subscription.yaml +++ b/setup.RHOAI-v2.11/mlbatch-subscription.yaml @@ -158,7 +158,7 @@ data: burst: 100 #pprofBindAddress: :8082 waitForPodsReady: - enable: true + enable: false blockAdmission: false manageJobsWithoutQueueName: true #internalCertManagement: diff --git a/setup.k8s-v1.25/CLUSTER-SETUP.md b/setup.k8s-v1.25/CLUSTER-SETUP.md index dbd0993..9e9c0a7 100644 --- a/setup.k8s-v1.25/CLUSTER-SETUP.md +++ b/setup.k8s-v1.25/CLUSTER-SETUP.md @@ -46,7 +46,43 @@ kubectl patch deployment -n scheduler-plugins --type=json --patch-file setup.k8s ## Install Operators -TODO: *** UNDER CONSTRUCTION ** +Create the mlbatch-system namespace +```sh +kubectl create namespace mlbatch-system +``` + +Install the Kubeflow Training Operator +```sh +kubectl apply --server-side -k setup.k8s-v1.25/training-operator +``` + +Install the KubeRay Operator +```sh +kubectl apply --server-side -k setup.k8s-v1.25/kuberay +``` + +Install Kueue +```sh +kubectl apply --server-side -k setup.k8s-v1.25/kueue +``` + +Install the AppWrapper Operator +```sh +kubectl apply --server-side -k setup.k8s-v1.25/appwrapper +``` +The provided configuration differs from the default configuration of the +operators as follows: +- Kubeflow Training Operator: + - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, +- Kueue: + - `manageJobsWithoutQueueName` is enabled, + - `batch/job` integration is disabled, + - `waitForPodsReady` is disabled, +- AppWrapper operator: + - `userRBACAdmissionCheck` is disabled, + - `schedulerName` is set to `scheduler-plugins-scheduler`, + - `queueName` is set to `default-queue`, +- pod priorities, resource requests and limits have been adjusted. ## Kueue Configuration diff --git a/setup.k8s-v1.25/appwrapper/config_patch.yaml b/setup.k8s-v1.25/appwrapper/config_patch.yaml new file mode 100644 index 0000000..2f6fb03 --- /dev/null +++ b/setup.k8s-v1.25/appwrapper/config_patch.yaml @@ -0,0 +1,22 @@ +kind: ConfigMap +apiVersion: v1 +metadata: + name: appwrapper-operator-config + namespace: appwrapper-system +data: + config.yaml: | + appwrapper: + enableKueueIntegrations: true + kueueJobReconciller: + manageJobsWithoutQueueName: true + waitForPodsReady: + enable: false + queueName: default-queue + schedulerName: scheduler-plugins-scheduler + userRBACAdmissionCheck: false + controllerManager: + health: + bindAddress: ":8081" + metrics: + bindAddress: "127.0.0.1:8080" + leaderElection: true diff --git a/setup.k8s-v1.25/appwrapper/kustomization.yaml b/setup.k8s-v1.25/appwrapper/kustomization.yaml new file mode 100644 index 0000000..74e22d1 --- /dev/null +++ b/setup.k8s-v1.25/appwrapper/kustomization.yaml @@ -0,0 +1,15 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: mlbatch-system + +resources: +- "https://github.com/project-codeflare/appwrapper/config/default?ref=v0.21.0" + +images: +- name: quay.io/ibm/appwrapper + newTag: v0.21.0 + +patches: +- path: manager_resources_patch.yaml +- path: config_patch.yaml diff --git a/setup.k8s-v1.25/appwrapper/manager_resources_patch.yaml b/setup.k8s-v1.25/appwrapper/manager_resources_patch.yaml new file mode 100644 index 0000000..1b26c3c --- /dev/null +++ b/setup.k8s-v1.25/appwrapper/manager_resources_patch.yaml @@ -0,0 +1,18 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: controller-manager + namespace: system +spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: manager + resources: + requests: + cpu: 250m + memory: 250Mi + limits: + cpu: 1000m + memory: 1000Mi diff --git a/setup.k8s-v1.25/kuberay/kustomization.yaml b/setup.k8s-v1.25/kuberay/kustomization.yaml new file mode 100644 index 0000000..2d17535 --- /dev/null +++ b/setup.k8s-v1.25/kuberay/kustomization.yaml @@ -0,0 +1,11 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: mlbatch-system + +resources: +- "https://github.com/ray-project/kuberay/ray-operator/config/default?ref=v1.1.0" + +patches: +- path: remove_default_namespace.yaml +- path: manager_resources_patch.yaml diff --git a/setup.k8s-v1.25/kuberay/manager_resources_patch.yaml b/setup.k8s-v1.25/kuberay/manager_resources_patch.yaml new file mode 100644 index 0000000..7bb80d9 --- /dev/null +++ b/setup.k8s-v1.25/kuberay/manager_resources_patch.yaml @@ -0,0 +1,20 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kuberay-operator + namespace: system +spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: kuberay-operator + args: + - "--zap-log-level=2" + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 1000Mi diff --git a/setup.k8s-v1.25/kuberay/remove_default_namespace.yaml b/setup.k8s-v1.25/kuberay/remove_default_namespace.yaml new file mode 100644 index 0000000..b5977cc --- /dev/null +++ b/setup.k8s-v1.25/kuberay/remove_default_namespace.yaml @@ -0,0 +1,5 @@ +$patch: delete +apiVersion: v1 +kind: Namespace +metadata: + name: ray-system diff --git a/setup.k8s-v1.25/kueue/controller_manager_config.yaml b/setup.k8s-v1.25/kueue/controller_manager_config.yaml new file mode 100644 index 0000000..1e3813a --- /dev/null +++ b/setup.k8s-v1.25/kueue/controller_manager_config.yaml @@ -0,0 +1,64 @@ +apiVersion: config.kueue.x-k8s.io/v1beta1 +kind: Configuration +health: + healthProbeBindAddress: :8081 +metrics: + bindAddress: :8080 +# enableClusterQueueResources: true +webhook: + port: 9443 +leaderElection: + leaderElect: true + resourceName: c1f6bfd2.kueue.x-k8s.io +controller: + groupKindConcurrency: + Job.batch: 5 + Pod: 5 + Workload.kueue.x-k8s.io: 5 + LocalQueue.kueue.x-k8s.io: 1 + ClusterQueue.kueue.x-k8s.io: 1 + ResourceFlavor.kueue.x-k8s.io: 1 +clientConnection: + qps: 50 + burst: 100 +#pprofBindAddress: :8083 +waitForPodsReady: + enable: false +# timeout: 5m +# blockAdmission: false +# requeuingStrategy: +# timestamp: Eviction +# backoffLimitCount: null # null indicates infinite requeuing +# backoffBaseSeconds: 60 +# backoffMaxSeconds: 3600 +manageJobsWithoutQueueName: true +#internalCertManagement: +# enable: false +# webhookServiceName: "" +# webhookSecretName: "" +integrations: + frameworks: +# - "batch/job" + - "kubeflow.org/mpijob" + - "ray.io/rayjob" + - "ray.io/raycluster" + - "jobset.x-k8s.io/jobset" + - "kubeflow.org/mxjob" + - "kubeflow.org/paddlejob" + - "kubeflow.org/pytorchjob" + - "kubeflow.org/tfjob" + - "kubeflow.org/xgboostjob" + # - "pod" + externalFrameworks: + - "AppWrapper.v1beta2.workload.codeflare.dev" +# podOptions: +# namespaceSelector: +# matchExpressions: +# - key: kubernetes.io/metadata.name +# operator: NotIn +# values: [ kube-system, kueue-system ] +#fairSharing: +# enable: true +# preemptionStrategies: [LessThanOrEqualToFinalShare, LessThanInitialShare] +#resources: +# excludeResourcePrefixes: [] diff --git a/setup.k8s-v1.25/kueue/kustomization.yaml b/setup.k8s-v1.25/kueue/kustomization.yaml new file mode 100644 index 0000000..8eeea78 --- /dev/null +++ b/setup.k8s-v1.25/kueue/kustomization.yaml @@ -0,0 +1,37 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +namespace: mlbatch-system + +resources: +- "https://github.com/kubernetes-sigs/kueue/config/default?ref=v0.7.1" + +configMapGenerator: +- name: manager-config + namespace: kueue-system + behavior: replace + files: + - controller_manager_config.yaml + +images: +- name: gcr.io/k8s-staging-kueue/kueue + newName: registry.k8s.io/kueue/kueue + newTag: v0.7.1 + +patches: +- target: + kind: ClusterRole + name: manager-role + patch: | + - op: add + path: /rules/- + value: + apiGroups: + - workload.codeflare.dev + resources: + - appwrappers + verbs: + - get + - list + - watch +- path: manager_resources_patch.yaml diff --git a/setup.k8s-v1.25/kueue/manager_resources_patch.yaml b/setup.k8s-v1.25/kueue/manager_resources_patch.yaml new file mode 100644 index 0000000..c9dc8c1 --- /dev/null +++ b/setup.k8s-v1.25/kueue/manager_resources_patch.yaml @@ -0,0 +1,9 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kueue-manager + namespace: system +spec: + template: + spec: + priorityClassName: system-node-critical diff --git a/setup.k8s-v1.25/training-operator/kustomization.yaml b/setup.k8s-v1.25/training-operator/kustomization.yaml new file mode 100644 index 0000000..12a9821 --- /dev/null +++ b/setup.k8s-v1.25/training-operator/kustomization.yaml @@ -0,0 +1,13 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: mlbatch-system + +resources: +- "https://github.com/kubeflow/training-operator/manifests/base?ref=v1.7.0" + +images: +- name: kubeflow/training-operator + newTag: "v1-855e096" + +patches: +- path: manager_resources_patch.yaml diff --git a/setup.k8s-v1.25/training-operator/manager_resources_patch.yaml b/setup.k8s-v1.25/training-operator/manager_resources_patch.yaml new file mode 100644 index 0000000..5bc1f6d --- /dev/null +++ b/setup.k8s-v1.25/training-operator/manager_resources_patch.yaml @@ -0,0 +1,20 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: training-operator +spec: + template: + spec: + priorityClassName: system-node-critical + containers: + - name: training-operator + args: + - "--zap-log-level=2" + - "--gang-scheduler-name=scheduler-plugins-scheduler" + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 500m + memory: 1000Mi diff --git a/setup.tmpl/CLUSTER-SETUP.md.tmpl b/setup.tmpl/CLUSTER-SETUP.md.tmpl index a2d6f80..0a4c211 100644 --- a/setup.tmpl/CLUSTER-SETUP.md.tmpl +++ b/setup.tmpl/CLUSTER-SETUP.md.tmpl @@ -124,7 +124,43 @@ kueue-controller-manager's log: {{- else -}} ## Install Operators -TODO: *** UNDER CONSTRUCTION ** +Create the mlbatch-system namespace +```sh +{{ .KUBECTL }} create namespace mlbatch-system +``` + +Install the Kubeflow Training Operator +```sh +{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/training-operator +``` + +Install the KubeRay Operator +```sh +{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/kuberay +``` + +Install Kueue +```sh +{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/kueue +``` + +Install the AppWrapper Operator +```sh +{{ .KUBECTL }} apply --server-side -k setup.{{ .VERSION }}/appwrapper +``` +The provided configuration differs from the default configuration of the +operators as follows: +- Kubeflow Training Operator: + - `gang-scheduler-name` is set to `scheduler-plugins-scheduler`, +- Kueue: + - `manageJobsWithoutQueueName` is enabled, + - `batch/job` integration is disabled, + - `waitForPodsReady` is disabled, +- AppWrapper operator: + - `userRBACAdmissionCheck` is disabled, + - `schedulerName` is set to `scheduler-plugins-scheduler`, + - `queueName` is set to `default-queue`, +- pod priorities, resource requests and limits have been adjusted. {{- end }}