From 7b8a4c9b2c8d9fe3a08523ab0f948277ee6a7b7d Mon Sep 17 00:00:00 2001
From: David Grove
Date: Thu, 20 Mar 2025 12:40:21 -0400
Subject: [PATCH 1/4] fill in top-level TODO items

---
 setup.KubeConEU25/README.md | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/setup.KubeConEU25/README.md b/setup.KubeConEU25/README.md
index 6021e12..1ec137f 100644
--- a/setup.KubeConEU25/README.md
+++ b/setup.KubeConEU25/README.md
@@ -121,10 +121,10 @@ cd mlbatch
 # Setup priority classes
 kubectl apply -f setup.k8s/mlbatch-priorities.yaml
 
-# Deploy scheduler plugins
+# Deploy scheduler-plugins
 helm install scheduler-plugins --namespace scheduler-plugins --create-namespace scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ --set-json pluginConfig='[{"args":{"scoringStrategy":{"resources":[{"name":"nvidia.com/gpu","weight":1}],"requestedToCapacityRatio":{"shape":[{"utilization":0,"score":0},{"utilization":100,"score":10}]},"type":"RequestedToCapacityRatio"}},"name":"NodeResourcesFit"},{"args":{"permitWaitingTimeSeconds":300},"name":"Coscheduling"}]'
 
-# Wait for scheduler-plugins pods to be running
+# Wait for scheduler-plugins pods to be ready
 while [[ $(kubectl get pods -n scheduler-plugins -o 'jsonpath={..status.conditions[?(@.type=="Ready")].status}' | tr ' ' '\n' | sort -u) != "True" ]]
 do
     echo -n "." && sleep 1;
@@ -154,8 +154,6 @@ do
 done
 echo ""
 
-kubectl get pods -n mlbatch-system
-
 # Deploy AppWrapper
 kubectl apply --server-side -k setup.k8s/appwrapper/coscheduling
 
@@ -496,7 +494,8 @@ kubectl label servicemonitors.monitoring.coreos.com -n nvidia-gpu-operator nvidi
 
 ## Workload Management
 
-TODO
+- We will now demonstrate the queueing, quota management, and fault recovery capabilities
+  of MLBatch using synthetic workloads.
@@ -627,7 +626,8 @@ The two containers are synchronized as follows: `load-generator` waits for
 
 ### Pre-Training with PyTorch
 
-TODO
+In this example, `alice` uses [PyTorch]() to pre-training a model using the
+[Kubeflow Training Operator](https://github.com/kubeflow/training-operator).
@@ -637,7 +637,8 @@ TODO
 
 ### Fine-Tuning with Ray
 
-TODO
+In this example, `alice` uses [Ray](https://github.com/ray-project/ray) to fine tune a model using
+[KubeRay](https://github.com/ray-project/kuberay).
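The one-line `--set-json pluginConfig=...` in the patch above is hard to read and easy to mistype. The same configuration can be kept in a Helm values file instead; the sketch below is a direct YAML transcription of that JSON, assuming the `as-a-second-scheduler` chart reads the top-level `pluginConfig` value (as the `--set-json` flag implies) and that GPUs are exposed by the NVIDIA device plugin as `nvidia.com/gpu`:

```yaml
# values.yaml -- transcription of the inline pluginConfig JSON above
pluginConfig:
- name: NodeResourcesFit
  args:
    scoringStrategy:
      type: RequestedToCapacityRatio
      resources:
      - name: nvidia.com/gpu
        weight: 1
      requestedToCapacityRatio:
        # Score grows with utilization: nodes whose GPUs are already
        # mostly allocated score higher, so GPU workloads are bin-packed
        # onto busy nodes rather than spread across the cluster.
        shape:
        - utilization: 0
          score: 0
        - utilization: 100
          score: 10
- name: Coscheduling
  args:
    # A gang-scheduled pod waits at most 300s for its siblings before
    # the gang is rejected and its resources are released.
    permitWaitingTimeSeconds: 300
```

With this file the install becomes `helm install scheduler-plugins --namespace scheduler-plugins --create-namespace scheduler-plugins/manifests/install/charts/as-a-second-scheduler/ -f values.yaml`.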
From 9552b28844916f1c31f1c773551d752d45a5d853 Mon Sep 17 00:00:00 2001
From: David Grove
Date: Thu, 20 Mar 2025 12:48:20 -0400
Subject: [PATCH 2/4] tweaks

---
 setup.KubeConEU25/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.KubeConEU25/README.md b/setup.KubeConEU25/README.md
index 1ec137f..c406677 100644
--- a/setup.KubeConEU25/README.md
+++ b/setup.KubeConEU25/README.md
@@ -505,7 +505,8 @@ TODO
 
 ## Example Workloads
 
-We now run a few example workloads.
+We will now run some sample workloads that are representative of what is run on
+a typical AI GPU Cluster.
 
 ### Batch Inference with vLLM
 
From 191b6335936548a4e5c9c179b56115f52e6ddd51 Mon Sep 17 00:00:00 2001
From: David Grove
Date: Thu, 20 Mar 2025 12:53:04 -0400
Subject: [PATCH 3/4] tweaks

---
 setup.KubeConEU25/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/setup.KubeConEU25/README.md b/setup.KubeConEU25/README.md
index c406677..7de94f2 100644
--- a/setup.KubeConEU25/README.md
+++ b/setup.KubeConEU25/README.md
@@ -506,7 +506,7 @@ TODO
 ## Example Workloads
 
 We will now run some sample workloads that are representative of what is run on
-a typical AI GPU Cluster.
+an AI GPU cluster.
 
 ### Batch Inference with vLLM
 
@@ -627,8 +627,8 @@ The two containers are synchronized as follows: `load-generator` waits for
 
 ### Pre-Training with PyTorch
 
-In this example, `alice` uses [PyTorch]() to pre-training a model using the
-[Kubeflow Training Operator](https://github.com/kubeflow/training-operator).
+In this example, `alice` uses the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator)
+to run a job that uses [PyTorch](https://pytorch.org) to train a machine learning model.
@@ -638,8 +638,8 @@ TODO
 
 ### Fine-Tuning with Ray
 
-In this example, `alice` uses [Ray](https://github.com/ray-project/ray) to fine tune a model using
-[KubeRay](https://github.com/ray-project/kuberay).
+In this example, `alice` uses [KubeRay](https://github.com/ray-project/kuberay) to run a job that
+uses [Ray](https://github.com/ray-project/ray) to fine-tune a machine learning model.
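Patch 3 settles the wording of the PyTorch and Ray sections; the manifests themselves are not part of this diff. For orientation, a pre-training job of the kind the PyTorch section describes would be expressed as a `PyTorchJob` for the Kubeflow Training Operator. The sketch below is illustrative only: the name, image, script, and replica sizing are invented for the example, and in an MLBatch cluster such a job would typically be submitted inside an AppWrapper so that it is queued and gang-scheduled.

```yaml
# Illustrative sketch only -- not part of the patched README.
apiVersion: kubeflow.org/v1
kind: PyTorchJob
metadata:
  name: alice-pretrain          # hypothetical name
spec:
  pytorchReplicaSpecs:
    Master:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
          - name: pytorch       # the Training Operator expects this container name
            image: pytorch/pytorch:2.4.0-cuda12.1-cudnn9-runtime
            # train.py (hypothetical) reads the MASTER_ADDR/MASTER_PORT/RANK
            # environment variables the operator injects for torch.distributed.
            command: ["python", "train.py"]
            resources:
              limits:
                nvidia.com/gpu: 1
    Worker:
      replicas: 1
      restartPolicy: OnFailure
      template:
        spec:
          containers:
          - name: pytorch
            image: pytorch/pytorch:2.4.0-cuda12.1-cudnn9-runtime
            command: ["python", "train.py"]
            resources:
              limits:
                nvidia.com/gpu: 1
```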
From 8ee775b20746acb4ea10ddb0a0157009ddef652d Mon Sep 17 00:00:00 2001
From: David Grove
Date: Thu, 20 Mar 2025 12:55:04 -0400
Subject: [PATCH 4/4] fix indent

---
 setup.KubeConEU25/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.KubeConEU25/README.md b/setup.KubeConEU25/README.md
index 7de94f2..b5acc8f 100644
--- a/setup.KubeConEU25/README.md
+++ b/setup.KubeConEU25/README.md
@@ -494,8 +494,8 @@ kubectl label servicemonitors.monitoring.coreos.com -n nvidia-gpu-operator nvidi
 
 ## Workload Management
 
-- We will now demonstrate the queueing, quota management, and fault recovery capabilities
-  of MLBatch using synthetic workloads.
+We will now demonstrate the queueing, quota management, and fault recovery capabilities
+of MLBatch using synthetic workloads.
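For the fine-tuning section, the corresponding KubeRay resource is a `RayJob`: it provisions a Ray cluster, runs the entrypoint against it, and can tear the cluster down afterwards. Again a sketch under stated assumptions; the entrypoint script, images, and sizing are invented for illustration:

```yaml
# Illustrative sketch only -- not part of the patched README.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: alice-finetune             # hypothetical name
spec:
  entrypoint: python finetune.py   # finetune.py is assumed to ship in the image
  shutdownAfterJobFinishes: true   # delete the Ray cluster once the job completes
  rayClusterSpec:
    headGroupSpec:
      rayStartParams: {}
      template:
        spec:
          containers:
          - name: ray-head
            image: rayproject/ray:2.9.0
            resources:
              limits:
                cpu: "2"
                memory: 8Gi
    workerGroupSpecs:
    - groupName: gpu-workers
      replicas: 1
      rayStartParams: {}
      template:
        spec:
          containers:
          - name: ray-worker
            image: rayproject/ray:2.9.0-gpu
            resources:
              limits:
                nvidia.com/gpu: 1
```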