# Add e2e test for train API #52
# Workflow file for this run
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# End-to-end test workflow for the train API, triggered by pull_request events.
name: E2E Test with train API

on:
  - pull_request

# Runs are grouped per workflow and git ref; starting a new run in a group
# cancels the run of that group that is still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
# Single job: create a Kind cluster, run the build/deploy scripts for the
# training operator, build the trainer and storage-initializer images and load
# them into the cluster, then run the train API e2e test with pytest.
# The job is repeated for every Python version in the matrix.
jobs:
  e2e-test:
    runs-on: ubuntu-latest
    strategy:
      # Let the remaining matrix entries finish when one of them fails.
      fail-fast: false
      matrix:
        kubernetes-version: ["v1.28.7"]
        # Quoted so that "3.10" stays a string instead of the float 3.1.
        python-version: ["3.9", "3.10", "3.11"]
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # Local action stored in this repository (available after checkout).
      - name: Free-Up Disk Space
        uses: ./.github/workflows/free-up-disk-space

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      # NOTE(review): the captured source had the action ref mangled to
      # "[email protected]" by e-mail obfuscation; v1.10.0 is a placeholder for
      # the intended helm/kind-action v1.x.y tag - confirm before merging.
      - name: Create k8s Kind Cluster
        uses: helm/kind-action@v1.10.0
        with:
          node_image: kindest/node:${{ matrix.kubernetes-version }}
          cluster_name: training-operator-cluster
          kubectl_version: ${{ matrix.kubernetes-version }}

      - name: Build training-operator
        run: |
          ./scripts/gha/build-image.sh
        env:
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test

      # `docker system df` / `df -h` in this and the following steps only
      # print Docker and filesystem disk usage to the job log.
      - name: Deploy training operator
        run: |
          ./scripts/gha/setup-training-operator.sh
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINING_CI_IMAGE: kubeflowtraining/training-operator:test
          GANG_SCHEDULER_NAME: "none"
          KUBERNETES_VERSION: ${{ matrix.kubernetes-version }}

      # Removes every image not used by a container (-a), without prompting.
      - name: Prune docker images
        shell: bash
        run: |
          docker image prune -a -f
          docker system df
          df -h

      - name: Build trainer
        run: |
          ./scripts/gha/build-trainer.sh
          docker system df
          df -h
        env:
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Clean up build cache
        run: |
          docker builder prune --all --force
          docker volume ls
          docker system df
          df -h

      # The image and cluster names come from this step's `env:` and are read
      # as quoted shell variables instead of being templated into the script.
      - name: Load trainer
        run: |
          kind load docker-image "${TRAINER_CI_IMAGE}" --name "${KIND_CLUSTER}"
          docker image prune -a -f
          docker volume prune -f
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Build storage initializer
        run: |
          ./scripts/gha/build-storage-initializer.sh
          docker system df
          df -h
        env:
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_CI_IMAGE: kubeflowtraining/trainer:test

      - name: Clean up build cache
        run: |
          docker builder prune --all --force
          docker volume ls
          docker system df
          df -h

      - name: Load storage initializer
        run: |
          kind load docker-image "${STORAGE_INITIALIZER_CI_IMAGE}" --name "${KIND_CLUSTER}"
          docker image prune -a -f
          docker volume prune -f
          docker system df
          df -h
        env:
          KIND_CLUSTER: training-operator-cluster
          STORAGE_INITIALIZER_CI_IMAGE: kubeflowtraining/storage-initializer:test

      # Diagnostic output only: node description and pod list for the log.
      - name: Monitor resources usage of node
        run: |
          echo "Monitor resources usage of node"
          kubectl describe nodes training-operator-cluster-control-plane
          echo "Monitor resources usage of pods"
          kubectl get pods --all-namespaces

      - name: Run tests
        run: |
          python3 -m pip install pytest
          # Quoted so the shell cannot treat "[huggingface]" as a glob pattern.
          python3 -m pip install -e "sdk/python[huggingface]"
          pytest -s sdk/python/test/e2e-train-api/test_e2e_train_api.py --log-cli-level=debug
        env:
          STORAGE_INITIALIZER_IMAGE: kubeflowtraining/storage-initializer:test
          TRAINER_TRANSFORMER_IMAGE_DEFAULT: kubeflowtraining/trainer:test