Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 10 additions & 9 deletions .github/workflows/e2e-nvidia-l4-x1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ on:
- release-*
paths:
# note this should match the merging criteria in 'mergify.yml'
- '**.py'
- 'pyproject.toml'
- 'requirements**.txt'
- '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
- "**.py"
- "pyproject.toml"
- "requirements**.txt"
- ".github/workflows/e2e-nvidia-l4-x1.yml" # This workflow
- "!tests/**" # we don't need to run e2e if we're just changing the tests.

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
Expand Down Expand Up @@ -72,7 +73,7 @@ jobs:
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
]

e2e-medium-test:
needs:
- start-medium-ec2-runner
Expand Down Expand Up @@ -153,7 +154,7 @@ jobs:
. venv/bin/activate
# set preserve to true so we can retain the logs
./scripts/e2e-ci.sh -mp

# HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so, if this job doesn't use the training library should we remove it?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this uses --pipeline full which uses the full loop from ilab (sorry I did this 😆 )

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this might be tangential to this PR but might be nice to see a green CI on this by just removing the test.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we do that in a follow-up so we're doing one thing at a time in this PR?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure!

# Therefore we must disable the upload of the training logs, as they will not exist in the same location.
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
Expand Down Expand Up @@ -200,7 +201,7 @@ jobs:
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
label: ${{ needs.start-medium-ec2-runner.outputs.label }}
ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}

# - name: Download loss data
# id: download-logs
# uses: actions/download-artifact@v4
Expand All @@ -211,12 +212,12 @@ jobs:
# - name: Install dependencies
# run: |
# pip install -r requirements-dev.txt

# - name: Try to upload to s3
# id: upload-s3
# continue-on-error: true
# run: |
# output_file='./test.md'
# output_file='./test.md'
# python scripts/create-loss-graph.py \
# --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
# --output-file "${output_file}" \
Expand Down
146 changes: 146 additions & 0 deletions .github/workflows/smoke.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
# SPDX-License-Identifier: Apache-2.0

# Smoke-test workflow: long-running tests that need accelerated (GPU) hardware.
name: 'Run smoke tests via Tox::pytest'

on:
  # Manual trigger; the caller may choose which branch to test.
  workflow_dispatch:
    inputs:
      branch:
        type: string
        default: 'main'
  # pull_request_target (not pull_request) is used deliberately: the workflow
  # has to run in the context of the base branch (main) so it can read the
  # repository secrets required to start the AWS instances.
  pull_request_target:
    branches:
      - 'main'
      - 'release-*'

# Workflow-wide default token scope.
permissions:
  contents: read

# Every `run:` step uses bash unless it says otherwise.
defaults:
  run:
    shell: bash

env:
  # EC2 instance type used for the self-hosted runner (4x L40s).
  ec2_runner_variant: 'g6e.12xlarge'
jobs:
  # Starts a self-hosted runner on EC2 and exposes its runner label and
  # instance id as job outputs for the jobs that follow.
  start-ec2-runner:
    runs-on: ubuntu-latest
    outputs:
      label: ${{ steps.start-ec2-runner.outputs.label }}
      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}

    steps:
      - name: "Harden runner"
        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e  # v2.10.1
        with:
          egress-policy: audit

      - name: "Configure AWS credentials"
        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502"  # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: "Start EC2 runner"
        id: start-ec2-runner
        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856  # v2.3.8
        with:
          mode: start
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          ec2-image-id: ${{ vars.AWS_EC2_AMI }}
          ec2-instance-type: ${{ env.ec2_runner_variant }}
          subnet-id: subnet-024298cefa3bedd61
          security-group-id: sg-06300447c4a5fbef3
          iam-role-name: instructlab-ci-runner
          # This value must be strict JSON: no trailing comma after the last
          # element, otherwise it cannot be parsed as a JSON array.
          aws-resource-tags: >
            [
              {"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"},
              {"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
              {"Key": "GitHubRef", "Value": "${{ github.ref }}"}
            ]

  # Runs the smoke tests on the EC2 runner started above.
  run-smoke-tests:
    needs:
      - start-ec2-runner
    runs-on: ${{ needs.start-ec2-runner.outputs.label }}
    # It is important that this job has no write permissions and has
    # no access to any secrets. This part is where we are running
    # untrusted code from PRs.
    permissions: {}
    steps:
      - name: "Harden runner"
        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e  # v2.10.1
        with:
          egress-policy: audit

      - name: "Install packages"
        run: |
          cat /etc/os-release
          sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel

      # The exports below only apply to this step's shell; the step's purpose
      # is to confirm that `nvidia-smi` runs on the instance.
      - name: "Verify cuda environment is setup"
        run: |
          export CUDA_HOME="/usr/local/cuda"
          export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
          export PATH="${PATH}:${CUDA_HOME}/bin"
          nvidia-smi

      # NOTE(review): `inputs.branch` is only declared for workflow_dispatch.
      # On pull_request_target it is presumably empty, in which case checkout
      # falls back to its default ref (likely the base branch rather than the
      # PR head) - confirm this is the code the smoke tests should exercise.
      - name: "Checkout code"
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683  # v4.2.2
        with:
          fetch-depth: 0
          ref: ${{ inputs.branch }}

      # installs in $GITHUB_WORKSPACE/venv.
      # only has to install Tox because Tox will do the other virtual environment management.
      - name: "Setup Python virtual environment"
        run: |
          python3.11 -m venv --upgrade-deps venv
          . venv/bin/activate
          pip install tox

      - name: "Show disk utilization BEFORE tests"
        run: |
          df -h

      - name: "Run smoke tests with Tox and Pytest"
        run: |
          source venv/bin/activate
          tox -e py3-smoke

      - name: "Show disk utilization AFTER tests"
        run: |
          df -h

  # Stops the EC2 runner. `always()` makes this job run even when an earlier
  # job failed or was cancelled, so the instance is not left running.
  stop-ec2-runner:
    needs:
      - start-ec2-runner
      - run-smoke-tests
    runs-on: ubuntu-latest
    if: ${{ always() }}
    steps:
      - name: "Harden runner"
        uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e  # v2.10.1
        with:
          egress-policy: audit

      - name: "Configure AWS credentials"
        uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502"  # v4.0.2
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ vars.AWS_REGION }}

      - name: "Stop EC2 runner"
        uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856  # v2.3.8
        with:
          mode: stop
          github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
          label: ${{ needs.start-ec2-runner.outputs.label }}
          ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }}
File renamed without changes.
5 changes: 5 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,3 +107,8 @@ exclude = [
]
# honor excludes by not following there through imports
follow_imports = "silent"

[tool.pytest.ini_options]
# Custom pytest markers declared for this project.
markers = ["slow: marks tests as slow (deselect with '-m \"not slow\"')"]
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ ipython
ipykernel
jupyter

huggingface_hub
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we want a version for that?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

only if we know there's a minimal version that we have to have.

Loading