Skip to content

Commit ced2123

Browse files
committed
adds smoke test workflow
users can dispatch a workflow that runs smoke tests against a selected branch Signed-off-by: James Kunstle <[email protected]>
1 parent 3df8441 commit ced2123

File tree

2 files changed

+154
-9
lines changed

2 files changed

+154
-9
lines changed

.github/workflows/e2e-nvidia-l4-x1.yml

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ on:
1515
- release-*
1616
paths:
1717
# note this should match the merging criteria in 'mergify.yml'
18-
- '**.py'
19-
- 'pyproject.toml'
20-
- 'requirements**.txt'
21-
- '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
18+
- "**.py"
19+
- "pyproject.toml"
20+
- "requirements**.txt"
21+
- ".github/workflows/e2e-nvidia-l4-x1.yml" # This workflow
22+
- "!tests/**" # we don't need to run e2e if we're just changing the tests.
2223

2324
concurrency:
2425
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -72,7 +73,7 @@ jobs:
7273
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
7374
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
7475
]
75-
76+
7677
e2e-medium-test:
7778
needs:
7879
- start-medium-ec2-runner
@@ -156,7 +157,7 @@ jobs:
156157
. venv/bin/activate
157158
# set preserve to true so we can retain the logs
158159
./scripts/e2e-ci.sh -mp
159-
160+
160161
# HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
161162
# Therefore we must disable the upload of the training logs, as they will not exist in the same location.
162163
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
@@ -203,7 +204,7 @@ jobs:
203204
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
204205
label: ${{ needs.start-medium-ec2-runner.outputs.label }}
205206
ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
206-
207+
207208
# - name: Download loss data
208209
# id: download-logs
209210
# uses: actions/download-artifact@v4
@@ -214,12 +215,12 @@ jobs:
214215
# - name: Install dependencies
215216
# run: |
216217
# pip install -r requirements-dev.txt
217-
218+
218219
# - name: Try to upload to s3
219220
# id: upload-s3
220221
# continue-on-error: true
221222
# run: |
222-
# output_file='./test.md'
223+
# output_file='./test.md'
223224
# python scripts/create-loss-graph.py \
224225
# --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
225226
# --output-file "${output_file}" \

.github/workflows/smoke-tests.yaml

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
name: "Run smoke tests via Tox::pytest"
4+
# These tests will be long running and require accelerated hardware.
5+
# They will help to verify that the library is *functionally* correct but
6+
# will not try to verify that the library is *correct*.
7+
8+
on:
9+
# TEMP - only runs when manually invoked
10+
# and only runs against branches in the repo.
11+
workflow_dispatch:
12+
inputs:
13+
branch:
14+
type: string
15+
default: main
16+
17+
permissions:
18+
contents: read
19+
20+
defaults:
21+
run:
22+
shell: bash
23+
24+
env:
25+
ec2_runner_variant: "g6e.12xlarge" # 4x L40s
26+
27+
jobs:
28+
start-ec2-runner:
29+
runs-on: ubuntu-latest
30+
outputs:
31+
label: ${{ steps.start-ec2-runner.outputs.label }}
32+
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id}}
33+
34+
steps:
35+
- name: "Harden runner"
36+
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
37+
with:
38+
egress-policy: audit
39+
40+
- name: "Configure AWS credentials"
41+
uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
42+
with:
43+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
44+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
45+
aws-region: ${{ vars.AWS_REGION }}
46+
47+
- name: "Start EC2 runner"
48+
id: start-ec2-runner
49+
uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
50+
with:
51+
mode: start
52+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
53+
ec2-image-id: ${{ vars.AWS_EC2_AMI }}
54+
ec2-instance-type: ${{ env.ec2_runner_variant }}
55+
subnet-id: subnet-024298cefa3bedd61
56+
security-group-id: sg-06300447c4a5fbef3
57+
iam-role-name: instructlab-ci-runner
58+
aws-resource-tags: >
59+
[
60+
{"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"},
61+
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
62+
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
63+
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
64+
]
65+
66+
run-smoke-tests:
67+
needs:
68+
- start-ec2-runner
69+
runs-on: ${{needs.start-ec2-runner.outputs.label}}
70+
# It is important that this job has no write permissions and has
71+
# no access to any secrets. This part is where we are running
72+
# untrusted code from PRs.
73+
permissions: {}
74+
steps:
75+
- name: "Harden runner"
76+
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
77+
with:
78+
egress-policy: audit
79+
80+
- name: "Install packages"
81+
run: |
82+
cat /etc/os-release
83+
sudo dnf install -y gcc gcc-c++ make git python3.11 python3.11-devel
84+
85+
- name: "Verify cuda environment is setup"
86+
run: |
87+
export CUDA_HOME="/usr/local/cuda"
88+
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
89+
export PATH="$PATH:$CUDA_HOME/bin"
90+
nvidia-smi
91+
92+
- name: "Checkout code"
93+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
94+
with:
95+
fetch-depth: 0
96+
ref: ${{inputs.branch}}
97+
98+
# installs in $GITHUB_WORKSPACE/venv.
99+
# only has to install Tox because Tox will do the other virtual environment management.
100+
- name: "Setup Python virtual environment"
101+
run: |
102+
python3.11 -m venv --upgrade-deps venv
103+
. venv/bin/activate
104+
pip install tox
105+
106+
- name: "Show disk utilization BEFORE tests"
107+
run: |
108+
df -h
109+
110+
- name: "Run unit tests with Tox and Pytest"
111+
run: |
112+
source venv/bin/activate
113+
tox -e py3-smoke
114+
115+
- name: "Show disk utilization AFTER tests"
116+
run: |
117+
df -h
118+
119+
stop-ec2-runner:
120+
needs:
121+
- start-ec2-runner
122+
- run-smoke-tests
123+
runs-on: ubuntu-latest
124+
if: ${{ always() }}
125+
steps:
126+
- name: "Harden runner"
127+
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
128+
with:
129+
egress-policy: audit
130+
131+
- name: "Configure AWS credentials"
132+
uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
133+
with:
134+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
135+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
136+
aws-region: ${{ vars.AWS_REGION }}
137+
138+
- name: "Stop EC2 runner"
139+
uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
140+
with:
141+
mode: stop
142+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
143+
label: ${{ needs.start-ec2-runner.outputs.label }}
144+
ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }}

0 commit comments

Comments
 (0)