Skip to content

Commit fce38cf

Browse files
authored
Merge pull request #424 from instructlab/smoke-testing
Adds smoke test workflow and tests
2 parents 694ca16 + b95099b commit fce38cf

File tree

8 files changed

+427
-11
lines changed

8 files changed

+427
-11
lines changed

.github/workflows/e2e-nvidia-l4-x1.yml

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@ on:
1515
- release-*
1616
paths:
1717
# note this should match the merging criteria in 'mergify.yml'
18-
- '**.py'
19-
- 'pyproject.toml'
20-
- 'requirements**.txt'
21-
- '.github/workflows/e2e-nvidia-l4-x1.yml' # This workflow
18+
- "**.py"
19+
- "pyproject.toml"
20+
- "requirements**.txt"
21+
- ".github/workflows/e2e-nvidia-l4-x1.yml" # This workflow
22+
- "!tests/**" # we don't need to run e2e if we're just changing the tests.
2223

2324
concurrency:
2425
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
@@ -72,7 +73,7 @@ jobs:
7273
{"Key": "GitHubRef", "Value": "${{ github.ref }}"},
7374
{"Key": "GitHubPR", "Value": "${{ github.event.number }}"}
7475
]
75-
76+
7677
e2e-medium-test:
7778
needs:
7879
- start-medium-ec2-runner
@@ -153,7 +154,7 @@ jobs:
153154
. venv/bin/activate
154155
# set preserve to true so we can retain the logs
155156
./scripts/e2e-ci.sh -mp
156-
157+
157158
# HACK(osilkin): The above test runs the medium workflow test which does not actually test the training library.
158159
# Therefore we must disable the upload of the training logs, as they will not exist in the same location.
159160
# we know that the file will be named something like f"/training_params_and_metrics_global{os.environ['RANK']}.jsonl" in python
@@ -200,7 +201,7 @@ jobs:
200201
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
201202
label: ${{ needs.start-medium-ec2-runner.outputs.label }}
202203
ec2-instance-id: ${{ needs.start-medium-ec2-runner.outputs.ec2-instance-id }}
203-
204+
204205
# - name: Download loss data
205206
# id: download-logs
206207
# uses: actions/download-artifact@v4
@@ -211,12 +212,12 @@ jobs:
211212
# - name: Install dependencies
212213
# run: |
213214
# pip install -r requirements-dev.txt
214-
215+
215216
# - name: Try to upload to s3
216217
# id: upload-s3
217218
# continue-on-error: true
218219
# run: |
219-
# output_file='./test.md'
220+
# output_file='./test.md'
220221
# python scripts/create-loss-graph.py \
221222
# --log-file "${{ steps.download-logs.outputs.download-path }}/training-log.jsonl" \
222223
# --output-file "${output_file}" \

.github/workflows/smoke.yaml

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
3+
name: "Run smoke tests via Tox::pytest"
4+
# These tests are long-running and require accelerated hardware.
5+
6+
on:
7+
workflow_dispatch:
8+
inputs:
9+
branch:
10+
type: string
11+
default: main
12+
# using this rather than pull_request because this workflow
13+
# needs to run in the context of the base branch (main) and
14+
# access the repo's secrets to start the AWS instances.
15+
pull_request_target:
16+
branches:
17+
- main
18+
- release-*
19+
20+
permissions:
21+
contents: read
22+
23+
defaults:
24+
run:
25+
shell: bash
26+
27+
env:
28+
ec2_runner_variant: "g6e.12xlarge" # 4x L40s
29+
30+
jobs:
31+
start-ec2-runner:
32+
runs-on: ubuntu-latest
33+
outputs:
34+
label: ${{ steps.start-ec2-runner.outputs.label }}
35+
ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id}}
36+
37+
steps:
38+
- name: "Harden runner"
39+
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
40+
with:
41+
egress-policy: audit
42+
43+
- name: "Configure AWS credentials"
44+
uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
45+
with:
46+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
47+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
48+
aws-region: ${{ vars.AWS_REGION }}
49+
50+
- name: "Start EC2 runner"
51+
id: start-ec2-runner
52+
uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
53+
with:
54+
mode: start
55+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
56+
ec2-image-id: ${{ vars.AWS_EC2_AMI }}
57+
ec2-instance-type: ${{ env.ec2_runner_variant }}
58+
subnet-id: subnet-024298cefa3bedd61
59+
security-group-id: sg-06300447c4a5fbef3
60+
iam-role-name: instructlab-ci-runner
61+
aws-resource-tags: >
62+
[
63+
{"Key": "Name", "Value": "instructlab-ci-github-smoketest-runner"},
64+
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"},
65+
{"Key": "GitHubRef", "Value": "${{ github.ref }}"}
66+
]
67+
68+
run-smoke-tests:
69+
needs:
70+
- start-ec2-runner
71+
runs-on: ${{needs.start-ec2-runner.outputs.label}}
72+
# It is important that this job has no write permissions and has
73+
# no access to any secrets. This part is where we are running
74+
# untrusted code from PRs.
75+
permissions: {}
76+
steps:
77+
- name: "Harden runner"
78+
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
79+
with:
80+
egress-policy: audit
81+
82+
- name: "Install packages"
83+
run: |
84+
cat /etc/os-release
85+
sudo dnf install -y gcc gcc-c++ make git-core python3.11 python3.11-devel
86+
87+
- name: "Verify cuda environment is setup"
88+
run: |
89+
export CUDA_HOME="/usr/local/cuda"
90+
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CUDA_HOME}/lib64:${CUDA_HOME}/extras/CUPTI/lib64"
91+
export PATH="${PATH}:${CUDA_HOME}/bin"
92+
nvidia-smi
93+
94+
- name: "Checkout code"
95+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
96+
with:
97+
fetch-depth: 0
98+
ref: ${{ inputs.branch || github.event.pull_request.head.sha }} # 'inputs' is empty on pull_request_target, so fall back to the PR head commit
99+
100+
# installs in $GITHUB_WORKSPACE/venv.
101+
# only has to install Tox because Tox will do the other virtual environment management.
102+
- name: "Setup Python virtual environment"
103+
run: |
104+
python3.11 -m venv --upgrade-deps venv
105+
. venv/bin/activate
106+
pip install tox
107+
108+
- name: "Show disk utilization BEFORE tests"
109+
run: |
110+
df -h
111+
112+
- name: "Run smoke tests with Tox and Pytest"
113+
run: |
114+
source venv/bin/activate
115+
tox -e py3-smoke
116+
117+
- name: "Show disk utilization AFTER tests"
118+
run: |
119+
df -h
120+
121+
stop-ec2-runner:
122+
needs:
123+
- start-ec2-runner
124+
- run-smoke-tests
125+
runs-on: ubuntu-latest
126+
if: ${{ always() }}
127+
steps:
128+
- name: "Harden runner"
129+
uses: step-security/harden-runner@cb605e52c26070c328afc4562f0b4ada7618a84e # v2.10.1
130+
with:
131+
egress-policy: audit
132+
133+
- name: "Configure AWS credentials"
134+
uses: "aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502" # v4.0.2
135+
with:
136+
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
137+
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
138+
aws-region: ${{ vars.AWS_REGION }}
139+
140+
- name: "Stop EC2 runner"
141+
uses: machulav/ec2-github-runner@28fbe1c4d7d9ba74134ca5ebc559d5b0a989a856 # v2.3.8
142+
with:
143+
mode: stop
144+
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }}
145+
label: ${{ needs.start-ec2-runner.outputs.label }}
146+
ec2-instance-id: ${{ needs.start-ec2-runner.outputs.ec2-instance-id }}

pyproject.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,3 +107,8 @@ exclude = [
107107
]
108108
# honor excludes by not following them through imports
109109
follow_imports = "silent"
110+
111+
[tool.pytest.ini_options]
112+
markers = [
113+
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
114+
]

requirements-dev.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@ ipython
1313
ipykernel
1414
jupyter
1515

16+
huggingface_hub

0 commit comments

Comments
 (0)