chore: remove etcd in favor of redis #345
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: CI/CD | |
on: | |
push: | |
branches: [ "main" ] | |
pull_request: | |
branches: [ "**" ] # Adjust branches as needed | |
release: | |
types: [published] | |
permissions: | |
id-token: write # Required for OIDC | |
contents: read # Required for checkout | |
packages: read # Required for GHCR access | |
jobs: | |
test: | |
runs-on: ubuntu-latest | |
steps: | |
- uses: actions/checkout@v4 | |
- uses: astral-sh/setup-uv@v4 | |
with: | |
enable-cache: true | |
cache-dependency-glob: "**/pyproject.toml" | |
- name: Cache dependencies | |
uses: actions/cache@v3 | |
with: | |
path: ${{ env.UV_CACHE_DIR }} | |
key: ${{ runner.os }}-uv-${{ hashFiles('**/pyproject.toml') }} | |
restore-keys: | | |
${{ runner.os }}-uv- | |
- name: Install dependencies | |
run: | | |
export ACLOCAL=aclocal | |
export AUTOMAKE=automake | |
uv sync | |
- name: Run Ruff format check | |
run: uv run ruff format --check | |
- name: Run Ruff linting | |
run: uv run ruff check --exclude packages/verifier/ | |
- name: Create .env for tests | |
run: | | |
cp .env.ci .env | |
# Set dummy secrets for unit tests | |
sed -i 's/HF_TOKEN=.*/HF_TOKEN=dummy_token/' .env | |
sed -i 's/BRAVE_SEARCH_API=.*/BRAVE_SEARCH_API=dummy_api/' .env | |
- name: pyright | |
run: uv run pyright | |
- name: Run unit tests | |
run: uv run pytest -v tests/unit | |
- name: Run integration tests | |
run: uv run pytest -v tests/integration | |
start-runner: | |
name: Start self-hosted EC2 runner | |
runs-on: ubuntu-24.04 | |
needs: test | |
outputs: | |
label: ${{ steps.start-ec2-runner.outputs.label }} | |
ec2-instances-ids: ${{ steps.start-ec2-runner.outputs.ec2-instances-ids }} | |
steps: | |
- name: Configure AWS credentials | |
uses: aws-actions/[email protected] | |
with: | |
aws-access-key-id: ${{ secrets.GH_AWS_ACCESS_KEY }} | |
aws-secret-access-key: ${{ secrets.GH_AWS_SECRET_KEY }} | |
aws-region: "us-east-1" | |
- name: Start EC2 runner | |
id: start-ec2-runner | |
uses: NillionNetwork/[email protected] | |
with: | |
mode: start | |
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} | |
runners-per-machine: 3 | |
number-of-machines: 1 | |
ec2-image-id: ami-0e70d84403fc045d7 | |
ec2-instance-type: g6.xlarge | |
subnet-id: subnet-0bb357f46d1bc355c | |
security-group-id: sg-022a5cdcf57e9618b | |
key-name: us-east-1-github-runner-key | |
iam-role-name: github-runners-us-east-1-github-runner-ec2 | |
aws-resource-tags: > | |
[ | |
{"Key": "Name", "Value": "github-runner-${{ github.run_id }}-${{ github.run_number }}"}, | |
{"Key": "GitHubRepository", "Value": "${{ github.repository }}"}, | |
{"Key": "KeyName", "Value": "github-runners-key"}, | |
{"Key": "Deployment", "Value": "github-runners"}, | |
{"Key": "Type", "Value": "GithubRunner"}, | |
{"Key": "User", "Value": "ec2-user"}, | |
{"Key": "Environment", "Value": "production"} | |
] | |
download-image: | |
name: Download images | |
needs: start-runner | |
runs-on: ${{ needs.start-runner.outputs.label }} | |
steps: | |
# Pull all images required for the docker compose file | |
- name: Compose docker-compose.yml | |
run: python3 ./scripts/docker-composer.py --dev -f docker/compose/docker-compose.llama-1b-gpu.ci.yml -o development-compose.yml | |
- name: Pull images | |
run: docker compose -f development-compose.yml pull | |
build-images: | |
name: Build ${{ matrix.component }} image | |
needs: start-runner | |
runs-on: ${{ needs.start-runner.outputs.label }} | |
strategy: | |
matrix: | |
component: [vllm, api] | |
include: | |
- component: api | |
build_args: "--target nilai --platform linux/amd64" | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v2 | |
- name: Build ${{ matrix.component }} image | |
run: | | |
echo "Building ${{ matrix.component }} image..." | |
docker build -t nillion/nilai-${{ matrix.component }}:latest -f docker/${{ matrix.component }}.Dockerfile ${{ matrix.build_args || '' }} . | |
echo "✅ ${{ matrix.component }} build completed successfully" | |
e2e-tests: | |
name: E2E Tests | |
needs: [start-runner, build-images] | |
runs-on: ${{ needs.start-runner.outputs.label }} | |
steps: | |
- name: Checkout | |
uses: actions/checkout@v2 | |
- uses: astral-sh/setup-uv@v4 | |
with: | |
enable-cache: true | |
cache-dependency-glob: "**/pyproject.toml" | |
- name: Install dependencies | |
run: | | |
apt-get update && apt-get install curl git pkg-config automake file python3.12-dev -y | |
export ACLOCAL=aclocal | |
export AUTOMAKE=automake | |
uv sync | |
- name: Create .env | |
run: | | |
cp .env.ci .env | |
# Copy secret into .env replacing the existing HF_TOKEN | |
sed -i 's/HF_TOKEN=.*/HF_TOKEN=${{ secrets.HF_TOKEN }}/' .env | |
sed -i 's/BRAVE_SEARCH_API=.*/BRAVE_SEARCH_API=${{ secrets.BRAVE_SEARCH_API }}/' .env | |
sed -i 's/NILDB_BUILDER_PRIVATE_KEY=.*/NILDB_BUILDER_PRIVATE_KEY=${{ secrets.NILDB_BUILDER_PRIVATE_KEY }}/' .env | |
sed -i 's/NILDB_COLLECTION=.*/NILDB_COLLECTION=${{ secrets.NILDB_COLLECTION }}/' .env | |
- name: Login to GitHub Container Registry | |
uses: docker/login-action@v3 | |
with: | |
registry: ghcr.io | |
username: ${{ github.actor }} | |
password: ${{ secrets.GITHUB_TOKEN }} | |
- name: Compose docker-compose.yml | |
run: python3 ./scripts/docker-composer.py --dev -f docker/compose/docker-compose.llama-1b-gpu.ci.yml -o development-compose.yml | |
- name: GPU stack versions (non-fatal) | |
shell: bash | |
run: | | |
set +e # never fail this step | |
echo "::group::Host & kernel" | |
uname -a || true | |
echo "Kernel: $(uname -r 2>/dev/null || echo unknown)" | |
test -e /var/run/reboot-required && echo "Reboot flag: PRESENT" || echo "Reboot flag: none" | |
echo "::endgroup::" | |
echo "::group::NVIDIA driver" | |
if command -v nvidia-smi >/dev/null 2>&1; then | |
nvidia-smi || true | |
echo "Driver version (nvidia-smi): $(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n1 || echo unknown)" | |
echo "GPU(s):"; nvidia-smi -L || true | |
else | |
echo "nvidia-smi: not found" | |
fi | |
if [ -r /proc/driver/nvidia/version ]; then | |
echo "--- /proc/driver/nvidia/version ---" | |
cat /proc/driver/nvidia/version || true | |
else | |
echo "/proc/driver/nvidia/version: not present" | |
fi | |
command -v modinfo >/dev/null 2>&1 && { echo "--- modinfo nvidia (head) ---"; modinfo nvidia 2>/dev/null | head -n 20 || true; } || true | |
echo "::endgroup::" | |
echo "::group::DKMS status" | |
command -v dkms >/dev/null 2>&1 && dkms status | grep -i nvidia || echo "dkms or nvidia dkms info not present" | |
echo "::endgroup::" | |
echo "::group::CUDA toolkit/runtime" | |
if command -v nvcc >/dev/null 2>&1; then | |
nvcc --version || true | |
else | |
echo "nvcc: not found" | |
fi | |
echo "libcudart in ldconfig:" | |
ldconfig -p 2>/dev/null | grep -i libcudart || echo "libcudart not found in ldconfig cache" | |
echo "NCCL packages:" | |
dpkg -l 2>/dev/null | grep -iE '^ii\s+libnccl' || echo "NCCL not installed (Debian/Ubuntu dpkg check)" | |
echo "::endgroup::" | |
echo "::group::Container stack" | |
docker --version || echo "docker: not found" | |
docker info 2>/dev/null | grep -iE 'Runtimes|nvidia' || echo "docker info: no nvidia runtime line found" | |
containerd --version 2>/dev/null || echo "containerd: not found" | |
runc --version 2>/dev/null || echo "runc: not found" | |
echo "::endgroup::" | |
echo "::group::NVIDIA container runtime/toolkit" | |
# Legacy/runtime binaries | |
if command -v nvidia-container-runtime >/dev/null 2>&1; then | |
nvidia-container-runtime --version || nvidia-container-runtime -v || true | |
else | |
echo "nvidia-container-runtime: not found" | |
fi | |
# Toolkit binaries (newer distros) | |
if command -v nvidia-ctk >/dev/null 2>&1; then | |
nvidia-ctk --version || true | |
nvidia-ctk runtime configure --help >/dev/null 2>&1 || true | |
else | |
echo "nvidia-ctk: not found" | |
fi | |
if command -v nvidia-container-toolkit >/dev/null 2>&1; then | |
nvidia-container-toolkit --version || true | |
else | |
echo "nvidia-container-toolkit: not found" | |
fi | |
echo "libnvidia-container packages:" | |
dpkg -l 2>/dev/null | grep -iE '^ii\s+(libnvidia-container1|libnvidia-container-tools)\s' || echo "libnvidia-container packages not found (dpkg)" | |
# Show runtime config if present | |
if [ -f /etc/nvidia-container-runtime/config.toml ]; then | |
echo "--- /etc/nvidia-container-runtime/config.toml (head) ---" | |
sed -n '1,120p' /etc/nvidia-container-runtime/config.toml || true | |
else | |
echo "/etc/nvidia-container-runtime/config.toml: not present" | |
fi | |
echo "::endgroup::" | |
echo "::group::Apt logs (NVIDIA-related entries)" | |
for f in /var/log/apt/history.log /var/log/apt/term.log /var/log/unattended-upgrades/unattended-upgrades.log; do | |
if [[ -f "$f" ]]; then | |
echo "--- scanning $f" | |
grep -H -i -E 'nvidia|cuda|container-toolkit' "$f" || echo "no recent NVIDIA entries" | |
else | |
echo "missing: $f" | |
fi | |
done | |
echo "::endgroup::" | |
- name: Start Services | |
run: | | |
docker-compose -f development-compose.yml up -d | |
docker ps -a | |
- name: Wait for services to be healthy | |
run: bash scripts/wait_for_ci_services.sh | |
- name: Run E2E tests for NUC | |
run: | | |
set -e | |
export ENVIRONMENT=ci | |
export AUTH_STRATEGY=nuc | |
uv run pytest -v tests/e2e | |
- name: Run E2E tests for API Key | |
run: | | |
set -e | |
# Create a user with a rate limit of 1000 requests per minute, hour, and day | |
export AUTH_TOKEN=$(docker exec nilai-api uv run src/nilai_api/commands/add_user.py --name test1 --ratelimit-minute 1000 --ratelimit-hour 1000 --ratelimit-day 1000 | jq ".apikey" -r) | |
export ENVIRONMENT=ci | |
# Set the environment variable for the API key | |
export AUTH_STRATEGY=api_key | |
uv run pytest -v tests/e2e | |
- name: Stop Services | |
run: | | |
docker-compose -f development-compose.yml down -v | |
push-images: | |
name: Push ${{ matrix.component }} to ECR | |
needs: [start-runner, build-images, e2e-tests] | |
runs-on: ${{ needs.start-runner.outputs.label }} | |
if: (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'release' | |
strategy: | |
matrix: | |
component: [vllm, api] | |
steps: | |
- name: Configure AWS credentials for ECR | |
uses: aws-actions/configure-aws-credentials@v4 | |
with: | |
role-to-assume: "arn:aws:iam::054037142884:role/nilAI-github" | |
aws-region: "us-east-1" | |
- name: Login to Amazon ECR | |
id: login-ecr | |
uses: aws-actions/amazon-ecr-login@v2 | |
with: | |
registry-type: public | |
- name: Set image tags | |
id: image-tags | |
run: | | |
IMAGE_TAG="${{ github.event_name == 'release' && github.ref_name || github.sha }}" | |
echo "image_tag=${IMAGE_TAG}" >> $GITHUB_OUTPUT | |
- name: Tag and push ${{ matrix.component }} to ECR | |
env: | |
ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} | |
ECR_REGISTRY_ALIAS: k5d9x2g2 | |
IMAGE_TAG: ${{ steps.image-tags.outputs.image_tag }} | |
run: | | |
echo "Tagging and pushing ${{ matrix.component }} image to ECR..." | |
# Tag for ECR | |
docker tag nillion/nilai-${{ matrix.component }}:latest ${ECR_REGISTRY}/${ECR_REGISTRY_ALIAS}/nilai-${{ matrix.component }}:${IMAGE_TAG} | |
# Push to ECR | |
docker push ${ECR_REGISTRY}/${ECR_REGISTRY_ALIAS}/nilai-${{ matrix.component }}:${IMAGE_TAG} | |
echo "## Pushed ${{ matrix.component }} Image" >> $GITHUB_STEP_SUMMARY | |
echo "- ${{ matrix.component }}: \`${ECR_REGISTRY}/${ECR_REGISTRY_ALIAS}/nilai-${{ matrix.component }}:${IMAGE_TAG}\`" >> $GITHUB_STEP_SUMMARY | |
stop-runner: | |
name: Stop self-hosted EC2 runner | |
needs: [start-runner, build-images, e2e-tests, push-images] | |
runs-on: ubuntu-24.04 | |
if: ${{ always() }} | |
steps: | |
- name: Configure AWS credentials | |
uses: aws-actions/configure-aws-credentials@v1 | |
with: | |
aws-access-key-id: ${{ secrets.GH_AWS_ACCESS_KEY }} | |
aws-secret-access-key: ${{ secrets.GH_AWS_SECRET_KEY }} | |
aws-region: "eu-west-1" | |
- name: Stop EC2 runner | |
uses: NillionNetwork/[email protected] | |
with: | |
mode: stop | |
github-token: ${{ secrets.GH_PERSONAL_ACCESS_TOKEN }} | |
label: ${{ needs.start-runner.outputs.label }} | |
ec2-instances-ids: ${{ needs.start-runner.outputs.ec2-instances-ids }} |