Commit 1fc3fd7

feat: added vLLM
1 parent 3254de3 commit 1fc3fd7

7 files changed: +87 −39 lines changed

Dockerfile

Lines changed: 8 additions & 0 deletions

@@ -10,6 +10,14 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 
 RUN rm -rf /root/.cache/pip
 
+# Install uv for faster package management
+RUN pip install uv
+
+# Create separate venv for vLLM using uv to avoid flash-attn conflicts with Axolotl
+# Match CUDA version with base image (CUDA 12.6)
+RUN uv venv /opt/vllm-venv && \
+    uv pip install --python /opt/vllm-venv/bin/python vllm --torch-backend=cu126
+
 # Expose vLLM port (not started automatically)
 EXPOSE 8000
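The image now carries two Python environments: the base one that Axolotl installed, and `/opt/vllm-venv` holding vLLM with its own dependency stack. A minimal sketch of checking that separation after a build; the image tag is taken from the docker-compose.yml added in this commit, and `--gpus all` assumes the NVIDIA container toolkit is available:

```bash
# Confirm vLLM resolves from the dedicated venv rather than the base environment
docker run --rm --gpus all runpod/llm-finetuning-axolotl:dev \
    /opt/vllm-venv/bin/python -c "import vllm; print(vllm.__version__)"

# The base interpreter keeps Axolotl's stack; per the Dockerfile comment,
# its torch build should report CUDA 12.6
docker run --rm --gpus all runpod/llm-finetuning-axolotl:dev \
    python -c "import torch; print(torch.version.cuda)"
```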

README.md

Lines changed: 6 additions & 25 deletions

@@ -41,13 +41,10 @@ axolotl train config.yaml
 4. **Optional - Start vLLM server** (after training):
 
 ```bash
-# Option A: Using YAML config (recommended)
-cp vllm_config_example.yaml my_config.yaml
-# Edit my_config.yaml with your model path
-./start_vllm.sh my_config.yaml
-
-# Option B: Command line
-./start_vllm.sh ./outputs/lora-out --lora-modules lora_name=./outputs/lora-out
+# Create your vLLM config based on the example
+cp vllm_config_example.yaml my_vllm_config.yaml
+# Edit my_vllm_config.yaml with your trained model path and settings
+./start_vllm.sh my_vllm_config.yaml
 ```
 
 ## 🏗️ Local Development

@@ -150,30 +147,14 @@ After training, you can serve your model using the built-in vLLM server:
 
 ### Quick Start vLLM
 
-#### Option A: Using YAML Config (Recommended)
-
 ```bash
 # 1. Copy and customize the example config
 cp vllm_config_example.yaml my_vllm_config.yaml
-# Edit my_vllm_config.yaml with your model path and settings
-
-# 2. Start vLLM with config
+# 2. Edit my_vllm_config.yaml with your trained model path and settings
+# 3. Start vLLM with your config
 ./start_vllm.sh my_vllm_config.yaml
 ```
 
-#### Option B: Command Line Arguments
-
-```bash
-# For LoRA models
-./start_vllm.sh ./outputs/lora-out --lora-modules lora_name=./outputs/lora-out
-
-# For merged/full fine-tuned models
-./start_vllm.sh ./outputs/merged-model
-
-# With custom settings
-./start_vllm.sh ./outputs/my-model --max-model-len 4096 --gpu-memory-utilization 0.8
-```
-
 ### vLLM Features
 
 - **OpenAI-compatible API** at `http://localhost:8000`
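Since the server exposes the standard OpenAI-compatible routes, here is a hedged sketch of exercising it once `./start_vllm.sh` is running; the served model name depends on your config, so `my-model` below is a placeholder:

```bash
# List what the server is currently serving
curl http://localhost:8000/v1/models

# Send a chat completion; replace "my-model" with your served model name
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "my-model", "messages": [{"role": "user", "content": "Hello!"}], "max_tokens": 64}'
```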

docker-compose.yml

Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
+services:
+  llm-finetuning:
+    image: runpod/llm-finetuning-axolotl:dev
+    platform: linux/amd64
+
+    # GPU access
+    # deploy:
+    #   resources:
+    #     reservations:
+    #       devices:
+    #         - driver: nvidia
+    #           count: all
+    #           capabilities: [gpu]
+
+    # Port mapping for vLLM
+    ports:
+      - "8000:8000"  # vLLM API server
+      - "8888:8888"  # Jupyter Lab (from base image)
+      - "2222:22"    # SSH access (from base image)
+
+    # Environment variables for training configuration
+    environment:
+      # Required credentials
+      - HF_TOKEN=${HF_TOKEN}
+      # - WANDB_API_KEY=${WANDB_API_KEY}
+
+      # Training configuration (examples - customize as needed)
+      - AXOLOTL_BASE_MODEL=TinyLlama/TinyLlama_v1.1
+      - AXOLOTL_DATASETS=[{"path":"mhenrichsen/alpaca_2k_test","type":"alpaca"}]
+      - AXOLOTL_OUTPUT_DIR=./outputs/my_training
+      - AXOLOTL_ADAPTER=lora
+      - AXOLOTL_LORA_R=8
+      - AXOLOTL_LORA_ALPHA=16
+      - AXOLOTL_NUM_EPOCHS=1
+      - AXOLOTL_MICRO_BATCH_SIZE=2
+      - AXOLOTL_GRADIENT_ACCUMULATION_STEPS=1
+      - AXOLOTL_LEARNING_RATE=0.0002
+      - AXOLOTL_LOAD_IN_8BIT=true
+
+      # Optional: Disable Jupyter if not needed
+      # - JUPYTER_DISABLE=1
+
+      # Optional: SSH key for access
+      # - PUBLIC_KEY=${PUBLIC_KEY}
+
+    # Volume mounts for persistent data
+    volumes:
+      - ./outputs:/workspace/data/axolotl-artifacts
+      - ./configs:/workspace/fine-tuning/configs
+
+    # Keep container running
+    tty: true
+    stdin_open: true
+
+    # Optional: Override command for debugging
+    # command: ["sleep", "infinity"]
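For reference, a sketch of how this compose file is meant to be driven; the service name and env.example both come from this commit, while the credentials are values you supply yourself:

```bash
# Provide credentials, then bring the service up
cp env.example .env        # edit .env with your real tokens
docker compose up -d

# Watch startup, or open a shell to run training / start_vllm.sh manually
docker compose logs -f llm-finetuning
docker compose exec llm-finetuning bash
```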

env.example

Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+# Copy this file to .env and fill in your values
+# cp env.example .env
+
+# Required credentials
+HF_TOKEN=your-huggingface-token-here
+WANDB_API_KEY=your-wandb-api-key-here
+
+# Optional: SSH public key for container access
+# PUBLIC_KEY=ssh-rsa AAAAB3NzaC1yc2E... [email protected]
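docker compose reads `.env` automatically; if the image is run without compose, the same file can be passed explicitly. A hedged sketch, reusing the image tag from this commit's docker-compose.yml:

```bash
# Run the container standalone, injecting the same credentials from .env
docker run --rm --gpus all --env-file .env -p 8000:8000 \
    runpod/llm-finetuning-axolotl:dev
```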

requirements.txt

Lines changed: 1 addition & 6 deletions

@@ -1,6 +1 @@
-runpod~=1.7.0
-
-# vLLM for inference serving
-# The base image already has flash-attn from axolotl installation
-# vLLM will use the existing flash-attn if it's compatible
-vllm
+runpod~=1.7.0

scripts/WELCOME

Lines changed: 3 additions & 4 deletions

@@ -8,10 +8,9 @@ You've successfully configured your training environment! 🎉
 1️⃣ Familiarize yourself with the examples/ and outputs/ directories.
 2️⃣ Carefully review your config.yaml settings, verifying both format and values. As a best practice, ensure that all hyperparameters are tuned according to your specific use case to prevent potential errors.
 3️⃣ Start fine-tuning when you're ready with `axolotl train config.yaml`
-4️⃣ After training, serve your model with `./start_vllm.sh ./outputs/your-model`
+4️⃣ After training, serve your model with `./start_vllm.sh your_vllm_config.yaml`
 
 ────────────────────────────────────
-✨ POWERED BY AXOLOTL 🦎 + vLLM 🚀
+✨ POWERED BY AXOLOTL 🦎
 ────────────────────────────────────
-📄 Axolotl Docs: https://axolotl-ai-cloud.github.io/axolotl/docs/config.html
-🌐 vLLM Server: http://localhost:8000 (after starting)
+📄 Axolotl Docs: https://axolotl-ai-cloud.github.io/axolotl/docs/config.html

scripts/start_vllm.sh

Lines changed: 4 additions & 4 deletions

@@ -80,8 +80,8 @@ except:
 echo "🔧 Additional args: $*"
 echo ""
 
-# Start vLLM with config file
-python -m vllm.entrypoints.openai.api_server \
+# Start vLLM with config file (using dedicated venv)
+/opt/vllm-venv/bin/python -m vllm.entrypoints.openai.api_server \
     --config "$INPUT" \
     "$@"
 
@@ -95,8 +95,8 @@ else
 echo "🌐 Server will be available at: http://0.0.0.0:8000"
 echo ""
 
-# Start vLLM with the provided model and any additional arguments
-python -m vllm.entrypoints.openai.api_server \
+# Start vLLM with the provided model and any additional arguments (using dedicated venv)
+/opt/vllm-venv/bin/python -m vllm.entrypoints.openai.api_server \
     --model "$MODEL_PATH" \
     --host 0.0.0.0 \
     --port 8000 \
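With the interpreter pinned to the venv, running the wrapper is equivalent to invoking that Python directly; a sketch of the manual form plus a liveness probe (vLLM's OpenAI server exposes a `/health` endpoint):

```bash
# Equivalent to ./start_vllm.sh my_vllm_config.yaml
/opt/vllm-venv/bin/python -m vllm.entrypoints.openai.api_server \
    --config my_vllm_config.yaml

# Probe the server once it is listening
curl http://localhost:8000/health
```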
