🚀 | This serverless worker utilizes vLLM behind the scenes and is integrated into RunPod's serverless environment. It supports dynamic auto-scaling using the built-in RunPod autoscaling feature.

Deploy Blazing-fast LLMs powered by [vLLM](https://github.com/vllm-project/vllm) on RunPod Serverless in a few clicks.
### Worker vLLM 0.2.0 - What's New

- You no longer need a Linux-based machine or NVIDIA GPUs to build the worker.
- Over 3x lighter Docker image size.
- OpenAI Chat Completion output format (optional to use).
- Extremely fast image build time.
- Docker Secrets-protected Hugging Face token support for building the image with a model baked in without exposing your token.
- Support for the `n` and `best_of` sampling parameters, which allow you to generate multiple responses from a single prompt.
- New environment variables for various configuration options.
- vLLM Version: 0.2.7

## Table of Contents
- [Setting up the Serverless Worker](#setting-up-the-serverless-worker)
  - [Option 1: Deploy Any Model Using Pre-Built Docker Image [RECOMMENDED]](#option-1-deploy-any-model-using-pre-built-docker-image-recommended)
    - [Prerequisites](#prerequisites)
    - [Environment Variables](#environment-variables)
  - [Option 2: Build Docker Image with Model Inside](#option-2-build-docker-image-with-model-inside)
## Setting up the Serverless Worker
### Option 1: Deploy Any Model Using Pre-Built Docker Image [RECOMMENDED]
We now offer a pre-built Docker Image for the vLLM Worker that you can configure entirely with Environment Variables when creating the RunPod Serverless Endpoint:
<div align="center">

Stable Image: ```runpod/worker-vllm:0.2.0```

Development Image: ```runpod/worker-vllm:dev```

</div>
#### Environment Variables
**Required**:
- `MODEL_NAME`: Hugging Face Model Repository (e.g., `openchat/openchat-3.5-1210`).

**Optional**:
- Model Settings:
  - `MAX_MODEL_LENGTH`: Maximum number of tokens the engine can handle (default: the maximum supported by the model).
  - `MODEL_BASE_PATH`: Model storage directory (default: `/runpod-volume`).
  - `LOAD_FORMAT`: Format to load the model in (default: `auto`).
  - `HF_TOKEN`: Hugging Face token for private and gated models (e.g., Llama, Falcon).
  - `QUANTIZATION`: AWQ (`awq`), SqueezeLLM (`squeezellm`), or GPTQ (`gptq`) quantization. The specified model repository must contain a quantized model (default: `None`).
  - `TRUST_REMOTE_CODE`: Trust remote code for Hugging Face models (default: `0`).
- Tensor Parallelism:

  Note that the more GPUs you split a model's weights across, the slower inference will be due to inter-GPU communication overhead. If you can fit the model on a single GPU, it is recommended to do so.
  - `USE_TENSOR_PARALLEL`: Enable (`1`) or disable (`0`) Tensor Parallelism (default: `0`).
  - `TENSOR_PARALLEL_SIZE`: Number of GPUs to shard the model across (default: `1`).
  - `MAX_PARALLEL_LOADING_WORKERS`: Maximum number of parallel workers for loading models (default: number of available CPU cores).
- Serverless Settings:
  - `MAX_CONCURRENCY`: Maximum number of concurrent requests (default: `100`).
  - `DEFAULT_BATCH_SIZE`: Token streaming batch size (default: `30`). Batching reduces the number of HTTP calls, increasing streaming speed 8-10x compared to unbatched streaming and matching non-streaming performance.
  - `ALLOW_OPENAI_FORMAT`: Whether to allow users to specify `use_openai_format` to get output in OpenAI format (default: `1`).
  - `DISABLE_LOG_STATS`: Enable (`0`) or disable (`1`) vLLM stats logging.
  - `DISABLE_LOG_REQUESTS`: Enable (`0`) or disable (`1`) request logging.
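
For illustration, an endpoint configuration for the pre-built image might look like the following. All values are placeholders, not recommendations; only `MODEL_NAME` is required.

```bash
# Illustrative endpoint environment variables (placeholder values).
MODEL_NAME=openchat/openchat-3.5-1210
MAX_MODEL_LENGTH=8192                  # cap the context length the engine accepts
HF_TOKEN=hf_xxxxxxxxxxxxxxxx           # only needed for private or gated models
USE_TENSOR_PARALLEL=0                  # keep the model on a single GPU when it fits
MAX_CONCURRENCY=100
DEFAULT_BATCH_SIZE=30
```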
### Option 2: Build Docker Image with Model Inside
To build an image with the model baked in, you must specify the following Docker build arguments when building the image.
#### Prerequisites
- RunPod Account
- Docker
#### Arguments:
- **Required**
  - `MODEL_NAME`
- **Optional**
  - `MODEL_BASE_PATH`: Defaults to `/runpod-volume` for network storage. Use `/models` for local container storage.
  - `QUANTIZATION`
  - `WORKER_CUDA_VERSION`: `11.8.0` or `12.1.0` (default: `11.8.0`, since a small number of workers do not yet support CUDA 12.1; `12.1.0` is recommended for optimal performance).

For the remaining settings, you may apply them as environment variables when running the container, as shown below. Supported environment variables are listed in the [Environment Variables](#environment-variables) section.
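
For example, if you were to run the built image locally with Docker, those settings could be passed with `-e` flags; the image tag below is a placeholder for whatever you tagged your build as:

```bash
# Minimal sketch: override a couple of optional settings at container runtime.
# "yourname/worker-vllm-openchat:latest" is a placeholder image tag.
docker run --gpus all \
  -e MAX_CONCURRENCY=50 \
  -e DISABLE_LOG_REQUESTS=1 \
  yourname/worker-vllm-openchat:latest
```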
#### Example: Building an image with OpenChat-3.5
1. Enable Docker BuildKit:
   ```bash
   export DOCKER_BUILDKIT=1
   ```
2. Export your Hugging Face token as an environment variable
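
As a rough sketch of the remaining steps, assuming the Dockerfile reads the token from a Docker BuildKit secret named `HF_TOKEN` (the image tag and token value are placeholders), the export and build might look like this:

```bash
# Export your Hugging Face token (placeholder value).
export HF_TOKEN=hf_xxxxxxxxxxxxxxxx

# Build the image with the model baked in. MODEL_NAME and WORKER_CUDA_VERSION are
# the build arguments documented above; the secret id "HF_TOKEN" is an assumption
# about the Dockerfile, and sourcing a secret from an environment variable
# (env=...) requires a recent Docker/BuildKit version.
docker build -t yourname/worker-vllm-openchat:latest \
  --build-arg MODEL_NAME=openchat/openchat-3.5-1210 \
  --build-arg WORKER_CUDA_VERSION=12.1.0 \
  --secret id=HF_TOKEN,env=HF_TOKEN \
  .
```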
Ensure that you have Docker installed and properly set up before running the docker build commands. Once built, you can deploy this serverless worker in your desired environment with confidence that it will automatically scale based on demand. For further inquiries or assistance, feel free to contact our support team.
## Usage
You may either use a `prompt` or a list of `messages` as input. If you use `messages`, the model's chat template will be applied automatically.

| Argument | Type | Default | Description |
|----------|------|---------|-------------|
| `prompt` | str | | Prompt string to generate text based on. |
| `messages` | list[dict[str, str]] | | List of messages, which will automatically have the model's chat template applied. Overrides `prompt`. |
| `use_openai_format` | bool | False | Whether to return output in OpenAI format. The `ALLOW_OPENAI_FORMAT` environment variable must be `1`, the input must be a `messages` list, and `stream` must be enabled. |
| `apply_chat_template` | bool | False | Whether to apply the model's chat template to the `prompt`. |
| `sampling_params` | dict | {} | Sampling parameters to control the generation, like temperature, top_p, etc. |
| `stream` | bool | False | Whether to enable streaming of output. If True, responses are streamed as they are generated. |
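
As a concrete illustration of this input schema, a request to the endpoint might look like the following, assuming the standard RunPod endpoint API; the endpoint ID, API key, prompt, and sampling values are placeholders:

```bash
# Placeholder endpoint ID and API key; the payload follows the input table above.
curl -X POST "https://api.runpod.ai/v2/<endpoint_id>/runsync" \
  -H "Authorization: Bearer <your_runpod_api_key>" \
  -H "Content-Type: application/json" \
  -d '{
        "input": {
          "prompt": "Why is RunPod a good fit for serverless LLM inference?",
          "sampling_params": {"max_tokens": 100, "temperature": 0.7, "n": 2},
          "stream": false
        }
      }'
```

Swapping `prompt` for a `messages` list and setting `use_openai_format` and `stream` to true would return output in OpenAI format instead, provided the `ALLOW_OPENAI_FORMAT` environment variable is `1`.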