diff --git a/comps/llms/README.md b/comps/llms/README.md
index 15c7c366c..584f2ba12 100644
--- a/comps/llms/README.md
+++ b/comps/llms/README.md
@@ -32,7 +32,7 @@ docker run -p 8008:80 -v ./data:/data --name tgi_service --shm-size 1g ghcr.io/h
 
 ```bash
 export HUGGINGFACEHUB_API_TOKEN=${your_hf_api_token}
-docker run -it --name vllm_service -p 8008:80 -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -v ./data:/data vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model ${your_hf_llm_model} --port 80"
+docker run -it --name vllm_service -p 8008:80 -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -v ./data:/data opea/vllm:cpu /bin/bash -c "cd / && export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --model ${your_hf_llm_model} --port 80"
 ```
 
 ## 1.2.3 Start Ray Service
diff --git a/comps/llms/text-generation/vllm-ray/build_docker_vllmray.sh b/comps/llms/text-generation/vllm-ray/build_docker_vllmray.sh
index 9e9fe3b71..8c4c13d3b 100755
--- a/comps/llms/text-generation/vllm-ray/build_docker_vllmray.sh
+++ b/comps/llms/text-generation/vllm-ray/build_docker_vllmray.sh
@@ -5,7 +5,7 @@ cd ../../../../
 
 docker build \
     -f comps/llms/text-generation/vllm-ray/docker/Dockerfile.vllmray \
-    -t vllm_ray:habana \
+    -t opea/vllm_ray:habana \
     --network=host \
     --build-arg http_proxy=${http_proxy} \
    --build-arg https_proxy=${https_proxy} \
diff --git a/comps/llms/text-generation/vllm-ray/docker_compose_llm.yaml b/comps/llms/text-generation/vllm-ray/docker_compose_llm.yaml
index a3ae3ec04..76d3423f1 100644
--- a/comps/llms/text-generation/vllm-ray/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm-ray/docker_compose_llm.yaml
@@ -5,7 +5,7 @@ version: "3.8"
 
 services:
   vllm-ray-service:
-    image: vllm_ray:habana
+    image: opea/vllm_ray:habana
     container_name: vllm-ray-gaudi-server
     ports:
       - "8006:8000"
diff --git a/comps/llms/text-generation/vllm-ray/launch_vllmray.sh b/comps/llms/text-generation/vllm-ray/launch_vllmray.sh
index fcff33265..895e6a066 100755
--- a/comps/llms/text-generation/vllm-ray/launch_vllmray.sh
+++ b/comps/llms/text-generation/vllm-ray/launch_vllmray.sh
@@ -39,5 +39,5 @@ docker run -d --rm \
     -e HTTPS_PROXY=$https_proxy \
     -e HTTP_PROXY=$https_proxy \
     -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
-    vllm_ray:habana \
+    opea/vllm_ray:habana \
     /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $model_name --tensor_parallel_size $parallel_number --enforce_eager $enforce_eager"
diff --git a/comps/llms/text-generation/vllm-ray/requirements.txt b/comps/llms/text-generation/vllm-ray/requirements.txt
index 2e8b8c578..083a2910b 100644
--- a/comps/llms/text-generation/vllm-ray/requirements.txt
+++ b/comps/llms/text-generation/vllm-ray/requirements.txt
@@ -11,7 +11,7 @@ opentelemetry-exporter-otlp
 opentelemetry-sdk
 prometheus-fastapi-instrumentator
 ray[serve]>=2.10
-setuptools==69.5.1
+setuptools
 shortuuid
 transformers
 uvicorn
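The compose file and launch script above now reference the renamed `opea/vllm_ray:habana` tag, so that image must exist locally under the new name. A minimal sketch of a pre-flight check, not part of this patch, assuming the build script above has already been run on the same host:

```bash
# Minimal check (not part of this patch): confirm the retagged image is present
# locally before starting docker_compose_llm.yaml or launch_vllmray.sh.
docker images opea/vllm_ray:habana
```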
diff --git a/comps/llms/text-generation/vllm/README.md b/comps/llms/text-generation/vllm/README.md
index 1445d1bd1..3f0184ed9 100644
--- a/comps/llms/text-generation/vllm/README.md
+++ b/comps/llms/text-generation/vllm/README.md
@@ -50,6 +50,12 @@ bash ./build_docker_vllm.sh hpu
 
 Set `hw_mode` to `hpu`.
 
+Note: If you want to enable tensor parallelism, set `setuptools==69.5.1` in Dockerfile.hpu before building the Docker image, using the following command:
+
+```
+sed -i "s/RUN pip install setuptools/RUN pip install setuptools==69.5.1/g" docker/Dockerfile.hpu
+```
+
 #### Launch vLLM service on single node
 
 For small model, we can just use single node.
diff --git a/comps/llms/text-generation/vllm/build_docker_vllm.sh b/comps/llms/text-generation/vllm/build_docker_vllm.sh
index 3680f076c..c1037a5c7 100644
--- a/comps/llms/text-generation/vllm/build_docker_vllm.sh
+++ b/comps/llms/text-generation/vllm/build_docker_vllm.sh
@@ -30,9 +30,9 @@ fi
 
 # Build the docker image for vLLM based on the hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker build -f docker/Dockerfile.hpu -t vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    docker build -f docker/Dockerfile.hpu -t opea/vllm:hpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
 else
     git clone https://github.com/vllm-project/vllm.git
     cd ./vllm/
-    docker build -f Dockerfile.cpu -t vllm:cpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
+    docker build -f Dockerfile.cpu -t opea/vllm:cpu --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
 fi
diff --git a/comps/llms/text-generation/vllm/docker/Dockerfile.hpu b/comps/llms/text-generation/vllm/docker/Dockerfile.hpu
index c7093d4c0..af5d70852 100644
--- a/comps/llms/text-generation/vllm/docker/Dockerfile.hpu
+++ b/comps/llms/text-generation/vllm/docker/Dockerfile.hpu
@@ -9,7 +9,7 @@ RUN pip install --upgrade-strategy eager optimum[habana]
 
 RUN pip install -v git+https://github.com/HabanaAI/vllm-fork.git@cf6952d
 
-RUN pip install setuptools==69.5.1
+RUN pip install setuptools
 
 RUN sed -i 's/#PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
     service ssh restart
diff --git a/comps/llms/text-generation/vllm/docker_compose_llm.yaml b/comps/llms/text-generation/vllm/docker_compose_llm.yaml
index 818fdf54a..205c9293a 100644
--- a/comps/llms/text-generation/vllm/docker_compose_llm.yaml
+++ b/comps/llms/text-generation/vllm/docker_compose_llm.yaml
@@ -5,7 +5,7 @@ version: "3.8"
 
 services:
   vllm-service:
-    image: vllm:hpu
+    image: opea/vllm:hpu
     container_name: vllm-gaudi-server
     ports:
       - "8008:80"
diff --git a/comps/llms/text-generation/vllm/launch_vllm_service.sh b/comps/llms/text-generation/vllm/launch_vllm_service.sh
index 0b225023c..0c7ed90de 100644
--- a/comps/llms/text-generation/vllm/launch_vllm_service.sh
+++ b/comps/llms/text-generation/vllm/launch_vllm_service.sh
@@ -38,7 +38,7 @@ volume=$PWD/data
 
 # Build the Docker run command based on hardware mode
 if [ "$hw_mode" = "hpu" ]; then
-    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
+    docker run -d --rm --runtime=habana --name="vllm-service" -p $port_number:80 -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} opea/vllm:hpu /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $model_name --tensor-parallel-size $parallel_number --host 0.0.0.0 --port 80 --block-size $block_size --max-num-seqs $max_num_seqs --max-seq_len-to-capture $max_seq_len_to_capture "
 else
-    docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 vllm:cpu --model $model_name --host 0.0.0.0 --port 80
+    docker run -d --rm --name="vllm-service" -p $port_number:80 --network=host -v $volume:/data -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} -e VLLM_CPU_KVCACHE_SPACE=40 opea/vllm:cpu --model $model_name --host 0.0.0.0 --port 80
 fi
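Once the renamed `opea/vllm:hpu` (or `opea/vllm:cpu`) container started by `launch_vllm_service.sh` is up, it serves vLLM's OpenAI-compatible API on the published port. A rough smoke test, not part of this patch, assuming the default host port 8008 and that `$model_name` holds the same model ID the server was launched with:

```bash
# Sketch of a smoke test (not part of this patch). Assumes the service is
# published on localhost:8008 and $model_name matches the launched model.
curl http://localhost:8008/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "'"$model_name"'",
        "prompt": "What is deep learning?",
        "max_tokens": 32,
        "temperature": 0
      }'
```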
diff --git a/tests/test_llms_text-generation_vllm-ray.sh b/tests/test_llms_text-generation_vllm-ray.sh
index 8ecb487e9..7ab235a93 100644
--- a/tests/test_llms_text-generation_vllm-ray.sh
+++ b/tests/test_llms_text-generation_vllm-ray.sh
@@ -12,7 +12,7 @@ function build_docker_images() {
     cd $WORKPATH
     docker build \
         -f comps/llms/text-generation/vllm-ray/docker/Dockerfile.vllmray \
-        -t vllm_ray:habana --network=host .
+        -t opea/vllm_ray:habana --network=host .
 
     ## Build OPEA microservice docker
     cd $WORKPATH
@@ -34,7 +34,7 @@ function start_service() {
         --ipc=host \
         -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN \
         -p $port_number:8000 \
-        vllm_ray:habana \
+        opea/vllm_ray:habana \
         /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL --tensor_parallel_size 2 --enforce_eager False"
 
     export vLLM_RAY_ENDPOINT="http://${ip_address}:${port_number}"
diff --git a/tests/test_llms_text-generation_vllm.sh b/tests/test_llms_text-generation_vllm.sh
index c5e7faa4b..48bee9ae8 100644
--- a/tests/test_llms_text-generation_vllm.sh
+++ b/tests/test_llms_text-generation_vllm.sh
@@ -12,7 +12,7 @@ function build_docker_images() {
     cd $WORKPATH/comps/llms/text-generation/vllm
     docker build \
         -f docker/Dockerfile.hpu \
-        -t vllm:hpu \
+        -t opea/vllm:hpu \
         --shm-size=128g .
 
     ## Build OPEA microservice docker
@@ -35,7 +35,7 @@ function start_service() {
         --cap-add=sys_nice \
         --ipc=host \
         -e HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN} \
-        vllm:hpu \
+        opea/vllm:hpu \
         /bin/bash -c "export VLLM_CPU_KVCACHE_SPACE=40 && python3 -m vllm.entrypoints.openai.api_server --enforce-eager --model $LLM_MODEL --tensor-parallel-size 1 --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048"
 
     export vLLM_ENDPOINT="http://${ip_address}:${port_number}"
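Since this patch is largely a mechanical retag of the vLLM images into the `opea/` namespace, a quick grep can confirm no un-prefixed references were missed. A hypothetical follow-up check, not part of this diff; the path list is an assumption and should be run from the repository root:

```bash
# Hypothetical check (not part of this diff): any output here is a leftover
# reference to the old, un-namespaced image tags and should be retagged too.
grep -rnE 'vllm(_ray)?:(hpu|cpu|habana)' comps/ tests/ | grep -v 'opea/'
```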