2 changes: 2 additions & 0 deletions .gitignore
@@ -170,3 +170,5 @@ grafana/runtime-data/*

prometheus/data/*
!prometheus/data/.gitkeep

*.swp
2 changes: 1 addition & 1 deletion caddy/Caddyfile
@@ -4,7 +4,7 @@
}
}

https://nilai.sandbox.nilogy.xyz {
https://npw.tee.nilai.sandbox.nilogy.xyz {
import ssl_config
reverse_proxy api:8443
}
17 changes: 17 additions & 0 deletions docker-compose.yml
@@ -102,6 +102,19 @@ services:
retries: 3
start_period: 10s

gpuverifier:
container_name: gpuverifier-api
build:
context: .
dockerfile: docker/verifier.Dockerfile
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/"]
interval: 10s
timeout: 5s
retries: 3
start_period: 5s
networks:
- backend_net
api:
container_name: nilai-api
build:
@@ -113,6 +126,8 @@ services:
condition: service_healthy
postgres:
condition: service_healthy
gpuverifier:
condition: service_healthy
deploy:
resources:
reservations:
@@ -124,6 +139,8 @@
networks:
- frontend_net
- backend_net
environment:
- GPUVERIFIER_API=http://gpuverifier:8000
env_file:
- .env
healthcheck:
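
Note: the new `GPUVERIFIER_API` variable is how the `api` container finds the verifier over `backend_net`. A minimal sketch of consuming it from the API side, assuming a `requests`-style client and the `/` endpoint that the compose healthcheck already curls; the actual nilai-api code is not part of this diff:

```python
import os
import requests  # assumed HTTP client; the real API code may use a different one

# Base URL comes from the GPUVERIFIER_API variable injected in docker-compose.yml.
GPUVERIFIER_API = os.environ.get("GPUVERIFIER_API", "http://gpuverifier:8000")

def verifier_is_up() -> bool:
    """Mirror the compose healthcheck: `curl -f http://gpuverifier:8000/`."""
    try:
        return requests.get(f"{GPUVERIFIER_API}/", timeout=5).ok
    except requests.RequestException:
        return False
```
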
3 changes: 2 additions & 1 deletion docker/api.Dockerfile
@@ -18,9 +18,10 @@ apt-get install build-essential curl -y && \
apt-get clean && \
apt-get autoremove && \
rm -rf /var/lib/apt/lists/* && \
pip install uv && \
pip install --upgrade uv && \
uv sync


EXPOSE 8080 8443

CMD ["./launch.sh"]
1 change: 1 addition & 0 deletions docker/compose/docker-compose.deepseek-14b-gpu.yml
@@ -31,6 +31,7 @@ services:
- ETCD_HOST=etcd
- ETCD_PORT=2379
- TOOL_SUPPORT=false
- MODEL_ROLE=reasoning
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
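
Note: `MODEL_ROLE` joins the existing `SVC_HOST`/`SVC_PORT`/`ETCD_HOST`/`ETCD_PORT` variables, which suggests each model container advertises itself in etcd with a role tag (`reasoning`, `generation`, or `worker`). A hedged sketch of what that registration could look like, assuming the `python-etcd3` client and a hypothetical key layout; the actual registration code is not shown in this diff:

```python
import os
import etcd3  # assumed client library; not confirmed by this diff

# Values mirror the environment block added in the compose files.
host = os.environ["SVC_HOST"]                       # e.g. "deepseek_14b_gpu"
port = os.environ["SVC_PORT"]                       # e.g. "8000"
role = os.environ.get("MODEL_ROLE", "generation")   # new in this PR

etcd = etcd3.client(host=os.environ["ETCD_HOST"], port=int(os.environ["ETCD_PORT"]))

# Hypothetical key layout: one key per service, role stored alongside the address,
# kept alive with a lease so dead containers drop out of discovery.
lease = etcd.lease(ttl=30)
etcd.put(f"/models/{host}", f"http://{host}:{port};role={role}", lease=lease)
```
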
54 changes: 54 additions & 0 deletions docker/compose/docker-compose.deepseek-14b-tools.yml
@@ -0,0 +1,54 @@
services:
deepseek_14b_gpu:
build:
context: .
dockerfile: docker/vllm.Dockerfile
deploy:
resources:
reservations:
devices:
- capabilities:
- gpu
driver: nvidia
ipc: host
depends_on:
etcd:
condition: service_healthy
watt_tool_gpu:
condition: service_healthy
command:
- --model
- deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
- --max-model-len
- "10000"
- --device
- cuda
- --gpu-memory-utilization
- "0.45"
env_file:
- .env
environment:
SVC_HOST: "deepseek_14b_gpu"
SVC_PORT: "8000"
ETCD_HOST: "etcd"
ETCD_PORT: "2379"
TOOL_SUPPORT: "true"
MODEL_ROLE: "reasoning"
networks:
- backend_net
volumes:
- type: volume
source: hugging_face_models
target: /root/.cache/huggingface
volume: {}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
retries: 3
start_period: 60s
timeout: 10s
volumes:
hugging_face_models:

networks:
backend_net:
58 changes: 58 additions & 0 deletions docker/compose/docker-compose.llama-1b-gpu.yml
@@ -0,0 +1,58 @@
services:
llama_1b_gpu:
build:
context: .
dockerfile: docker/vllm.Dockerfile
deploy:
resources:
reservations:
devices:
- capabilities:
- gpu
driver: nvidia
ipc: host
depends_on:
etcd:
condition: service_healthy
watt_tool_gpu:
condition: service_healthy
command:
- --model
- meta-llama/Llama-3.2-1B-Instruct
- --max-model-len
- "10000"
- --device
- cuda
- --gpu-memory-utilization
- "0.45"
- --enable-auto-tool-choice
- --tool-call-parser
- llama3_json
- --chat-template
- /tmp/tool_chat_template.jinja
env_file:
- .env
environment:
SVC_HOST: "llama_1b_gpu"
SVC_PORT: "8000"
ETCD_HOST: "etcd"
ETCD_PORT: "2379"
TOOL_SUPPORT: "true"
MODEL_ROLE: "generation"
networks:
- backend_net
volumes:
- type: volume
source: hugging_face_models
target: /root/.cache/huggingface
volume: {}
- type: bind
source: $PWD/docker/compose/tool_chat_template_llama3.2_json.jinja
target: /tmp/tool_chat_template.jinja
bind:
create_host_path: true
volumes:
hugging_face_models:

networks:
backend_net:
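
Note: the `--enable-auto-tool-choice`, `--tool-call-parser llama3_json`, and `--chat-template` flags enable tool calling on vLLM's OpenAI-compatible API. A quick way to exercise it against this container, assuming port 8000 is reachable from the caller and using a hypothetical `get_weather` tool for illustration:

```python
from openai import OpenAI  # vLLM serves an OpenAI-compatible API on port 8000

# Adjust base_url to however the service is reachable in your setup.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool, for illustration only
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

resp = client.chat.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",
    messages=[{"role": "user", "content": "What's the weather in Lisbon?"}],
    tools=tools,
    tool_choice="auto",  # --enable-auto-tool-choice lets the server decide when to call
)
print(resp.choices[0].message.tool_calls)
```
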
3 changes: 2 additions & 1 deletion docker/compose/docker-compose.llama-3b-gpu.yml
@@ -33,6 +33,7 @@ services:
- ETCD_HOST=etcd
- ETCD_PORT=2379
- TOOL_SUPPORT=true
- MODEL_ROLE=generation
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
@@ -47,4 +48,4 @@ volumes:
hugging_face_models:

networks:
backend_net:
backend_net:
3 changes: 2 additions & 1 deletion docker/compose/docker-compose.llama-8b-gpu.yml
@@ -33,6 +33,7 @@ services:
- ETCD_HOST=etcd
- ETCD_PORT=2379
- TOOL_SUPPORT=true
- MODEL_ROLE=generation
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
@@ -47,4 +48,4 @@ volumes:
hugging_face_models:

networks:
backend_net:
backend_net:
62 changes: 62 additions & 0 deletions docker/compose/docker-compose.watt-8b-gpu.yml
@@ -0,0 +1,62 @@
services:
watt_tool_gpu:
build:
context: .
dockerfile: docker/vllm.Dockerfile
deploy:
resources:
reservations:
devices:
- capabilities:
- gpu
driver: nvidia
ipc: host
depends_on:
etcd:
condition: service_healthy
command:
- --model
- watt-ai/watt-tool-8B
- --max-model-len
- "10000"
- --device
- cuda
- --gpu-memory-utilization
- "0.45"
- --enable-auto-tool-choice
- --tool-call-parser
- llama3_json
- --chat-template
- /tmp/tool_chat_template.jinja
env_file:
- .env
environment:
SVC_HOST: "watt_tool_gpu"
SVC_PORT: "8000"
ETCD_HOST: "etcd"
ETCD_PORT: "2379"
TOOL_SUPPORT: "true"
MODEL_ROLE: "worker"
networks:
- backend_net
volumes:
- type: volume
source: hugging_face_models
target: /root/.cache/huggingface
volume: {}
- type: bind
source: $PWD/docker/compose/tool_chat_template_llama3.1_json.jinja
target: /tmp/tool_chat_template.jinja
bind:
create_host_path: true
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
retries: 4
start_period: 180s
timeout: 10s
volumes:
hugging_face_models:

networks:
backend_net: