2 changes: 2 additions & 0 deletions .gitignore
@@ -170,3 +170,5 @@ grafana/runtime-data/*

prometheus/data/*
!prometheus/data/.gitkeep

*.swp
2 changes: 1 addition & 1 deletion caddy/Caddyfile
@@ -4,7 +4,7 @@
}
}

https://nilai.sandbox.nilogy.xyz {
https://npw.tee.nilai.sandbox.nilogy.xyz {
import ssl_config
reverse_proxy api:8443
}
17 changes: 17 additions & 0 deletions docker-compose.yml
@@ -102,6 +102,19 @@ services:
retries: 3
start_period: 10s

gpuverifier:
container_name: gpuverifier-api
build:
context: .
dockerfile: docker/verifier.Dockerfile
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/"]
interval: 10s
timeout: 5s
retries: 3
start_period: 5s
networks:
- backend_net
api:
container_name: nilai-api
build:
@@ -113,6 +126,8 @@ services:
condition: service_healthy
postgres:
condition: service_healthy
gpuverifier:
condition: service_healthy
deploy:
resources:
reservations:
@@ -124,6 +139,8 @@
networks:
- frontend_net
- backend_net
environment:
- GPUVERIFIER_API=http://gpuverifier:8000
env_file:
- .env
healthcheck:
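
Note: the new `GPUVERIFIER_API` variable is how the `api` container finds the verifier over `backend_net`. A minimal sketch of consuming it from the API side, assuming a `requests`-style client and the `/` endpoint that the compose healthcheck already curls; the actual nilai-api code is not part of this diff:

```python
import os
import requests  # assumed HTTP client; the real API code may use a different one

# Base URL comes from the GPUVERIFIER_API variable injected in docker-compose.yml.
GPUVERIFIER_API = os.environ.get("GPUVERIFIER_API", "http://gpuverifier:8000")

def verifier_is_up() -> bool:
    """Mirror the compose healthcheck: `curl -f http://gpuverifier:8000/`."""
    try:
        return requests.get(f"{GPUVERIFIER_API}/", timeout=5).ok
    except requests.RequestException:
        return False
```
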
3 changes: 2 additions & 1 deletion docker/api.Dockerfile
@@ -18,9 +18,10 @@ apt-get install build-essential curl -y && \
apt-get clean && \
apt-get autoremove && \
rm -rf /var/lib/apt/lists/* && \
pip install uv && \
pip install --upgrade uv && \
uv sync


EXPOSE 8080 8443

CMD ["./launch.sh"]
1 change: 1 addition & 0 deletions docker/compose/docker-compose.deepseek-14b-gpu.yml
@@ -31,6 +31,7 @@ services:
- ETCD_HOST=etcd
- ETCD_PORT=2379
- TOOL_SUPPORT=false
- MODEL_ROLE=reasoning
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
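
Note: `MODEL_ROLE` joins the existing `SVC_HOST`/`SVC_PORT`/`ETCD_HOST`/`ETCD_PORT` variables, which suggests each model container advertises itself in etcd with a role tag (`reasoning`, `generation`, or `worker`). A hedged sketch of what that registration could look like, assuming the `python-etcd3` client and a hypothetical key layout; the actual registration code is not shown in this diff:

```python
import os
import etcd3  # assumed client library; not confirmed by this diff

# Values mirror the environment block added in the compose files.
host = os.environ["SVC_HOST"]                       # e.g. "deepseek_14b_gpu"
port = os.environ["SVC_PORT"]                       # e.g. "8000"
role = os.environ.get("MODEL_ROLE", "generation")   # new in this PR

etcd = etcd3.client(host=os.environ["ETCD_HOST"], port=int(os.environ["ETCD_PORT"]))

# Hypothetical key layout: one key per service, role stored alongside the address,
# kept alive with a lease so dead containers drop out of discovery.
lease = etcd.lease(ttl=30)
etcd.put(f"/models/{host}", f"http://{host}:{port};role={role}", lease=lease)
```
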
54 changes: 54 additions & 0 deletions docker/compose/docker-compose.deepseek-14b-tools.yml
@@ -0,0 +1,54 @@
services:
deepseek_14b_gpu:
build:
context: .
dockerfile: docker/vllm.Dockerfile
deploy:
resources:
reservations:
devices:
- capabilities:
- gpu
driver: nvidia
ipc: host
depends_on:
etcd:
condition: service_healthy
watt_tool_gpu:
condition: service_healthy
command:
- --model
- deepseek-ai/DeepSeek-R1-Distill-Qwen-14B
- --max-model-len
- "10000"
- --device
- cuda
- --gpu-memory-utilization
- "0.45"
env_file:
- .env
environment:
SVC_HOST: "deepseek_14b_gpu"
SVC_PORT: "8000"
ETCD_HOST: "etcd"
ETCD_PORT: "2379"
TOOL_SUPPORT: "true"
MODEL_ROLE: "reasoning"
networks:
- backend_net
volumes:
- type: volume
source: hugging_face_models
target: /root/.cache/huggingface
volume: {}
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
retries: 3
start_period: 60s
timeout: 10s
volumes:
hugging_face_models:

networks:
backend_net:
58 changes: 58 additions & 0 deletions docker/compose/docker-compose.llama-1b-gpu.yml
@@ -0,0 +1,58 @@
services:
llama_1b_gpu:
build:
context: .
dockerfile: docker/vllm.Dockerfile
deploy:
resources:
reservations:
devices:
- capabilities:
- gpu
driver: nvidia
ipc: host
depends_on:
etcd:
condition: service_healthy
watt_tool_gpu:
condition: service_healthy
command:
- --model
- meta-llama/Llama-3.2-1B-Instruct
- --max-model-len
- "10000"
- --device
- cuda
- --gpu-memory-utilization
- "0.45"
- --enable-auto-tool-choice
- --tool-call-parser
- llama3_json
- --chat-template
- /tmp/tool_chat_template.jinja
env_file:
- .env
environment:
SVC_HOST: "llama_1b_gpu"
SVC_PORT: "8000"
ETCD_HOST: "etcd"
ETCD_PORT: "2379"
TOOL_SUPPORT: "true"
MODEL_ROLE: "generation"
networks:
- backend_net
volumes:
- type: volume
source: hugging_face_models
target: /root/.cache/huggingface
volume: {}
- type: bind
source: $PWD/docker/compose/tool_chat_template_llama3.2_json.jinja
target: /tmp/tool_chat_template.jinja
bind:
create_host_path: true
volumes:
hugging_face_models:

networks:
backend_net:
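
Note: the `--enable-auto-tool-choice`, `--tool-call-parser llama3_json`, and `--chat-template` flags enable tool calling on vLLM's OpenAI-compatible API. A quick way to exercise it against this container, assuming port 8000 is reachable from the caller and using a hypothetical `get_weather` tool for illustration:

```python
from openai import OpenAI  # vLLM serves an OpenAI-compatible API on port 8000

# Adjust base_url to however the service is reachable in your setup.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="unused")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical tool, for illustration only
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

resp = client.chat.completions.create(
    model="meta-llama/Llama-3.2-1B-Instruct",
    messages=[{"role": "user", "content": "What's the weather in Lisbon?"}],
    tools=tools,
    tool_choice="auto",  # --enable-auto-tool-choice lets the server decide when to call
)
print(resp.choices[0].message.tool_calls)
```
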
3 changes: 2 additions & 1 deletion docker/compose/docker-compose.llama-3b-gpu.yml
@@ -33,6 +33,7 @@ services:
- ETCD_HOST=etcd
- ETCD_PORT=2379
- TOOL_SUPPORT=true
- MODEL_ROLE=generation
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
@@ -47,4 +48,4 @@ volumes:
hugging_face_models:

networks:
backend_net:
backend_net:
3 changes: 2 additions & 1 deletion docker/compose/docker-compose.llama-8b-gpu.yml
@@ -33,6 +33,7 @@ services:
- ETCD_HOST=etcd
- ETCD_PORT=2379
- TOOL_SUPPORT=true
- MODEL_ROLE=generation
volumes:
- hugging_face_models:/root/.cache/huggingface # cache models
networks:
@@ -47,4 +48,4 @@ volumes:
hugging_face_models:

networks:
backend_net:
backend_net:
62 changes: 62 additions & 0 deletions docker/compose/docker-compose.watt-8b-gpu.yml
@@ -0,0 +1,62 @@
services:
watt_tool_gpu:
build:
context: .
dockerfile: docker/vllm.Dockerfile
deploy:
resources:
reservations:
devices:
- capabilities:
- gpu
driver: nvidia
ipc: host
depends_on:
etcd:
condition: service_healthy
command:
- --model
- watt-ai/watt-tool-8B
- --max-model-len
- "10000"
- --device
- cuda
- --gpu-memory-utilization
- "0.45"
- --enable-auto-tool-choice
- --tool-call-parser
- llama3_json
- --chat-template
- /tmp/tool_chat_template.jinja
env_file:
- .env
environment:
SVC_HOST: "watt_tool_gpu"
SVC_PORT: "8000"
ETCD_HOST: "etcd"
ETCD_PORT: "2379"
TOOL_SUPPORT: "true"
MODEL_ROLE: "worker"
networks:
- backend_net
volumes:
- type: volume
source: hugging_face_models
target: /root/.cache/huggingface
volume: {}
- type: bind
source: $PWD/docker/compose/tool_chat_template_llama3.1_json.jinja
target: /tmp/tool_chat_template.jinja
bind:
create_host_path: true
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
interval: 30s
retries: 4
start_period: 180s
timeout: 10s
volumes:
hugging_face_models:

networks:
backend_net: