Skip to content

Commit a9ccb35

Browse files
committed
Re-order Containerfile to improve cache hits
1 parent 0c28193 commit a9ccb35

File tree

2 files changed

+30
-19
lines changed

2 files changed

+30
-19
lines changed

Containerfile

+18-18
Original file line numberDiff line numberDiff line change
@@ -1,49 +1,49 @@
11
FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 as builder
22

3-
ENV NVIDIA_VISIBLE_DEVICES=all
4-
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
5-
6-
# If you are running something modern, reducing this to 8.6 will speed up build times slightly.
7-
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
8-
93
RUN apt-get update && \
104
apt-get install --no-install-recommends -y git ninja-build build-essential python3-dev python3-pip && \
115
rm -rf /var/lib/apt/lists/*
126

7+
RUN --mount=type=cache,target=/root/.cache/pip,Z pip3 install torch
138
RUN git clone https://github.com/qwopqwop200/GPTQ-for-LLaMa /build
149

1510
WORKDIR /build
1611

17-
# Fix until new parameter "groupsize" is figured out
18-
RUN git reset --hard 468c47c01b4fe370616747b6d69a2d3f48bab5e4
12+
ARG GPTQ_SHA=HEAD
13+
RUN git reset --hard ${GPTQ_SHA}
1914

20-
RUN --mount=type=cache,target=/root/.cache/pip,Z pip3 install torch
2115
RUN --mount=type=cache,target=/root/.cache/pip,Z pip3 install -r requirements.txt
16+
17+
ARG TORCH_CUDA_ARCH_LIST="8.6+PTX"
2218
RUN python3 setup_cuda.py bdist_wheel -d .
2319

2420
FROM ubuntu:22.04
2521

26-
ENV CLI_ARGS=""
27-
ENV NVIDIA_VISIBLE_DEVICES=all
28-
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
29-
30-
VOLUME /data
31-
VOLUME /output
32-
3322
RUN apt-get update && \
3423
apt-get install --no-install-recommends -y git python3 python3-pip && \
3524
rm -rf /var/lib/apt/lists/*
3625

26+
RUN --mount=type=cache,target=/root/.cache/pip,Z pip install torch torchvision torchaudio
27+
3728
RUN git clone https://github.com/oobabooga/text-generation-webui /app
3829

3930
WORKDIR /app
4031

41-
COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa
32+
ARG WEBUI_SHA=HEAD
33+
RUN git reset --hard ${WEBUI_SHA}
4234

43-
RUN --mount=type=cache,target=/root/.cache/pip,Z pip install torch torchvision torchaudio
4435
RUN --mount=type=cache,target=/root/.cache/pip,Z pip install -r requirements.txt
36+
37+
COPY --from=builder /build /app/repositories/GPTQ-for-LLaMa
4538
RUN --mount=type=cache,target=/root/.cache/pip,Z pip install /app/repositories/GPTQ-for-LLaMa/*.whl
4639

4740
COPY entrypoint.sh .
41+
42+
VOLUME /data
43+
VOLUME /output
44+
45+
ENV CLI_ARGS=""
46+
ENV NVIDIA_VISIBLE_DEVICES=all
47+
ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
4848
ENTRYPOINT ["/app/entrypoint.sh"]
4949
CMD python3 server.py ${CLI_ARGS}

podman-compose.yaml

+12-1
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,19 @@ version: "3.3"
22

33
services:
44
text-generation-webui:
5-
build: .
5+
build:
6+
context: .
7+
args:
8+
# Use HEAD instead of a SHA hash to build from the latest commit
9+
# The GPTQ_SHA below is the last known-good commit for older models
10+
- GPTQ_SHA=468c47c01b4fe370616747b6d69a2d3f48bab5e4
11+
- WEBUI_SHA=HEAD
12+
# If you know which specific architecture your GPU is using,
13+
# specifying the exact TORCH_CUDA_ARCH_LIST version below can
14+
# speed up build times slightly.
15+
- "TORCH_CUDA_ARCH_LIST=7.0 7.5 8.0 8.6+PTX"
616
environment:
17+
# Feel free to customize CLI_ARGS. Below is an example for running 4bit llama.
718
- CLI_ARGS=--gptq-bits 4 --auto-devices --gpu-memory 8 --listen --no-stream --listen-port 7861 --extensions llama_prompts api sd_api_pictures --cai-chat --model llama-13b
819
# May be needed in some instances with Docker on machines with selinux
920
# privileged: true

0 commit comments

Comments
 (0)