tutorials/serving_ollama_pod_template/app.py
from union.app import App, Input
from union import ImageSpec, Artifact, Resources
from flytekit.extras.accelerators import L4

# Artifact holding the model weights published by download_model.py.
MoonDreamArtifact = Artifact(name="ollama-moondream")

# Serving image: the Ollama binary is downloaded and unpacked at build time.
ollama_image = ImageSpec(
name="ollama-serve",
apt_packages=["curl"],
packages=["union-runtime>=0.1.11"],
registry="ghcr.io/unionai-oss",
commands=[
"curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz",
"tar -C /usr -xzf ollama-linux-amd64.tgz",
],
)

# Ollama server app: mounts the model artifact and serves the Ollama API on port 11434.
ollama = App(
name="ollama-serve",
inputs=[
Input(
value=MoonDreamArtifact.query(),
mount="/home/.ollama",
)
],
container_image=ollama_image,
port=11434,
args="ollama serve",
limits=Resources(cpu="8", mem="10Gi", ephemeral_storage="20Gi", gpu="1"),
accelerator=L4,
    env={
        "OLLAMA_HOST": "0.0.0.0",
        "OLLAMA_ORIGINS": "*",
        # Point Ollama at the models directory inside the mounted artifact.
        "OLLAMA_MODELS": "/home/.ollama/models",
    },
    min_replicas=0,  # scale to zero when idle
    max_replicas=1,
    scaledown_after=200,  # how long to wait before scaling down an idle replica
)


# Frontend image: Streamlit plus the ollama client library.
streamlit_image = ImageSpec(
name="streamlit-chat",
packages=["streamlit==1.41.1", "union-runtime>=0.1.11", "ollama==0.4.7"],
registry="ghcr.io/unionai-oss",
)

# Streamlit chat frontend: receives the Ollama app's URL through the OLLAMA_ENDPOINT env var.
streamlit_app = App(
name="ollama-streamlit",
inputs=[
Input(
value=ollama.query_endpoint(),
env_var="OLLAMA_ENDPOINT",
)
],
limits=Resources(cpu="2", mem="4Gi"),
container_image=streamlit_image,
port=8082,
include=["./streamlit_app.py"],
args=[
"streamlit",
"run",
"streamlit_app.py",
"--server.port",
"8082",
],
min_replicas=0,
max_replicas=1,
)
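With both apps deployed, the Ollama endpoint can be exercised directly with the same `ollama` client the Streamlit frontend uses. A minimal sketch, assuming the ollama 0.4.x Python client; the fallback URL below is a hypothetical placeholder, so substitute the endpoint Union reports for `ollama-serve`:

import os

from ollama import Client

# Hypothetical endpoint; replace with the URL reported for the ollama-serve app.
client = Client(host=os.getenv("OLLAMA_ENDPOINT", "https://ollama-serve.example.com"))

# One-shot (non-streaming) chat request against the model served above.
response = client.chat(
    model="moondream",
    messages=[{"role": "user", "content": "What is VSCode?"}],
)
print(response["message"]["content"])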
tutorials/serving_ollama_pod_template/download_model.py
from flytekit import Artifact, ImageSpec, PodTemplate, task, Resources
from typing import Annotated
from kubernetes.client.models import (
V1Container,
V1PodSpec,
V1ResourceRequirements,
V1VolumeMount,
V1Volume,
V1SecurityContext,
V1Probe,
V1HTTPGetAction,
)
from flytekit.types.directory import FlyteDirectory


# Artifact under which the pulled model weights are published.
MoonDreamArtifact = Artifact(name="ollama-moondream")

# Task image: the ollama client plus the kubernetes models used by the pod template.
image = ImageSpec(
    name="ollama-serve",
    packages=["ollama==0.4.4", "kubernetes==31.0.0"],
    registry="ghcr.io/unionai-oss",
)


# Pod template that runs an Ollama server as a sidecar next to the task container.
template = PodTemplate(
    pod_spec=V1PodSpec(
        containers=[
            # "primary" is the container flytekit runs the task code in.
            V1Container(
                name="primary",
                image=image,
                volume_mounts=[
                    V1VolumeMount(name="ollama-cache", mount_path="/root/.ollama")
                ],
                # Run as root so the task can read the model cache the sidecar writes.
                security_context=V1SecurityContext(
                    run_as_user=0,
                ),
            ),
        ],
        init_containers=[
            # restart_policy="Always" turns this init container into a native
            # Kubernetes sidecar, so the Ollama server runs for the pod's lifetime.
            V1Container(
                name="ollama",
                image="ollama/ollama:0.6.3",
                resources=V1ResourceRequirements(
                    requests={"cpu": "2", "memory": "6Gi"},
                    limits={"cpu": "2", "memory": "6Gi"},
                ),
                restart_policy="Always",
                volume_mounts=[
                    V1VolumeMount(name="ollama-cache", mount_path="/root/.ollama")
                ],
                # Hold pod startup until the Ollama API answers on port 11434.
                startup_probe=V1Probe(http_get=V1HTTPGetAction(path="/", port=11434)),
            ),
        ],
        # Shared scratch volume where the sidecar stores pulled models.
        volumes=[V1Volume(name="ollama-cache", empty_dir={})],
)
)


@task(pod_template=template, limits=Resources(cpu="1", mem="4Gi"))
def download_model(
model: str = "moondream",
) -> Annotated[FlyteDirectory, MoonDreamArtifact]:
import ollama

    # Pull through the sidecar's Ollama server; the weights land on the shared volume.
    ollama.pull(model)
    # Publish the whole .ollama directory so the serving app can mount it as-is.
    return FlyteDirectory(path="/root/.ollama")
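The pull works because `ollama.pull` defaults to the server at localhost:11434, which the sidecar exposes inside the pod; the weights land on the shared volume that the task then uploads. A minimal standalone sketch of the same check against any reachable Ollama server, assuming the ollama 0.4.x client (not part of this PR):

import ollama

# Pull the model through the local Ollama server (the sidecar, in this pod template).
ollama.pull("moondream")

# Confirm the weights are now registered with the server.
for m in ollama.list().models:
    print(m.model)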
tutorials/serving_ollama_pod_template/streamlit_app.py
import os
import streamlit as st
from ollama import Client

# Ollama client for the serving app; OLLAMA_ENDPOINT is injected by the App input.
client = Client(
    host=os.getenv("OLLAMA_ENDPOINT", "http://localhost"),
)


MODEL = "moondream"


def stream_parser(stream):
    # Yield just the text of each streamed chunk so st.write_stream can render it.
    for chunk in stream:
        yield chunk["message"]["content"]


query = st.text_input("Enter your question:", value="What is VSCode?")

if query:
    stream = client.chat(
        model=MODEL,
        messages=[{"role": "user", "content": query}],
        stream=True,
    )
    # Render the streamed reply as it arrives, reusing the helper above.
    st.write_stream(stream_parser(stream))
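The frontend above is single-turn: each question starts a fresh conversation. A hedged sketch of a multi-turn variant using Streamlit's standard chat pattern (`st.chat_input` plus `st.session_state`); this is an extension, not part of this PR:

import os

import streamlit as st
from ollama import Client

client = Client(host=os.getenv("OLLAMA_ENDPOINT", "http://localhost"))

# Keep the running conversation across Streamlit reruns.
if "messages" not in st.session_state:
    st.session_state.messages = []

if prompt := st.chat_input("Ask moondream something"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    stream = client.chat(model="moondream", messages=st.session_state.messages, stream=True)
    # st.write_stream renders chunks as they arrive and returns the full reply.
    reply = st.write_stream(chunk["message"]["content"] for chunk in stream)
    st.session_state.messages.append({"role": "assistant", "content": reply})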