107 changes: 107 additions & 0 deletions tutorials/serving_ollama/app.py
# # Run Ollama on the Union Serving platform
#
# In this example, we use Union Serving to run Ollama and serve the
# DeepSeek-R1-Distill-Qwen-1.5B model.

# ## Caching the model
# To speed up Ollama startup, we cache the model ahead of time. This lets us
# avoid downloading the model every time the Ollama service starts, which
# significantly reduces startup time. That matters especially when deploying
# to production, where the service needs to start up quickly and reliably.

# To cache the model, we use the `download_model` task defined in
# `model_wf.py`. The task, decorated with `@task`, pulls the specified model
# with Ollama and publishes the resulting directory as an artifact that
# subsequent tasks, workflows, and apps can reference. The `container_image`
# parameter specifies the container image to run the task in. `cache=True`
# caches the task's results, and `cache_version` (here "1.0") lets us
# invalidate the cache when we change the task or want to download a different
# version of the model.
#
# The artifact is declared as `MyOllamaModel`, an `Artifact` whose
# `partition_keys` parameter partitions it on the `model` key, so each model
# name gets its own partition and different models can be cached side by side.
#
# Once the model is cached, `MyOllamaModel.query(model=...)` returns a
# reference to the cached model directory, which our application downloads and
# mounts at startup instead of pulling the model again.
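#
# For example, assuming the `union` CLI is configured, the caching workflow
# can be run remotely before deploying the app (the exact command shape is an
# assumption; adjust to your setup):
#
# ```bash
# union run --remote model_wf.py download_multiple_models --models '["llama3.1"]'
# ```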
#
# ## Defining the Application
#
# Now that the model is cached, we can define our application. We use the
# `Artifact` query to reference the cached model and define the application
# configuration: the container image, the model input, the resources the
# application requires, and the arguments used to start it.


# Import the necessary classes from the union package
import os

from union import Resources
from union.app import App, Input

import model_wf


# We now define the application configuration: the name of the application,
# the inputs it requires, the arguments used to start it, the container image,
# the port to expose, the resource limits, and the environment variables it
# needs to run. The cached model artifact is mounted at
# `/home/union/.ollama/models`, Ollama's default models directory for the
# container user, so the model is already in place when the service starts.
# Setting `OLLAMA_HOST` to `0.0.0.0` makes the server listen on all interfaces
# so the platform can route traffic to it.

MODEL = os.environ.get("MODEL", "llama3.1")

ollama_app = App(
    name="ollama-serve",
    inputs=[
        Input(
            name="my_model",
            value=model_wf.MyOllamaModel.query(model=MODEL),
            download=True,
            mount="/home/union/.ollama/models",
        )
    ],
    container_image=model_wf.image.with_packages(["union-runtime"]),
    args=[
        "ollama",
        "serve",
    ],
    limits=Resources(cpu="2", mem="8Gi", ephemeral_storage="40Gi"),
    requests=Resources(ephemeral_storage="40Gi"),
    port=11434,
    min_replicas=1,
    env={
        "UNION_SDK_LOGGING_LEVEL": "10",
        "OLLAMA_HOST": "0.0.0.0",
        "OLLAMA_ORIGINS": "*",
    },
)

# Finally, we can deploy the application. The deploy command builds the
# container image, packages the application, and deploys it to the Union
# Serving platform. Once the deployment is complete, a message indicates that
# the application has been successfully deployed.

# ```bash
# union deploy apps app.py ollama-serve
# ```
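
# Once the app is running, we can smoke-test it with Ollama's standard REST
# API. The URL below is a placeholder; substitute the endpoint reported by
# the deploy command.
#
# ```bash
# curl https://<your-app-endpoint>/api/generate \
#   -d '{"model": "llama3.1", "prompt": "Hello!", "stream": false}'
# ```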
97 changes: 97 additions & 0 deletions tutorials/serving_ollama/model_wf.py
import subprocess
import typing
from pathlib import Path
from typing import Annotated

import flytekit
from flytekit.core.artifact import Artifact, Inputs
from flytekit.types.directory import FlyteDirectory

from union import task, workflow, ImageSpec


MyOllamaModel = Artifact(name="ollama_model", partition_keys=["model"])
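# `MyOllamaModel` (declared above) is the artifact that will hold the
# downloaded model files. Partitioning on the `model` key publishes one
# artifact partition per model name, so a specific model can later be
# retrieved with `MyOllamaModel.query(model="llama3.1")`.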

# We use the `ImageSpec` class to define the container image for our
# application, built with the `union` builder. The build commands install the
# Ollama binary following the official Linux install steps, and the Python
# packages include `union`, `ollama`, `httpx`, and `loguru`. The `union`
# package provides the runtime environment for our tasks, while Ollama
# provides the tools to run the model.
image = ImageSpec(
    builder="union",
    name="ollama-serve",
    apt_packages=["curl", "systemctl"],
    commands=[  # from https://github.com/ollama/ollama/blob/main/docs/linux.md
        "curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz",
        "tar -C /usr -xzf ollama-linux-amd64.tgz",
        "rm -f ollama-linux-amd64.tgz",
        "useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama",
        "usermod -a -G ollama ollama",
        "mkdir -p /usr/share/ollama/.ollama/models",
    ],
    packages=["union", "ollama", "httpx", "loguru"],
)
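
# The install commands above mirror the manual Linux install steps from the
# Ollama docs linked in the comment. Baking the Ollama binary into the image
# at build time means the task container does not have to download Ollama on
# every run.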


def wait_for_ollama(timeout: int = 30, interval: int = 2) -> None:
    """Wait for the Ollama service to be ready.

    :param timeout: Maximum time to wait in seconds
    :param interval: Time between checks in seconds
    """
    import time

    import httpx
    from loguru import logger

    start_time = time.time()
    while True:
        try:
            response = httpx.get("http://localhost:11434/api/version")
            if response.status_code == 200:
                logger.info("Ollama service is ready")
                return
        except httpx.ConnectError:
            pass
        # Check the timeout and sleep on every iteration, whether the request
        # failed to connect or returned a non-200 status.
        if time.time() - start_time > timeout:
            raise TimeoutError("Ollama service failed to start")
        logger.info(
            f"Waiting for Ollama service... ({int(time.time() - start_time)}s)"
        )
        time.sleep(interval)


# Cache the task output (cache=True) so each model is downloaded only once
# per `cache_version`; bump the version to force a fresh download.
@task(container_image=image, cache=True, cache_version="1.0")
def download_model(
    model: str,
) -> Annotated[FlyteDirectory, MyOllamaModel(model=Inputs.model)]:
    """
    Downloads a model using Ollama and publishes it as an artifact.

    :param model: Model to download
    :return: Directory containing the model
    """
    subprocess.Popen(["ollama", "serve"])
    flytekit.current_context().logging.info("Waiting for Ollama service to be ready")
    wait_for_ollama()

    flytekit.current_context().logging.info("Pulling model")
    subprocess.run(["ollama", "pull", model], stdout=subprocess.PIPE)

    flytekit.current_context().logging.info("Stopping Ollama service")
    subprocess.run(["ollama", "stop", model])

    flytekit.current_context().logging.info("Exporting model")
    # Log the downloaded files for debugging, then publish the models directory.
    model_dir = Path("/home/flytekit/.ollama/models")
    for path in model_dir.glob("*/**"):
        print(path)
    return model_dir


@workflow
def download_multiple_models(
    models: typing.List[str] = ["llama3.1"],
) -> typing.List[FlyteDirectory]:
    """
    Downloads multiple models using Ollama, publishing each as a separate artifact.
    """
    return flytekit.map_task(download_model)(model=models)