107 changes: 107 additions & 0 deletions tutorials/serving_ollama/app.py
# # Run Ollama on the Union Serving platform
#
# In this example, we use Union Serving to run Ollama and serve the
# DeepSeek-R1-Distill-Qwen-1.5B model.

# ## Caching the model
# To speed up Ollama startup, we cache the model ahead of time. This lets us
# avoid downloading the model every time the Ollama service starts, which
# significantly reduces startup time. That matters especially when deploying
# to production, where the service needs to start up quickly and reliably.

# To cache the model, we use the `download_model` task defined in
# `model_wf.py`. The task, decorated with `@task`, pulls the specified model
# with Ollama and publishes the resulting directory as an artifact that
# subsequent tasks, workflows, and apps can reference. The `container_image`
# parameter specifies the container image to run the task in. `cache=True`
# caches the task's results, and `cache_version` (here "1.0") lets us
# invalidate the cache when we change the task or want to download a different
# version of the model.
#
# The artifact is declared as `MyOllamaModel`, an `Artifact` whose
# `partition_keys` parameter partitions it on the `model` key, so each model
# name gets its own partition and different models can be cached side by side.
#
# Once the model is cached, `MyOllamaModel.query(model=...)` returns a
# reference to the cached model directory, which our application downloads and
# mounts at startup instead of pulling the model again.
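#
# For example, assuming the `union` CLI is configured, the caching workflow
# can be run remotely before deploying the app (the exact command shape is an
# assumption; adjust to your setup):
#
# ```bash
# union run --remote model_wf.py download_multiple_models --models '["llama3.1"]'
# ```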
#
# ## Defining the Application
#
# Now that the model is cached, we can define our application. We use the
# `Artifact` query to reference the cached model and define the application
# configuration: the container image, the model input, the resources the
# application requires, and the arguments used to start it.


# Import the necessary classes from the union package
import os

from union import Resources
from union.app import App, Input

import model_wf


# We now define the application configuration: the name of the application,
# the inputs it requires, the arguments used to start it, the container image,
# the port to expose, the resource limits, and the environment variables it
# needs to run. The cached model artifact is mounted at
# `/home/union/.ollama/models`, Ollama's default models directory for the
# container user, so the model is already in place when the service starts.
# Setting `OLLAMA_HOST` to `0.0.0.0` makes the server listen on all interfaces
# so the platform can route traffic to it.

MODEL = os.environ.get("MODEL", "llama3.1")

ollama_app = App(
    name="ollama-serve",
    inputs=[
        Input(
            name="my_model",
            value=model_wf.MyOllamaModel.query(model=MODEL),
            download=True,
            mount="/home/union/.ollama/models",
        )
    ],
    container_image=model_wf.image.with_packages(["union-runtime"]),
    args=[
        "ollama",
        "serve",
    ],
    limits=Resources(cpu="2", mem="8Gi", ephemeral_storage="40Gi"),
    requests=Resources(ephemeral_storage="40Gi"),
    port=11434,
    min_replicas=1,
    env={
        "UNION_SDK_LOGGING_LEVEL": "10",
        "OLLAMA_HOST": "0.0.0.0",
        "OLLAMA_ORIGINS": "*",
    },
)

# Finally, we can deploy the application. The deploy command builds the
# container image, packages the application, and deploys it to the Union
# Serving platform. Once the deployment is complete, a message indicates that
# the application has been successfully deployed.

# ```bash
# union deploy apps app.py ollama-serve
# ```
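
# Once the app is running, we can smoke-test it with Ollama's standard REST
# API. The URL below is a placeholder; substitute the endpoint reported by
# the deploy command.
#
# ```bash
# curl https://<your-app-endpoint>/api/generate \
#   -d '{"model": "llama3.1", "prompt": "Hello!", "stream": false}'
# ```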
97 changes: 97 additions & 0 deletions tutorials/serving_ollama/model_wf.py
import subprocess
import typing
from pathlib import Path
from typing import Annotated

import flytekit
from flytekit.core.artifact import Artifact, Inputs
from flytekit.types.directory import FlyteDirectory

from union import task, workflow, ImageSpec


MyOllamaModel = Artifact(name="ollama_model", partition_keys=["model"])
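# `MyOllamaModel` (declared above) is the artifact that will hold the
# downloaded model files. Partitioning on the `model` key publishes one
# artifact partition per model name, so a specific model can later be
# retrieved with `MyOllamaModel.query(model="llama3.1")`.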

# We use the `ImageSpec` class to define the container image for our
# application, built with the `union` builder. The build commands install the
# Ollama binary following the official Linux install steps, and the Python
# packages include `union`, `ollama`, `httpx`, and `loguru`. The `union`
# package provides the runtime environment for our tasks, while Ollama
# provides the tools to run the model.
image = ImageSpec(
    builder="union",
    name="ollama-serve",
    apt_packages=["curl", "systemctl"],
    commands=[  # from https://github.com/ollama/ollama/blob/main/docs/linux.md
        "curl -L https://ollama.com/download/ollama-linux-amd64.tgz -o ollama-linux-amd64.tgz",
        "tar -C /usr -xzf ollama-linux-amd64.tgz",
        "rm -f ollama-linux-amd64.tgz",
        "useradd -r -s /bin/false -U -m -d /usr/share/ollama ollama",
        "usermod -a -G ollama ollama",
        "mkdir -p /usr/share/ollama/.ollama/models",
    ],
    packages=["union", "ollama", "httpx", "loguru"],
)
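
# The install commands above mirror the manual Linux install steps from the
# Ollama docs linked in the comment. Baking the Ollama binary into the image
# at build time means the task container does not have to download Ollama on
# every run.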


def wait_for_ollama(timeout: int = 30, interval: int = 2) -> None:
    """Wait for the Ollama service to be ready.

    :param timeout: Maximum time to wait in seconds
    :param interval: Time between checks in seconds
    """
    import time

    import httpx
    from loguru import logger

    start_time = time.time()
    while True:
        try:
            response = httpx.get("http://localhost:11434/api/version")
            if response.status_code == 200:
                logger.info("Ollama service is ready")
                return
        except httpx.ConnectError:
            pass
        # Check the timeout and sleep on every iteration, whether the request
        # failed to connect or returned a non-200 status.
        if time.time() - start_time > timeout:
            raise TimeoutError("Ollama service failed to start")
        logger.info(
            f"Waiting for Ollama service... ({int(time.time() - start_time)}s)"
        )
        time.sleep(interval)


# Cache the task output (cache=True) so each model is downloaded only once
# per `cache_version`; bump the version to force a fresh download.
@task(container_image=image, cache=True, cache_version="1.0")
def download_model(
    model: str,
) -> Annotated[FlyteDirectory, MyOllamaModel(model=Inputs.model)]:
    """
    Downloads a model using Ollama and publishes it as an artifact.

    :param model: Model to download
    :return: Directory containing the model
    """
    subprocess.Popen(["ollama", "serve"])
    flytekit.current_context().logging.info("Waiting for Ollama service to be ready")
    wait_for_ollama()

    flytekit.current_context().logging.info("Pulling model")
    subprocess.run(["ollama", "pull", model], stdout=subprocess.PIPE)

    flytekit.current_context().logging.info("Stopping Ollama service")
    subprocess.run(["ollama", "stop", model])

    flytekit.current_context().logging.info("Exporting model")
    # Log the downloaded files for debugging, then publish the models directory.
    model_dir = Path("/home/flytekit/.ollama/models")
    for path in model_dir.glob("*/**"):
        print(path)
    return model_dir


@workflow
def download_multiple_models(
    models: typing.List[str] = ["llama3.1"],
) -> typing.List[FlyteDirectory]:
    """
    Downloads multiple models using Ollama, publishing each as a separate artifact.
    """
    return flytekit.map_task(download_model)(model=models)