README.md (1 addition, 0 deletions)
@@ -359,6 +359,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama)
 - [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application available for Mac/Windows/Linux)
 - [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support)
+- [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)
docs/api.md

 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system message (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
-- `context`: the context parameter returned from a previous request to `/generate`; this can be used to keep a short conversational memory
 - `stream`: if `false`, the response will be returned as a single response object rather than a stream of objects
 - `raw`: if `true`, no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+- `context` (deprecated): the context parameter returned from a previous request to `/generate`; this can be used to keep a short conversational memory
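For orientation, here is a minimal sketch of a request to the `/api/generate` endpoint that exercises a few of these parameters. It assumes a local server on Ollama's default address (`http://localhost:11434`) and uses a hypothetical model name; substitute any model you have pulled locally.

```python
import requests

# Minimal sketch of a /api/generate call using a few of the parameters above.
# The model name is a placeholder; use any model pulled locally.
response = requests.post(
    "http://localhost:11434/api/generate",
    json={
        "model": "llama3.2",              # hypothetical model name
        "prompt": "Why is the sky blue?",
        "stream": False,                  # return one response object instead of a stream
        "keep_alive": "10m",              # keep the model loaded for 10 minutes
        "options": {"temperature": 0.7},  # Modelfile-style parameter override
    },
    timeout=120,
)
print(response.json()["response"])
```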
docs/faq.md (26 additions, 2 deletions)
@@ -151,7 +151,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set environment variables on your platform.
 Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:

-```
+```nginx
 server {
     listen 80;
     server_name example.com;  # Replace with your domain or IP
@@ -285,4 +285,28 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
 ## How does Ollama load models on multiple GPUs?

-Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
+When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
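As an illustration of the placement rule described in that paragraph (a sketch of the documented behaviour, not Ollama's actual scheduler code):

```python
def place_model(required_vram: int, gpu_free_vram: list[int]) -> list[int]:
    """Sketch of the documented placement behaviour, not Ollama's real scheduler."""
    # Prefer any single GPU that can hold the whole model, which avoids
    # shuttling tensor data across the PCI bus during inference.
    for i, free in enumerate(gpu_free_vram):
        if free >= required_vram:
            return [i]                       # load entirely on this GPU
    # Otherwise the model is spread across all available GPUs.
    return list(range(len(gpu_free_vram)))

# Example: a 20 GiB model with GPUs holding 24 GiB and 16 GiB free -> [0]
print(place_model(20, [24, 16]))
```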
+
+## How can I enable Flash Attention?
+
+Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server.
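One minimal way to pass this setting when launching the server yourself, shown here as a Python sketch (it assumes the `ollama` binary is installed and on your PATH):

```python
import os
import subprocess

# Start the Ollama server with Flash Attention enabled.
# Assumes the `ollama` binary is installed and on PATH.
env = dict(os.environ, OLLAMA_FLASH_ATTENTION="1")
subprocess.run(["ollama", "serve"], env=env, check=True)
```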
+
+## How can I set the quantization type for the K/V cache?
+
+The K/V context cache can be quantized to significantly reduce memory usage when Flash Attention is enabled.
+
+To use quantized K/V cache with Ollama you can set the following environment variable:
+
+- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`.
+
+> Note: Currently this is a global option - meaning all models will run with the specified quantization type.
+
+The currently available K/V cache quantization types are:
+
+- `f16` - high precision and memory usage (default).
+- `q8_0` - 8-bit quantization, uses approximately 1/2 the memory of `f16` with a very small loss in precision; this usually has no noticeable impact on the model's quality (recommended if not using `f16`).
+- `q4_0` - 4-bit quantization, uses approximately 1/4 the memory of `f16` with a small to medium loss in precision that may be more noticeable at higher context sizes.
+
+How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
+
+You may need to experiment with different quantization types to find the best balance between memory usage and quality.
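To make the memory fractions above concrete, here is a rough, illustrative K/V cache size calculation. The model dimensions are assumptions chosen for the example (they do not describe any particular model), and the per-element byte counts for `q8_0` and `q4_0` are approximate:

```python
# Rough, illustrative K/V cache sizing; all model dimensions are assumed.
n_layers = 32        # transformer layers
n_kv_heads = 8       # K/V heads (GQA models use fewer K/V heads than attention heads)
head_dim = 128       # dimension per head
context = 32_768     # context length in tokens

# Approximate bytes per cached element, including quantization block scales.
bytes_per_element = {"f16": 2.0, "q8_0": 1.0625, "q4_0": 0.5625}

for cache_type, nbytes in bytes_per_element.items():
    # 2x for the K and V tensors; one entry per layer, K/V head, head dimension, and token.
    size_gib = 2 * n_layers * n_kv_heads * head_dim * context * nbytes / 2**30
    print(f"{cache_type}: ~{size_gib:.1f} GiB")
```

With these assumed dimensions the output is roughly 4.0 GiB for `f16`, 2.1 GiB for `q8_0`, and 1.1 GiB for `q4_0`, matching the approximate 1/2 and 1/4 ratios above.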