
Commit b2c9288

Merge branch 'main' of https://github.com/ollama/ollama into testing
2 parents 14a6857 + aed1419 commit b2c9288

25 files changed: +517 -83 lines changed

.github/workflows/test.yaml

Lines changed: 3 additions & 4 deletions

@@ -243,7 +243,7 @@ jobs:
 $env:PATH="$gopath;$gccpath;$env:PATH"
 echo $env:PATH
 if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-make -j 4
+make -j 4
 - name: 'Build Unix Go Runners'
 if: ${{ ! startsWith(matrix.os, 'windows-') }}
 run: make -j 4
@@ -310,8 +310,7 @@ jobs:
 arm64) echo ARCH=arm64 ;;
 esac >>$GITHUB_ENV
 shell: bash
-- run: go build
-- run: go test -v ./...
+- run: go test ./...

 patches:
 needs: [changes]
@@ -323,4 +322,4 @@ jobs:
 submodules: recursive
 - name: Verify patches carry all the changes
 run: |
-make apply-patches sync && git diff --compact-summary --exit-code llama
+make apply-patches sync && git diff --compact-summary --exit-code llama

README.md

Lines changed: 1 addition & 0 deletions

@@ -359,6 +359,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Nosia](https://github.com/nosia-ai/nosia) (Easy to install and use RAG platform based on Ollama)
 - [Witsy](https://github.com/nbonamy/witsy) (An AI Desktop application avaiable for Mac/Windows/Linux)
 - [Abbey](https://github.com/US-Artificial-Intelligence/abbey) (A configurable AI interface server with notebooks, document storage, and YouTube support)
+- [Minima](https://github.com/dmayboroda/minima) (RAG with on-premises or fully local workflow)

 ### Cloud

api/types.go

Lines changed: 3 additions & 2 deletions

@@ -67,7 +67,7 @@ type GenerateRequest struct {
 Raw bool `json:"raw,omitempty"`

 // Format specifies the format to return a response in.
-Format string `json:"format"`
+Format json.RawMessage `json:"format,omitempty"`

 // KeepAlive controls how long the model will stay loaded in memory following
 // this request.
@@ -94,7 +94,7 @@ type ChatRequest struct {
 Stream *bool `json:"stream,omitempty"`

 // Format is the format to return the response in (e.g. "json").
-Format string `json:"format"`
+Format json.RawMessage `json:"format,omitempty"`

 // KeepAlive controls how long the model will stay loaded into memory
 // following the request.
@@ -146,6 +146,7 @@ type ToolCall struct {
 }

 type ToolCallFunction struct {
+Index int `json:"index,omitempty"`
 Name string `json:"name"`
 Arguments ToolCallFunctionArguments `json:"arguments"`
 }
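Because `Format` is now a `json.RawMessage` rather than a `string`, the same field can carry either the legacy `"json"` value or a full JSON schema. A minimal sketch of how a client built against `github.com/ollama/ollama/api` might populate it; the model name and schema below are illustrative, not part of this commit:

```go
package main

import (
	"encoding/json"
	"fmt"

	"github.com/ollama/ollama/api"
)

func main() {
	// Either form is valid JSON for the RawMessage field:
	// json.RawMessage(`"json"`) for plain JSON mode, or a schema object.
	req := &api.ChatRequest{
		Model:    "llama3.2", // illustrative model name
		Messages: []api.Message{{Role: "user", Content: "List three colors."}},
		Format:   json.RawMessage(`{"type":"object","properties":{"colors":{"type":"array","items":{"type":"string"}}}}`),
	}

	// Marshal the request to show what goes over the wire.
	b, err := json.Marshal(req)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(b))
}
```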

cmd/cmd.go

Lines changed: 4 additions & 2 deletions

@@ -8,6 +8,7 @@ import (
 "crypto/ed25519"
 "crypto/rand"
 "crypto/sha256"
+"encoding/json"
 "encoding/pem"
 "errors"
 "fmt"
@@ -1038,7 +1039,7 @@ func chat(cmd *cobra.Command, opts runOptions) (*api.Message, error) {
 req := &api.ChatRequest{
 Model: opts.Model,
 Messages: opts.Messages,
-Format: opts.Format,
+Format: json.RawMessage(opts.Format),
 Options: opts.Options,
 }

@@ -1125,7 +1126,7 @@ func generate(cmd *cobra.Command, opts runOptions) error {
 Prompt: opts.Prompt,
 Context: generateContext,
 Images: opts.Images,
-Format: opts.Format,
+Format: json.RawMessage(opts.Format),
 System: opts.System,
 Options: opts.Options,
 KeepAlive: opts.KeepAlive,
@@ -1445,6 +1446,7 @@ func NewCLI() *cobra.Command {
 envVars["OLLAMA_SCHED_SPREAD"],
 envVars["OLLAMA_TMPDIR"],
 envVars["OLLAMA_FLASH_ATTENTION"],
+envVars["OLLAMA_KV_CACHE_TYPE"],
 envVars["OLLAMA_LLM_LIBRARY"],
 envVars["OLLAMA_GPU_OVERHEAD"],
 envVars["OLLAMA_LOAD_TIMEOUT"],

cmd/cmd_test.go

Lines changed: 2 additions & 7 deletions

@@ -8,7 +8,6 @@ import (
 "net/http"
 "net/http/httptest"
 "os"
-"path/filepath"
 "strings"
 "testing"

@@ -180,18 +179,14 @@ Weigh anchor!

 t.Run("license", func(t *testing.T) {
 var b bytes.Buffer
-license, err := os.ReadFile(filepath.Join("..", "LICENSE"))
-if err != nil {
-t.Fatal(err)
-}
-
+license := "MIT License\nCopyright (c) Ollama\n"
 if err := showInfo(&api.ShowResponse{
 Details: api.ModelDetails{
 Family: "test",
 ParameterSize: "7B",
 QuantizationLevel: "FP16",
 },
-License: string(license),
+License: license,
 }, &b); err != nil {
 t.Fatal(err)
 }

convert/tokenizer.go

Lines changed: 23 additions & 4 deletions

@@ -10,6 +10,7 @@ import (
 "log/slog"
 "os"
 "slices"
+"strings"

 "golang.org/x/exp/maps"
 )
@@ -60,7 +61,25 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 addedTokens[t.Content] = t
 }

-t.Merges = tt.Model.Merges
+if len(tt.Model.Merges) == 0 {
+// noop; merges is empty
+} else if err := json.Unmarshal(tt.Model.Merges, &t.Merges); err == nil {
+// noop; merges is []string
+} else if merges, err := func() ([][]string, error) {
+var merges [][]string
+if err := json.Unmarshal(tt.Model.Merges, &merges); err != nil {
+return nil, err
+}
+
+return merges, nil
+}(); err == nil {
+t.Merges = make([]string, len(merges))
+for i := range merges {
+t.Merges[i] = strings.Join(merges[i], " ")
+}
+} else {
+return nil, fmt.Errorf("could not parse tokenizer merges. expected []string or [][]string: %w", err)
+}

 sha256sum := sha256.New()
 for _, pt := range tt.PreTokenizer.PreTokenizers {
@@ -156,9 +175,9 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 type tokenizer struct {
 AddedTokens []token `json:"added_tokens"`
 Model struct {
-Type string `json:"type"`
-Vocab map[string]int `json:"vocab"`
-Merges []string `json:"merges"`
+Type string `json:"type"`
+Vocab map[string]int `json:"vocab"`
+Merges json.RawMessage `json:"merges"`
 } `json:"model"`

 PreTokenizer struct {
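The same fallback idea — try `[]string` first, then accept `[][]string` and join each pair with a space — can be seen in isolation in the sketch below; the helper name and sample input are illustrative and not part of the converter:

```go
package main

import (
	"encoding/json"
	"fmt"
	"strings"
)

// normalizeMerges accepts either a flat []string or a nested [][]string
// encoding of BPE merges and always returns the flat form.
// (Illustrative helper; not part of the ollama codebase.)
func normalizeMerges(raw json.RawMessage) ([]string, error) {
	var flat []string
	if err := json.Unmarshal(raw, &flat); err == nil {
		return flat, nil
	}

	var nested [][]string
	if err := json.Unmarshal(raw, &nested); err != nil {
		return nil, fmt.Errorf("expected []string or [][]string: %w", err)
	}

	flat = make([]string, len(nested))
	for i := range nested {
		flat[i] = strings.Join(nested[i], " ")
	}
	return flat, nil
}

func main() {
	// Sample nested input, as some tokenizer.json files encode merges.
	merges, _ := normalizeMerges(json.RawMessage(`[["a","b"],["c","d"]]`))
	fmt.Printf("%q\n", merges) // ["a b" "c d"]
}
```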

convert/tokenizer_test.go

Lines changed: 56 additions & 0 deletions

@@ -191,6 +191,62 @@ func TestParseTokenizer(t *testing.T) {
 Pre: "default",
 },
 },
+{
+name: "list string merges",
+fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+"tokenizer.json": strings.NewReader(`{
+"model": {
+"merges": [
+"a b",
+"c d",
+"e f"
+]
+}
+}`),
+}),
+want: &Tokenizer{
+Vocabulary: &Vocabulary{
+Model: "gpt2",
+},
+Merges: []string{
+"a b",
+"c d",
+"e f",
+},
+Pre: "default",
+},
+},
+{
+name: "list list string merges",
+fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+"tokenizer.json": strings.NewReader(`{
+"model": {
+"merges": [
+[
+"a", "b"
+],
+[
+"c", "d"
+],
+[
+"e", "f"
+]
+]
+}
+}`),
+}),
+want: &Tokenizer{
+Vocabulary: &Vocabulary{
+Model: "gpt2",
+},
+Merges: []string{
+"a b",
+"c d",
+"e f",
+},
+Pre: "default",
+},
+},
 }

 for _, tt := range cases {

discover/types.go

Lines changed: 14 additions & 0 deletions

@@ -183,3 +183,17 @@ func (si SystemInfo) GetOptimalThreadCount() int {

 return coreCount
 }
+
+// For each GPU, check if it does NOT support flash attention
+func (l GpuInfoList) FlashAttentionSupported() bool {
+for _, gpu := range l {
+supportsFA := gpu.Library == "metal" ||
+(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
+gpu.Library == "rocm"
+
+if !supportsFA {
+return false
+}
+}
+return true
+}
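A brief sketch of how a caller might consult the new helper before enabling flash attention; the GPU values are illustrative, and only the fields the check reads (`Library`, `DriverMajor`) are set:

```go
package main

import (
	"fmt"

	"github.com/ollama/ollama/discover"
)

func main() {
	// Illustrative GPUs: one CUDA device with a new enough driver, one ROCm device.
	var cuda, rocm discover.GpuInfo
	cuda.Library, cuda.DriverMajor = "cuda", 8
	rocm.Library = "rocm"

	gpus := discover.GpuInfoList{cuda, rocm}

	// The helper returns true only if every GPU in the list supports flash attention.
	if gpus.FlashAttentionSupported() {
		fmt.Println("all GPUs support flash attention")
	}
}
```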

docs/api.md

Lines changed: 1 addition & 1 deletion

@@ -49,10 +49,10 @@ Advanced parameters (optional):
 - `options`: additional model parameters listed in the documentation for the [Modelfile](./modelfile.md#valid-parameters-and-values) such as `temperature`
 - `system`: system message to (overrides what is defined in the `Modelfile`)
 - `template`: the prompt template to use (overrides what is defined in the `Modelfile`)
-- `context`: the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory
 - `stream`: if `false` the response will be returned as a single response object, rather than a stream of objects
 - `raw`: if `true` no formatting will be applied to the prompt. You may choose to use the `raw` parameter if you are specifying a full templated prompt in your request to the API
 - `keep_alive`: controls how long the model will stay loaded into memory following the request (default: `5m`)
+- `context` (deprecated): the context parameter returned from a previous request to `/generate`, this can be used to keep a short conversational memory

 #### JSON mode

docs/faq.md

Lines changed: 26 additions & 2 deletions

@@ -151,7 +151,7 @@ Refer to the section [above](#how-do-i-configure-ollama-server) for how to set e
 
 Ollama runs an HTTP server and can be exposed using a proxy server such as Nginx. To do so, configure the proxy to forward requests and optionally set required headers (if not exposing Ollama on the network). For example, with Nginx:
 
-```
+```nginx
 server {
 listen 80;
 server_name example.com; # Replace with your domain or IP
@@ -285,4 +285,28 @@ Note: Windows with Radeon GPUs currently default to 1 model maximum due to limit
 
 ## How does Ollama load models on multiple GPUs?
 
-Installing multiple GPUs of the same brand can be a great way to increase your available VRAM to load larger models. When you load a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transfering across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
+When loading a new model, Ollama evaluates the required VRAM for the model against what is currently available. If the model will entirely fit on any single GPU, Ollama will load the model on that GPU. This typically provides the best performance as it reduces the amount of data transferring across the PCI bus during inference. If the model does not fit entirely on one GPU, then it will be spread across all the available GPUs.
+
+## How can I enable Flash Attention?
+
+Flash Attention is a feature of most modern models that can significantly reduce memory usage as the context size grows. To enable Flash Attention, set the `OLLAMA_FLASH_ATTENTION` environment variable to `1` when starting the Ollama server.
+
+## How can I set the quantization type for the K/V cache?
+
+The K/V context cache can be quantized to significantly reduce memory usage when Flash Attention is enabled.
+
+To use quantized K/V cache with Ollama you can set the following environment variable:
+
+- `OLLAMA_KV_CACHE_TYPE` - The quantization type for the K/V cache. Default is `f16`.
+
+> Note: Currently this is a global option - meaning all models will run with the specified quantization type.
+
+The currently available K/V cache quantization types are:
+
+- `f16` - high precision and memory usage (default).
+- `q8_0` - 8-bit quantization, uses approximately 1/2 the memory of `f16` with a very small loss in precision, this usually has no noticeable impact on the model's quality (recommended if not using f16).
+- `q4_0` - 4-bit quantization, uses approximately 1/4 the memory of `f16` with a small-medium loss in precision that may be more noticeable at higher context sizes.
+
+How much the cache quantization impacts the model's response quality will depend on the model and the task. Models that have a high GQA count (e.g. Qwen2) may see a larger impact on precision from quantization than models with a low GQA count.
+
+You may need to experiment with different quantization types to find the best balance between memory usage and quality.
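Taken together, both options can be enabled for a single server run with, for example, `OLLAMA_FLASH_ATTENTION=1 OLLAMA_KV_CACHE_TYPE=q8_0 ollama serve` (illustrative shell form; set the variables however your service manager expects).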
