feature: add support for using yzma package to call llama.cpp

deadprogram · deadprogram · commit 28ef5f4d2221 · 2025-10-31T17:10:37.000+01:00
This adds a new llm subpackage to use the yzma package to call llama.cpp libraries
directly using the FFI interface.

Signed-off-by: deadprogram <ron@hybridgroup.com>
diff --git a/examples/yzma-llm-example/README.md b/examples/yzma-llm-example/README.md
@@ -0,0 +1,35 @@
+# yzma LLM Example 🚀
+
+## What Does This Example Do? 🤔
+
+This example shows you how to:
+
+1. Set up a local LLM client
+2. Generate text using a simple prompt
+3. Customize the LLM configuration (with some cool commented-out options)
+
+## The Magic Explained ✨
+
+Here's what's happening in our main function:
+
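+1. We create a new yzma LLM client using `yzma.New(yzma.WithModel(modelPath))`. The client requires the `YZMA_LIB` environment variable to be set to the location of the llama.cpp libraries, and `modelPath` must point to a local GGUF model file.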
+
+2. We set up a context for our LLM operations.
+
+3. We generate text by asking the LLM a simple question: "How many sides does a square have?"
+
+4. Finally, we print the LLM's response!
+
+## Cool Features to Explore 🕵️‍♀️
+
+While the example uses default settings, it also shows you how to customize your LLM:
+
+- There are options to set top-k, top-p, and temperature values for text generation.
+
+## Running the Example 🏃‍♂️
+
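+Set the `YZMA_LIB` environment variable, edit `modelPath` in the Go file so it points to a model on your machine, then compile and run the Go file, and you'll see the LLM's response to the square question. It's that simple!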
+
+## Have Fun! 🎉
+
+This example is a great starting point for experimenting with local LLMs. Feel free to uncomment the additional options and play around with different configurations. Happy coding!
diff --git a/examples/yzma-llm-example/go.mod b/examples/yzma-llm-example/go.mod
@@ -0,0 +1,17 @@
+module github.com/tmc/langchaingo/examples/yzma-llm-example
+
+go 1.24.4
+
+replace github.com/tmc/langchaingo => ../..
+
+require github.com/tmc/langchaingo v0.0.0-00010101000000-000000000000
+
+require (
+	github.com/dlclark/regexp2 v1.10.0 // indirect
+	github.com/ebitengine/purego v0.8.4 // indirect
+	github.com/google/uuid v1.6.0 // indirect
+	github.com/hybridgroup/yzma v0.7.0 // indirect
+	github.com/jupiterrider/ffi v0.5.1 // indirect
+	github.com/pkoukk/tiktoken-go v0.1.6 // indirect
+	golang.org/x/sys v0.36.0 // indirect
+)
diff --git a/examples/yzma-llm-example/go.sum b/examples/yzma-llm-example/go.sum
@@ -0,0 +1,28 @@
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
+github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dlclark/regexp2 v1.10.0 h1:+/GIL799phkJqYW+3YbOd8LCcbHzT0Pbo8zl70MHsq0=
+github.com/dlclark/regexp2 v1.10.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8=
+github.com/ebitengine/purego v0.8.4 h1:CF7LEKg5FFOsASUj0+QwaXf8Ht6TlFxg09+S9wz0omw=
+github.com/ebitengine/purego v0.8.4/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/hybridgroup/yzma v0.7.0 h1:VKuIzQSeqZgK4162cCTP2HaJvYlpRJJROWoZgzW4uAU=
+github.com/hybridgroup/yzma v0.7.0/go.mod h1:hqcOnvdEmI0ci1UHo9AStKmTgqWIXTyEiU7ZnQz0HCU=
+github.com/jupiterrider/ffi v0.5.1 h1:l7ANXU+Ex33LilVa283HNaf/sTzCrrht7D05k6T6nlc=
+github.com/jupiterrider/ffi v0.5.1/go.mod h1:x7xdNKo8h0AmLuXfswDUBxUsd2OqUP4ekC8sCnsmbvo=
+github.com/pkoukk/tiktoken-go v0.1.6 h1:JF0TlJzhTbrI30wCvFuiw6FzP2+/bR+FIxUdgEAcUsw=
+github.com/pkoukk/tiktoken-go v0.1.6/go.mod h1:9NiV+i9mJKGj1rYOT+njbv+ZwA/zJxYdewGl6qVatpg=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
+github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
+github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+golang.org/x/sys v0.36.0 h1:KVRy2GtZBrk1cBYA7MKu5bEZFxQk4NIDV6RLVcC8o0k=
+golang.org/x/sys v0.36.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
+gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+sigs.k8s.io/yaml v1.3.0 h1:a2VclLzOGrwOHDiV8EfBGhvjHvP46CtW5j6POvhYGGo=
+sigs.k8s.io/yaml v1.3.0/go.mod h1:GeOyir5tyXNByN85N/dRIT9es5UQNerPYEKK56eTBm8=
diff --git a/examples/yzma-llm-example/yzma_llm_example.go b/examples/yzma-llm-example/yzma_llm_example.go
@@ -0,0 +1,34 @@
+package main
+
+import (
+	"context"
+	"fmt"
+	"log"
+
+	"github.com/tmc/langchaingo/llms"
+	"github.com/tmc/langchaingo/llms/yzma"
+)
+
+// modelPath is the GGUF model file handed to yzma.WithModel in main.
+// Change it to the path of a model available on your machine.
+const modelPath = "/home/ron/models/SmolLM2-135M-Instruct.Q2_K.gguf"
+
+// main creates a yzma-backed LLM, asks it a single question and prints the
+// answer. Any failure is reported through log.Fatal.
+func main() {
+	llm, err := yzma.New(yzma.WithModel(modelPath))
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	// Init context
+	ctx := context.Background()
+
+	// To tune sampling, build call options and pass generateOptions... as the
+	// trailing arguments of GenerateFromSinglePrompt:
+	//generateOptions := []llms.CallOption{
+	//	llms.WithTopK(10),
+	//	llms.WithTopP(0.95),
+	//	llms.WithTemperature(0.25),
+	//}
+	completion, err := llms.GenerateFromSinglePrompt(ctx, llm, "How many sides does a square have?")
+
+	// Release the llama.cpp backend before the error check: log.Fatal exits
+	// the process without running deferred calls.
+	llm.Close()
+	if err != nil {
+		log.Fatal(err)
+	}
+	fmt.Println(completion)
+}
diff --git a/llms/yzma/llm_test.go b/llms/yzma/llm_test.go
@@ -0,0 +1,23 @@
+package yzma
+
+import (
+	"os"
+	"testing"
+
+	"github.com/tmc/langchaingo/testing/llmtest"
+)
+
+// TestLLM runs the shared llmtest suite against a yzma-backed model.
+// It is skipped unless YZMA_TEST_MODEL names a model file.
+func TestLLM(t *testing.T) {
+	modelPath := os.Getenv("YZMA_TEST_MODEL")
+	if len(modelPath) == 0 {
+		t.Skip("YZMA_TEST_MODEL not set to point to test model")
+	}
+
+	model, err := New(WithModel(modelPath))
+	if err != nil {
+		t.Fatalf("Failed to create yzma LLM: %v", err)
+	}
+	// Free the backend once the suite has finished.
+	defer model.Close()
+
+	llmtest.TestLLM(t, model)
+}
diff --git a/llms/yzma/options.go b/llms/yzma/options.go
@@ -0,0 +1,22 @@
+package yzma
+
+// options collects the settings applied by the Option functions passed to New.
+type options struct {
+	// model is the value supplied through WithModel.
+	model  string
+	// system is the system prompt supplied through WithSystemPrompt.
+	system string
+}
+
+// Option is a functional option that mutates the client options used by New.
+type Option func(*options)
+
+// WithModel returns an Option that records model as the model to use.
+func WithModel(model string) Option {
+	apply := func(o *options) {
+		o.model = model
+	}
+	return apply
+}
+
+// WithSystemPrompt returns an Option that records p as the system prompt.
+func WithSystemPrompt(p string) Option {
+	apply := func(o *options) {
+		o.system = p
+	}
+	return apply
+}
diff --git a/llms/yzma/yzma.go b/llms/yzma/yzma.go
@@ -0,0 +1,201 @@
+package yzma
+
+import (
+	"context"
+	"errors"
+	"os"
+
+	"github.com/hybridgroup/yzma/pkg/llama"
+	"github.com/tmc/langchaingo/llms"
+)
+
+// Sampling defaults used by initSampler when the corresponding
+// llms.CallOptions field is not greater than zero.
+const (
+	defaultTemperature = 0.8
+	defaultTopK        = 40
+	defaultTopP        = 0.9
+)
+
+// LLM is a yzma local implementation wrapper to call directly to llama.cpp libs using the FFI interface.
+type LLM struct {
+	// model is the default model passed to llama.ModelLoadFromFile when a
+	// call does not override it through llms.CallOptions.Model.
+	model   string
+	// options keeps the full set of options the client was created with.
+	options options
+}
+
+// New creates a new yzma LLM implementation.
+//
+// The location of the llama.cpp libraries is taken from the YZMA_LIB
+// environment variable; New returns an error when it is unset or when the
+// libraries cannot be loaded. On success llama.cpp logging is silenced and
+// the backend is initialised.
+func New(opts ...Option) (*LLM, error) {
+	o := options{}
+	for _, opt := range opts {
+		opt(&o)
+	}
+
+	libPath := os.Getenv("YZMA_LIB")
+	if libPath == "" {
+		return nil, errors.New("no path to yzma libs")
+	}
+
+	// Load from the validated path rather than an empty string, so the
+	// YZMA_LIB value checked above is the one actually used.
+	if err := llama.Load(libPath); err != nil {
+		return nil, err
+	}
+
+	llama.LogSet(llama.LogSilent())
+	llama.Init()
+
+	llm := LLM{
+		model:   o.model,
+		options: o,
+	}
+
+	return &llm, nil
+}
+
+// Close frees all resources by releasing the llama.cpp backend through
+// llama.BackendFree.
+func (o *LLM) Close() {
+	llama.BackendFree()
+}
+
+// Call calls yzma with the given prompt.
+// It delegates to llms.GenerateFromSinglePrompt, forwarding ctx, the prompt
+// and any call options, and returns that function's result unchanged.
+func (o *LLM) Call(ctx context.Context, prompt string, options ...llms.CallOption) (string, error) {
+	return llms.GenerateFromSinglePrompt(ctx, o, prompt, options...)
+}
+
+// GenerateContent implements the Model interface.
+func (o *LLM) GenerateContent(ctx context.Context, messages []llms.MessageContent, options ...llms.CallOption) (*llms.ContentResponse, error) {
+	opts := llms.CallOptions{}
+	for _, opt := range options {
+		opt(&opts)
+	}
+
+	modelName := o.model
+	if opts.Model != "" {
+		modelName = opts.Model
+	}
+
+	maxTokens := int32(1024)
+	if opts.MaxTokens > 0 {
+		maxTokens = int32(opts.MaxTokens)
+	}
+
+	// TODO: allow for setting any passed model params
+	model := llama.ModelLoadFromFile(modelName, llama.ModelDefaultParams())
+	if model == llama.Model(0) {
+		return nil, errors.New("unable to load model")
+	}
+	defer llama.ModelFree(model)
+
+	// TODO: allow for setting any passed context options
+	ctxParams := llama.ContextDefaultParams()
+	ctxParams.NCtx = uint32(4096)
+	ctxParams.NBatch = uint32(2048)
+
+	lctx := llama.InitFromModel(model, ctxParams)
+	if lctx == llama.Context(0) {
+		return nil, errors.New("unable to init model")
+	}
+
+	defer llama.Free(lctx)
+
+	vocab := llama.ModelGetVocab(model)
+	sampler := initSampler(opts)
+
+	msg := chatTemplate(templateForModel(model), convertMessageContent(messages), true)
+
+	// call once to get the size of the tokens from the prompt
+	count := llama.Tokenize(vocab, msg, nil, true, true)
+
+	// now get the actual tokens
+	tokens := make([]llama.Token, count)
+	llama.Tokenize(vocab, msg, tokens, true, true)
+
+	batch := llama.BatchGetOne(tokens)
+
+	if llama.ModelHasEncoder(model) {
+		llama.Encode(lctx, batch)
+
+		start := llama.ModelDecoderStartToken(model)
+		if start == llama.TokenNull {
+			start = llama.VocabBOS(vocab)
+		}
+
+		batch = llama.BatchGetOne([]llama.Token{start})
+	}
+
+	result := ""
+
+	for pos := int32(0); pos < maxTokens; pos += batch.NTokens {
+		llama.Decode(lctx, batch)
+		token := llama.SamplerSample(sampler, lctx, -1)
+
+		if llama.VocabIsEOG(vocab, token) {
+			break
+		}
+
+		buf := make([]byte, 64)
+		len := llama.TokenToPiece(vocab, token, buf, 0, true)
+
+		result = result + string(buf[:len])
+		batch = llama.BatchGetOne([]llama.Token{token})
+	}
+
+	choices := []*llms.ContentChoice{
+		{
+			Content: result,
+		},
+	}
+
+	response := &llms.ContentResponse{Choices: choices}
+	return response, nil
+}
+
+// initSampler builds the sampler chain used for generation: top-k, top-p,
+// min-p (fixed at 0.1), temperature, and finally a distribution sampler
+// seeded with llama.DefaultSeed. Temperature, TopK and TopP come from opts
+// when greater than zero, otherwise from the package defaults.
+// NOTE(review): opts.Seed is not consulted here.
+func initSampler(opts llms.CallOptions) llama.Sampler {
+	temp := float64(defaultTemperature)
+	k := int(defaultTopK)
+	p := float64(defaultTopP)
+	minP := 0.1
+
+	if opts.Temperature > 0 {
+		temp = opts.Temperature
+	}
+	if opts.TopK > 0 {
+		k = opts.TopK
+	}
+	if opts.TopP > 0 {
+		p = opts.TopP
+	}
+
+	chain := llama.SamplerChainInit(llama.SamplerChainDefaultParams())
+
+	topKSampler := llama.SamplerInitTopK(int32(k))
+	llama.SamplerChainAdd(chain, topKSampler)
+
+	topPSampler := llama.SamplerInitTopP(float32(p), 1)
+	llama.SamplerChainAdd(chain, topPSampler)
+
+	minPSampler := llama.SamplerInitMinP(float32(minP), 1)
+	llama.SamplerChainAdd(chain, minPSampler)
+
+	tempSampler := llama.SamplerInitTempExt(float32(temp), 0, 1.0)
+	llama.SamplerChainAdd(chain, tempSampler)
+
+	distSampler := llama.SamplerInitDist(llama.DefaultSeed)
+	llama.SamplerChainAdd(chain, distSampler)
+
+	return chain
+}
+
+// templateForModel returns the chat template reported by the model, or
+// "chatml" when the model reports an empty one.
+func templateForModel(model llama.Model) string {
+	if tmpl := llama.ModelChatTemplate(model, ""); tmpl != "" {
+		return tmpl
+	}
+	return "chatml"
+}
+
+func convertMessageContent(msgs []llms.MessageContent) []llama.ChatMessage {
+	chatMsgs := []llama.ChatMessage{}
+	for _, m := range msgs {
+		p := m.Parts[0]
+		switch pt := p.(type) {
+		case llms.TextContent:
+			chatMsgs = append(chatMsgs, llama.NewChatMessage(string(m.Role), pt.Text))
+		}
+	}
+	return chatMsgs
+}
+
+// chatTemplate renders msgs with the given chat template and returns the
+// resulting prompt text. add is forwarded to llama.ChatApplyTemplate.
+//
+// Rendering starts with a 2048-byte buffer. If the reported length exceeds
+// the buffer, the buffer is grown to that length and the template is applied
+// again. An empty string is returned when the reported length is negative or
+// still does not fit.
+func chatTemplate(template string, msgs []llama.ChatMessage, add bool) string {
+	buf := make([]byte, 2048)
+	n := llama.ChatApplyTemplate(template, msgs, add, buf)
+	if int(n) > len(buf) {
+		// Assumes the return value is the full rendered size, as in llama.cpp.
+		buf = make([]byte, n)
+		n = llama.ChatApplyTemplate(template, msgs, add, buf)
+	}
+	if n < 0 || int(n) > len(buf) {
+		return ""
+	}
+	return string(buf[:n])
+}
diff --git a/llms/yzma/yzma_test.go b/llms/yzma/yzma_test.go