From f85b5c31d202e4b2f395520c5402b1cda4762d0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Serta=C3=A7=20=C3=96zercan?= <852750+sozercan@users.noreply.github.com> Date: Wed, 24 Jan 2024 20:59:10 -0800 Subject: [PATCH] feat: add mamba backend (#109) Signed-off-by: Sertac Ozercan --- .github/workflows/test-docker-gpu.yaml | 28 ++++++++++-- .github/workflows/test-docker.yaml | 4 +- .github/workflows/test-kubernetes.yaml | 2 +- pkg/aikit2llb/convert.go | 61 +++++++++++++++++--------- pkg/aikit2llb/image.go | 9 +++- pkg/build/build.go | 14 +++--- pkg/utils/const.go | 1 + test/aikitfile-mamba.yaml | 33 ++++++++++++++ 8 files changed, 117 insertions(+), 35 deletions(-) create mode 100644 test/aikitfile-mamba.yaml diff --git a/.github/workflows/test-docker-gpu.yaml b/.github/workflows/test-docker-gpu.yaml index e0e60ed9..09fd8572 100644 --- a/.github/workflows/test-docker-gpu.yaml +++ b/.github/workflows/test-docker-gpu.yaml @@ -15,11 +15,15 @@ jobs: matrix: backend: - llama-cuda - # - exllama + # - exllama # https://github.com/sozercan/aikit/issues/94 - exllama2-gptq - exllama2-exl2 + - mamba steps: - - uses: AutoModality/action-clean@11d611e7824ef8f2fe7f05a117d1ffe4c1a090f0 # v1.1.1 + - name: cleanup workspace + run: | + rm -rf ./* || true + rm -rf ./.??* || true - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: build aikit @@ -40,8 +44,9 @@ jobs: run: docker run --name testmodel -d --rm -p 8080:8080 --gpus all testmodel:test - name: run test + if: matrix.backend != 'mamba' run: | - result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "llama-2-7b-chat", "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}] }') @@ -52,6 +57,20 @@ jobs: exit 1 fi + - name: run test + if: matrix.backend == 'mamba' + run: | + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "mamba-chat", + "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}] + }') + echo $result + + choices=$(echo "$result" | jq '.choices') + if [ -z "$choices" ]; then + exit 1 + fi + - name: save logs if: always() run: docker logs testmodel > /tmp/docker-${{ matrix.backend }}.log @@ -59,6 +78,9 @@ jobs: - run: docker stop testmodel if: always() + - run: docker system prune -a -f --volumes + if: always() + - name: publish test artifacts if: always() uses: actions/upload-artifact@694cdabd8bdb0f10b2cea11669e1bf5453eed0a6 # v4.2.0 diff --git a/.github/workflows/test-docker.yaml b/.github/workflows/test-docker.yaml index 4fd14a1a..12bb77fd 100644 --- a/.github/workflows/test-docker.yaml +++ b/.github/workflows/test-docker.yaml @@ -78,7 +78,7 @@ jobs: - name: run llama test if: matrix.backend == 'llama' run: | - result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "llama-2-7b-chat", "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}] }') @@ -92,7 +92,7 @@ jobs: - name: run stablediffusion test if: matrix.backend == 'stablediffusion' run: | - result=$(curl --fail --retry 5 --retry-all-errors 
http://127.0.0.1:8080/v1/images/generations -H "Content-Type: application/json" -d '{ + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/images/generations -H "Content-Type: application/json" -d '{ "prompt": "A cute baby llama", "size": "256x256" }') diff --git a/.github/workflows/test-kubernetes.yaml b/.github/workflows/test-kubernetes.yaml index 615cb8ca..961e7dd3 100644 --- a/.github/workflows/test-kubernetes.yaml +++ b/.github/workflows/test-kubernetes.yaml @@ -84,7 +84,7 @@ jobs: - name: run test run: | - result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "llama-2-7b-chat", "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}] }') diff --git a/pkg/aikit2llb/convert.go b/pkg/aikit2llb/convert.go index 91c22d1e..c000d5c6 100644 --- a/pkg/aikit2llb/convert.go +++ b/pkg/aikit2llb/convert.go @@ -32,7 +32,7 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) { // install cuda if runtime is nvidia if c.Runtime == utils.RuntimeNVIDIA { - merge = installCuda(c, state, merge) + state, merge = installCuda(c, state, merge) } // install opencv and friends if stable diffusion backend is being used @@ -43,6 +43,8 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) { merge = installExllama(c, state, merge) case utils.BackendStableDiffusion: merge = installOpenCV(state, merge) + case utils.BackendMamba: + merge = installMamba(state, merge) } } @@ -51,26 +53,14 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) { } func getBaseImage(c *config.Config) llb.State { - for b := range c.Backends { - switch c.Backends[b] { - case utils.BackendExllama: - case utils.BackendExllamaV2: - return llb.Image(debianSlim) - case utils.BackendStableDiffusion: - return llb.Image(debianSlim) - } + if len(c.Backends) > 0 { + return llb.Image(debianSlim) } return llb.Image(distrolessBase) } func copyModels(c *config.Config, base llb.State, s llb.State) (llb.State, llb.State) { savedState := s - - // create config file if defined - if c.Config != "" { - s = s.Run(shf("echo -n \"%s\" > /config.yaml", c.Config)).Root() - } - for _, model := range c.Models { var opts []llb.HTTPOption opts = append(opts, llb.Filename(fileNameFromURL(model.Source))) @@ -104,6 +94,12 @@ func copyModels(c *config.Config, base llb.State, s llb.State) (llb.State, llb.S } } } + + // create config file if defined + if c.Config != "" { + s = s.Run(shf("echo -n \"%s\" > /config.yaml", c.Config)).Root() + } + diff := llb.Diff(savedState, s) merge := llb.Merge([]llb.State{base, diff}) return s, merge @@ -117,7 +113,7 @@ func fileNameFromURL(urlString string) string { return path.Base(parsedURL.Path) } -func installCuda(c *config.Config, s llb.State, merge llb.State) llb.State { +func installCuda(c *config.Config, s llb.State, merge llb.State) (llb.State, llb.State) { cudaKeyringURL := "https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb" cudaKeyring := llb.HTTP(cudaKeyringURL) s = s.File( @@ -125,10 +121,11 @@ func installCuda(c *config.Config, s llb.State, merge llb.State) llb.State { llb.WithCustomName("Copying "+fileNameFromURL(cudaKeyringURL)), //nolint: goconst ) s = s.Run(sh("dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb")).Root() + + savedState := s // running apt-get update 
twice due to nvidia repo
 	s = s.Run(sh("apt-get update && apt-get install -y ca-certificates && apt-get update"), llb.IgnoreCache).Root()
-	savedState := s
 	// install cuda libraries
 	if len(c.Backends) == 0 {
 		s = s.Run(shf("apt-get install -y --no-install-recommends libcublas-%[1]s cuda-cudart-%[1]s && apt-get clean", cudaVersion)).Root()
@@ -149,20 +146,25 @@
 			s = s.Run(sh(exllamaDeps)).Root()
 		}
+
+		if c.Backends[b] == utils.BackendMamba {
+			mambaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-crt-%[1]s cuda-cudart-dev-%[1]s cuda-nvcc-%[1]s && apt-get clean", cudaVersion)
+			s = s.Run(sh(mambaDeps)).Root()
+		}
 	}
 
 	diff := llb.Diff(savedState, s)
-	return llb.Merge([]llb.State{merge, diff})
+	return s, llb.Merge([]llb.State{merge, diff})
 }
 
 func installExllama(c *config.Config, s llb.State, merge llb.State) llb.State {
-	backend := "exllama"
+	backend := utils.BackendExllama
 	exllamaRepo := "https://github.com/turboderp/exllama"
 	exllamaTag := "master"
 	for b := range c.Backends {
 		if c.Backends[b] == utils.BackendExllamaV2 {
 			exllamaRepo = "https://github.com/turboderp/exllamav2"
-			backend = "exllama2"
+			backend = utils.BackendExllamaV2
 			exllamaTag = "v0.0.11"
 		}
 	}
@@ -171,7 +173,7 @@
 	s = s.Run(sh("apt-get update && apt-get install --no-install-recommends -y git ca-certificates python3-pip python3-dev g++ && apt-get clean"), llb.IgnoreCache).Root()
 
 	// clone localai exllama backend only
-	s = s.Run(shf("git clone --filter=blob:none --no-checkout %[1]s /tmp/localai/ && cd /tmp/localai && git sparse-checkout init --cone && git sparse-checkout set backend/python/%[2]s && git checkout %[3]s && rm -rf .git", localAIRepo, backend, localAIVersion)).Root()
+	s = cloneLocalAI(s, backend)
 
 	// clone exllama to localai exllama backend path and install python dependencies
 	s = s.Run(shf("git clone --depth 1 %[1]s --branch %[2]s /tmp/%[3]s && mv /tmp/%[3]s/* /tmp/localai/backend/python/%[3]s && rm -rf /tmp/%[3]s && cd /tmp/localai/backend/python/%[3]s && rm -rf .git && pip3 install grpcio protobuf typing-extensions sympy mpmath setuptools numpy --break-system-packages && pip3 install -r /tmp/localai/backend/python/%[3]s/requirements.txt --break-system-packages", exllamaRepo, exllamaTag, backend)).Root()
@@ -180,6 +182,19 @@
 	return llb.Merge([]llb.State{merge, diff})
 }
 
+func installMamba(s llb.State, merge llb.State) llb.State {
+	savedState := s
+	// libexpat1 is a requirement but git is not; however, libexpat1 is a dependency of git
+	s = s.Run(sh("apt-get install --no-install-recommends -y git python3 python3-dev python3-pip libssl3 openssl && apt-get clean"), llb.IgnoreCache).Root()
+
+	s = cloneLocalAI(s, utils.BackendMamba)
+
+	s = s.Run(shf("pip3 install packaging numpy torch==2.1.0 grpcio protobuf --break-system-packages && pip3 install causal-conv1d==1.0.0 mamba-ssm==1.0.1 --break-system-packages")).Root()
+
+	diff := llb.Diff(savedState, s)
+	return llb.Merge([]llb.State{merge, diff})
+}
+
 func installOpenCV(s llb.State, merge llb.State) llb.State {
 	savedState := s
 	// adding debian 11 (bullseye) repo due to opencv 4.5 requirement
@@ -233,6 +248,10 @@ func addLocalAI(c *config.Config, s llb.State, merge llb.State) (llb.State, llb.
 	return s, llb.Merge([]llb.State{merge, diff})
 }
 
+func cloneLocalAI(s llb.State, backend string) llb.State {
+	return s.Run(shf("git clone --filter=blob:none --no-checkout %[1]s /tmp/localai/ && cd /tmp/localai && git sparse-checkout init --cone && git sparse-checkout set backend/python/%[2]s && git checkout %[3]s && rm -rf .git", localAIRepo, backend, localAIVersion)).Root()
+}
+
 func shf(cmd string, v ...interface{}) llb.RunOption {
 	return llb.Args([]string{"/bin/sh", "-c", fmt.Sprintf(cmd, v...)})
 }
diff --git a/pkg/aikit2llb/image.go b/pkg/aikit2llb/image.go
index 63212179..9ca5bd44 100644
--- a/pkg/aikit2llb/image.go
+++ b/pkg/aikit2llb/image.go
@@ -46,12 +46,19 @@ func emptyImage(c *config.Config) *specs.Image {
 	}
 
 	for b := range c.Backends {
-		if c.Backends[b] == utils.BackendExllama || c.Backends[b] == utils.BackendExllamaV2 {
+		switch c.Backends[b] {
+		case utils.BackendExllama, utils.BackendExllamaV2:
 			exllamaEnv := []string{
 				"EXTERNAL_GRPC_BACKENDS=exllama:/tmp/localai/backend/python/exllama/exllama.py,exllama2:/tmp/localai/backend/python/exllama2/exllama2_backend.py",
 				"CUDA_HOME=/usr/local/cuda",
 			}
 			img.Config.Env = append(img.Config.Env, exllamaEnv...)
+		case utils.BackendMamba:
+			mambaEnv := []string{
+				"EXTERNAL_GRPC_BACKENDS=mamba:/tmp/localai/backend/python/mamba/backend_mamba.py",
+				"CUDA_HOME=/usr/local/cuda",
+			}
+			img.Config.Env = append(img.Config.Env, mambaEnv...)
 		}
 	}
diff --git a/pkg/build/build.go b/pkg/build/build.go
index ee157936..f39926a3 100644
--- a/pkg/build/build.go
+++ b/pkg/build/build.go
@@ -128,19 +128,19 @@ func validateConfig(c *config.Config) error {
 		return errors.New("no models defined")
 	}
 
-	if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
-		return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
+	if len(c.Backends) > 1 {
+		return errors.New("only one backend is supported at this time")
 	}
 
-	if slices.Contains(c.Backends, utils.BackendExllama) && slices.Contains(c.Backends, utils.BackendExllamaV2) {
-		return errors.New("cannot specify both exllama and exllamav2 at this time")
+	if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
+		return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
 	}
 
-	if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) && c.Runtime != utils.RuntimeNVIDIA {
-		return errors.New("exllama only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
+	if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba)) && c.Runtime != utils.RuntimeNVIDIA {
+		return errors.New("exllama and mamba only support the nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
 	}
 
-	backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion}
+	backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba}
 	for _, b := range c.Backends {
 		if !slices.Contains(backends, b) {
 			return errors.Errorf("backend %s is not supported", b)
diff --git a/pkg/utils/const.go b/pkg/utils/const.go
index 4f33c4b0..47e961c2 100644
--- a/pkg/utils/const.go
+++ b/pkg/utils/const.go
@@ -9,6 +9,7 @@ const (
 	BackendStableDiffusion = "stablediffusion"
 	BackendExllama         = "exllama"
 	BackendExllamaV2       = "exllama2"
+	BackendMamba           = "mamba"
 
 	APIv1alpha1 = "v1alpha1"
 )
diff --git a/test/aikitfile-mamba.yaml b/test/aikitfile-mamba.yaml
new file mode 100644
index 00000000..b76a849c
--- /dev/null
+++ b/test/aikitfile-mamba.yaml
@@ -0,0 +1,33 @@
+#syntax=aikit:test
+apiVersion: v1alpha1
+debug: true
+runtime: cuda
+backends:
+  - mamba
+models:
+  - name: mamba-chat/config.json
+    source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/config.json
+  - name: mamba-chat/pytorch_model.bin
+    source: https://huggingface.co/havenhq/mamba-chat/resolve/d343f8ade4c870d916b362746dd23821aae132dd/pytorch_model.bin
+    sha256: 6751a8c3888564a90a7f759a620e2ddfc1ab2cc3e919f2cbaf7bfc41cc5f85e7
+  - name: mamba-chat/tokenizer.json
+    source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/tokenizer.json
+  - name: mamba-chat/tokenizer_config.json
+    source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/tokenizer_config.json
+config: |
+  - name: mamba-chat
+    backend: mamba
+    parameters:
+      model: /models/mamba-chat
+    trimsuffix:
+      - <|endoftext|>
+    template:
+      chat_message: |
+        {{if eq .RoleName \"assistant\"}}<|assistant|>{{else if eq .RoleName \"system\"}}<|system|>{{else if eq .RoleName \"user\"}}<|user|>{{end}}
+        {{if .Content}}{{.Content}}{{end}}
+
+      chat: |
+        {{.Input}}
+        <|assistant|>
+      completion: |
+        {{.Input}}
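
For a quick manual check of the new backend, the smoke test from the GPU workflow above can be reproduced locally. This is a minimal sketch, assuming an image has already been built from test/aikitfile-mamba.yaml on a host with an NVIDIA GPU; the testmodel:test tag and host port 8080 are illustrative, matching the CI job:

    # start the container with GPU access, as the CI job does
    docker run --name testmodel -d --rm -p 8080:8080 --gpus all testmodel:test

    # query the OpenAI-compatible chat endpoint; the model name comes from the aikitfile config
    curl http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
      "model": "mamba-chat",
      "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
    }'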