From f85b5c31d202e4b2f395520c5402b1cda4762d0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Serta=C3=A7=20=C3=96zercan?= <852750+sozercan@users.noreply.github.com> Date: Wed, 24 Jan 2024 20:59:10 -0800 Subject: [PATCH] feat: add mamba backend (#109) Signed-off-by: Sertac Ozercan --- .github/workflows/test-docker-gpu.yaml | 28 ++++++++++-- .github/workflows/test-docker.yaml | 4 +- .github/workflows/test-kubernetes.yaml | 2 +- pkg/aikit2llb/convert.go | 61 +++++++++++++++++--------- pkg/aikit2llb/image.go | 9 +++- pkg/build/build.go | 14 +++--- pkg/utils/const.go | 1 + test/aikitfile-mamba.yaml | 33 ++++++++++++++ 8 files changed, 117 insertions(+), 35 deletions(-) create mode 100644 test/aikitfile-mamba.yaml diff --git a/.github/workflows/test-docker-gpu.yaml b/.github/workflows/test-docker-gpu.yaml index e0e60ed9..09fd8572 100644 --- a/.github/workflows/test-docker-gpu.yaml +++ b/.github/workflows/test-docker-gpu.yaml @@ -15,11 +15,15 @@ jobs: matrix: backend: - llama-cuda - # - exllama + # - exllama # https://github.com/sozercan/aikit/issues/94 - exllama2-gptq - exllama2-exl2 + - mamba steps: - - uses: AutoModality/action-clean@11d611e7824ef8f2fe7f05a117d1ffe4c1a090f0 # v1.1.1 + - name: cleanup workspace + run: | + rm -rf ./* || true + rm -rf ./.??* || true - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: build aikit @@ -40,8 +44,9 @@ jobs: run: docker run --name testmodel -d --rm -p 8080:8080 --gpus all testmodel:test - name: run test + if: matrix.backend != 'mamba' run: | - result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "llama-2-7b-chat", "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}] }') @@ -52,6 +57,20 @@ jobs: exit 1 fi + - name: run test + if: matrix.backend == 'mamba' + run: | + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "mamba-chat", + "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}] + }') + echo $result + + choices=$(echo "$result" | jq '.choices') + if [ -z "$choices" ]; then + exit 1 + fi + - name: save logs if: always() run: docker logs testmodel > /tmp/docker-${{ matrix.backend }}.log @@ -59,6 +78,9 @@ jobs: - run: docker stop testmodel if: always() + - run: docker system prune -a -f --volumes + if: always() + - name: publish test artifacts if: always() uses: actions/upload-artifact@694cdabd8bdb0f10b2cea11669e1bf5453eed0a6 # v4.2.0 diff --git a/.github/workflows/test-docker.yaml b/.github/workflows/test-docker.yaml index 4fd14a1a..12bb77fd 100644 --- a/.github/workflows/test-docker.yaml +++ b/.github/workflows/test-docker.yaml @@ -78,7 +78,7 @@ jobs: - name: run llama test if: matrix.backend == 'llama' run: | - result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "llama-2-7b-chat", "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}] }') @@ -92,7 +92,7 @@ jobs: - name: run stablediffusion test if: matrix.backend == 'stablediffusion' run: | - result=$(curl --fail --retry 5 --retry-all-errors 
http://127.0.0.1:8080/v1/images/generations -H "Content-Type: application/json" -d '{ + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/images/generations -H "Content-Type: application/json" -d '{ "prompt": "A cute baby llama", "size": "256x256" }') diff --git a/.github/workflows/test-kubernetes.yaml b/.github/workflows/test-kubernetes.yaml index 615cb8ca..961e7dd3 100644 --- a/.github/workflows/test-kubernetes.yaml +++ b/.github/workflows/test-kubernetes.yaml @@ -84,7 +84,7 @@ jobs: - name: run test run: | - result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ "model": "llama-2-7b-chat", "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}] }') diff --git a/pkg/aikit2llb/convert.go b/pkg/aikit2llb/convert.go index 91c22d1e..c000d5c6 100644 --- a/pkg/aikit2llb/convert.go +++ b/pkg/aikit2llb/convert.go @@ -32,7 +32,7 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) { // install cuda if runtime is nvidia if c.Runtime == utils.RuntimeNVIDIA { - merge = installCuda(c, state, merge) + state, merge = installCuda(c, state, merge) } // install opencv and friends if stable diffusion backend is being used @@ -43,6 +43,8 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) { merge = installExllama(c, state, merge) case utils.BackendStableDiffusion: merge = installOpenCV(state, merge) + case utils.BackendMamba: + merge = installMamba(state, merge) } } @@ -51,26 +53,14 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) { } func getBaseImage(c *config.Config) llb.State { - for b := range c.Backends { - switch c.Backends[b] { - case utils.BackendExllama: - case utils.BackendExllamaV2: - return llb.Image(debianSlim) - case utils.BackendStableDiffusion: - return llb.Image(debianSlim) - } + if len(c.Backends) > 0 { + return llb.Image(debianSlim) } return llb.Image(distrolessBase) } func copyModels(c *config.Config, base llb.State, s llb.State) (llb.State, llb.State) { savedState := s - - // create config file if defined - if c.Config != "" { - s = s.Run(shf("echo -n \"%s\" > /config.yaml", c.Config)).Root() - } - for _, model := range c.Models { var opts []llb.HTTPOption opts = append(opts, llb.Filename(fileNameFromURL(model.Source))) @@ -104,6 +94,12 @@ func copyModels(c *config.Config, base llb.State, s llb.State) (llb.State, llb.S } } } + + // create config file if defined + if c.Config != "" { + s = s.Run(shf("echo -n \"%s\" > /config.yaml", c.Config)).Root() + } + diff := llb.Diff(savedState, s) merge := llb.Merge([]llb.State{base, diff}) return s, merge @@ -117,7 +113,7 @@ func fileNameFromURL(urlString string) string { return path.Base(parsedURL.Path) } -func installCuda(c *config.Config, s llb.State, merge llb.State) llb.State { +func installCuda(c *config.Config, s llb.State, merge llb.State) (llb.State, llb.State) { cudaKeyringURL := "https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb" cudaKeyring := llb.HTTP(cudaKeyringURL) s = s.File( @@ -125,10 +121,11 @@ func installCuda(c *config.Config, s llb.State, merge llb.State) llb.State { llb.WithCustomName("Copying "+fileNameFromURL(cudaKeyringURL)), //nolint: goconst ) s = s.Run(sh("dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb")).Root() + + savedState := s // running apt-get update 
twice due to nvidia repo
 	s = s.Run(sh("apt-get update && apt-get install -y ca-certificates && apt-get update"), llb.IgnoreCache).Root()
-	savedState := s
 	// install cuda libraries
 	if len(c.Backends) == 0 {
 		s = s.Run(shf("apt-get install -y --no-install-recommends libcublas-%[1]s cuda-cudart-%[1]s && apt-get clean", cudaVersion)).Root()
@@ -149,20 +146,25 @@
 			s = s.Run(sh(exllamaDeps)).Root()
 		}
+
+		if c.Backends[b] == utils.BackendMamba {
+			mambaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-crt-%[1]s cuda-cudart-dev-%[1]s cuda-nvcc-%[1]s && apt-get clean", cudaVersion)
+			s = s.Run(sh(mambaDeps)).Root()
+		}
 	}
 
 	diff := llb.Diff(savedState, s)
-	return llb.Merge([]llb.State{merge, diff})
+	return s, llb.Merge([]llb.State{merge, diff})
 }
 
 func installExllama(c *config.Config, s llb.State, merge llb.State) llb.State {
-	backend := "exllama"
+	backend := utils.BackendExllama
 	exllamaRepo := "https://github.com/turboderp/exllama"
 	exllamaTag := "master"
 	for b := range c.Backends {
 		if c.Backends[b] == utils.BackendExllamaV2 {
 			exllamaRepo = "https://github.com/turboderp/exllamav2"
-			backend = "exllama2"
+			backend = utils.BackendExllamaV2
 			exllamaTag = "v0.0.11"
 		}
 	}
@@ -171,7 +173,7 @@
 	s = s.Run(sh("apt-get update && apt-get install --no-install-recommends -y git ca-certificates python3-pip python3-dev g++ && apt-get clean"), llb.IgnoreCache).Root()
 
 	// clone localai exllama backend only
-	s = s.Run(shf("git clone --filter=blob:none --no-checkout %[1]s /tmp/localai/ && cd /tmp/localai && git sparse-checkout init --cone && git sparse-checkout set backend/python/%[2]s && git checkout %[3]s && rm -rf .git", localAIRepo, backend, localAIVersion)).Root()
+	s = cloneLocalAI(s, backend)
 
 	// clone exllama to localai exllama backend path and install python dependencies
 	s = s.Run(shf("git clone --depth 1 %[1]s --branch %[2]s /tmp/%[3]s && mv /tmp/%[3]s/* /tmp/localai/backend/python/%[3]s && rm -rf /tmp/%[3]s && cd /tmp/localai/backend/python/%[3]s && rm -rf .git && pip3 install grpcio protobuf typing-extensions sympy mpmath setuptools numpy --break-system-packages && pip3 install -r /tmp/localai/backend/python/%[3]s/requirements.txt --break-system-packages", exllamaRepo, exllamaTag, backend)).Root()
@@ -180,6 +182,19 @@
 	return llb.Merge([]llb.State{merge, diff})
 }
 
+func installMamba(s llb.State, merge llb.State) llb.State {
+	savedState := s
+	// libexpat1 is a requirement but git is not; however, libexpat1 is a dependency of git
+	s = s.Run(sh("apt-get install --no-install-recommends -y git python3 python3-dev python3-pip libssl3 openssl && apt-get clean"), llb.IgnoreCache).Root()
+
+	s = cloneLocalAI(s, utils.BackendMamba)
+
+	s = s.Run(shf("pip3 install packaging numpy torch==2.1.0 grpcio protobuf --break-system-packages && pip3 install causal-conv1d==1.0.0 mamba-ssm==1.0.1 --break-system-packages")).Root()
+
+	diff := llb.Diff(savedState, s)
+	return llb.Merge([]llb.State{merge, diff})
+}
+
 func installOpenCV(s llb.State, merge llb.State) llb.State {
 	savedState := s
 	// adding debian 11 (bullseye) repo due to opencv 4.5 requirement
@@ -233,6 +248,10 @@ func addLocalAI(c *config.Config, s llb.State, merge llb.State) (llb.State, llb.
 	return s, llb.Merge([]llb.State{merge, diff})
 }
 
+func cloneLocalAI(s llb.State, backend string) llb.State {
+	return s.Run(shf("git clone --filter=blob:none --no-checkout %[1]s /tmp/localai/ && cd /tmp/localai && git sparse-checkout init --cone && git sparse-checkout set backend/python/%[2]s && git checkout %[3]s && rm -rf .git", localAIRepo, backend, localAIVersion)).Root()
+}
+
 func shf(cmd string, v ...interface{}) llb.RunOption {
 	return llb.Args([]string{"/bin/sh", "-c", fmt.Sprintf(cmd, v...)})
 }
diff --git a/pkg/aikit2llb/image.go b/pkg/aikit2llb/image.go
index 63212179..9ca5bd44 100644
--- a/pkg/aikit2llb/image.go
+++ b/pkg/aikit2llb/image.go
@@ -46,12 +46,19 @@ func emptyImage(c *config.Config) *specs.Image {
 	}
 
 	for b := range c.Backends {
-		if c.Backends[b] == utils.BackendExllama || c.Backends[b] == utils.BackendExllamaV2 {
+		switch c.Backends[b] {
+		case utils.BackendExllama, utils.BackendExllamaV2:
 			exllamaEnv := []string{
 				"EXTERNAL_GRPC_BACKENDS=exllama:/tmp/localai/backend/python/exllama/exllama.py,exllama2:/tmp/localai/backend/python/exllama2/exllama2_backend.py",
 				"CUDA_HOME=/usr/local/cuda",
 			}
 			img.Config.Env = append(img.Config.Env, exllamaEnv...)
+		case utils.BackendMamba:
+			mambaEnv := []string{
+				"EXTERNAL_GRPC_BACKENDS=mamba:/tmp/localai/backend/python/mamba/backend_mamba.py",
+				"CUDA_HOME=/usr/local/cuda",
+			}
+			img.Config.Env = append(img.Config.Env, mambaEnv...)
 		}
 	}
diff --git a/pkg/build/build.go b/pkg/build/build.go
index ee157936..f39926a3 100644
--- a/pkg/build/build.go
+++ b/pkg/build/build.go
@@ -128,19 +128,19 @@ func validateConfig(c *config.Config) error {
 		return errors.New("no models defined")
 	}
 
-	if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
-		return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
+	if len(c.Backends) > 1 {
+		return errors.New("only one backend is supported at this time")
 	}
 
-	if slices.Contains(c.Backends, utils.BackendExllama) && slices.Contains(c.Backends, utils.BackendExllamaV2) {
-		return errors.New("cannot specify both exllama and exllamav2 at this time")
+	if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
+		return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
 	}
 
-	if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) && c.Runtime != utils.RuntimeNVIDIA {
-		return errors.New("exllama only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
+	if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba)) && c.Runtime != utils.RuntimeNVIDIA {
+		return errors.New("exllama and mamba only support the nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
 	}
 
-	backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion}
+	backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba}
 	for _, b := range c.Backends {
 		if !slices.Contains(backends, b) {
 			return errors.Errorf("backend %s is not supported", b)
diff --git a/pkg/utils/const.go b/pkg/utils/const.go
index 4f33c4b0..47e961c2 100644
--- a/pkg/utils/const.go
+++ b/pkg/utils/const.go
@@ -9,6 +9,7 @@ const (
 	BackendStableDiffusion = "stablediffusion"
 	BackendExllama         = "exllama"
 	BackendExllamaV2       = "exllama2"
+	BackendMamba           = "mamba"
 
 	APIv1alpha1 = "v1alpha1"
 )
diff --git a/test/aikitfile-mamba.yaml b/test/aikitfile-mamba.yaml
new file mode 100644
index 00000000..b76a849c
--- /dev/null
+++ b/test/aikitfile-mamba.yaml
@@ -0,0 +1,33 @@
+#syntax=aikit:test
+apiVersion: v1alpha1
+debug: true
+runtime: cuda
+backends:
+  - mamba
+models:
+  - name: mamba-chat/config.json
+    source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/config.json
+  - name: mamba-chat/pytorch_model.bin
+    source: https://huggingface.co/havenhq/mamba-chat/resolve/d343f8ade4c870d916b362746dd23821aae132dd/pytorch_model.bin
+    sha256: 6751a8c3888564a90a7f759a620e2ddfc1ab2cc3e919f2cbaf7bfc41cc5f85e7
+  - name: mamba-chat/tokenizer.json
+    source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/tokenizer.json
+  - name: mamba-chat/tokenizer_config.json
+    source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/tokenizer_config.json
+config: |
+  - name: mamba-chat
+    backend: mamba
+    parameters:
+      model: /models/mamba-chat
+    trimsuffix:
+      - <|endoftext|>
+    template:
+      chat_message: |
+        {{if eq .RoleName \"assistant\"}}<|assistant|>{{else if eq .RoleName \"system\"}}<|system|>{{else if eq .RoleName \"user\"}}<|user|>{{end}}
+        {{if .Content}}{{.Content}}{{end}}
+
+      chat: |
+        {{.Input}}
+        <|assistant|>
+      completion: |
+        {{.Input}}
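
For a quick manual check of the new backend, the smoke test from the GPU workflow above can be reproduced locally. This is a minimal sketch, assuming an image has already been built from test/aikitfile-mamba.yaml on a host with an NVIDIA GPU; the testmodel:test tag and host port 8080 are illustrative, matching the CI job:

    # start the container with GPU access, as the CI job does
    docker run --name testmodel -d --rm -p 8080:8080 --gpus all testmodel:test

    # query the OpenAI-compatible chat endpoint; the model name comes from the aikitfile config
    curl http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
      "model": "mamba-chat",
      "messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
    }'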