feat: add mamba backend (#109)
Signed-off-by: Sertac Ozercan <[email protected]>
sozercan authored Jan 25, 2024
1 parent 62bbc88 commit f85b5c3
Showing 8 changed files with 117 additions and 35 deletions.
28 changes: 25 additions & 3 deletions .github/workflows/test-docker-gpu.yaml
@@ -15,11 +15,15 @@ jobs:
matrix:
backend:
- llama-cuda
# - exllama
# - exllama # https://github.com/sozercan/aikit/issues/94
- exllama2-gptq
- exllama2-exl2
- mamba
steps:
- uses: AutoModality/action-clean@11d611e7824ef8f2fe7f05a117d1ffe4c1a090f0 # v1.1.1
- name: cleanup workspace
run: |
rm -rf ./* || true
rm -rf ./.??* || true
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

- name: build aikit
@@ -40,8 +44,9 @@ jobs:
run: docker run --name testmodel -d --rm -p 8080:8080 --gpus all testmodel:test

- name: run test
if: matrix.backend != 'mamba'
run: |
result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama-2-7b-chat",
"messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
}')
@@ -52,13 +57,30 @@
exit 1
fi
- name: run test
if: matrix.backend == 'mamba'
run: |
result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "mamba-chat",
"messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
}')
echo $result
choices=$(echo "$result" | jq '.choices')
if [ -z "$choices" ]; then
exit 1
fi
- name: save logs
if: always()
run: docker logs testmodel > /tmp/docker-${{ matrix.backend }}.log

- run: docker stop testmodel
if: always()

- run: docker system prune -a -f --volumes
if: always()

- name: publish test artifacts
if: always()
uses: actions/upload-artifact@694cdabd8bdb0f10b2cea11669e1bf5453eed0a6 # v4.2.0
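
The new mamba test step exercises the same OpenAI-compatible /v1/chat/completions endpoint as the llama test, only against the mamba-chat model and with a larger retry budget. A minimal Go sketch of an equivalent smoke test (the endpoint, model name, prompt, and the .choices check are taken from the workflow above; everything else is illustrative):

    package main

    import (
        "bytes"
        "encoding/json"
        "fmt"
        "net/http"
        "os"
    )

    func main() {
        // Same request body the workflow sends with curl.
        body, _ := json.Marshal(map[string]any{
            "model":    "mamba-chat",
            "messages": []map[string]string{{"role": "user", "content": "explain kubernetes in a sentence"}},
        })

        resp, err := http.Post("http://127.0.0.1:8080/v1/chat/completions", "application/json", bytes.NewReader(body))
        if err != nil {
            fmt.Fprintln(os.Stderr, err)
            os.Exit(1)
        }
        defer resp.Body.Close()

        // Mirror the jq '.choices' check: fail when no choices come back.
        var out struct {
            Choices []json.RawMessage `json:"choices"`
        }
        if err := json.NewDecoder(resp.Body).Decode(&out); err != nil || len(out.Choices) == 0 {
            fmt.Fprintln(os.Stderr, "no choices in response")
            os.Exit(1)
        }
        fmt.Println("ok")
    }
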
4 changes: 2 additions & 2 deletions .github/workflows/test-docker.yaml
@@ -78,7 +78,7 @@ jobs:
- name: run llama test
if: matrix.backend == 'llama'
run: |
result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama-2-7b-chat",
"messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
}')
@@ -92,7 +92,7 @@ jobs:
- name: run stablediffusion test
if: matrix.backend == 'stablediffusion'
run: |
result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/images/generations -H "Content-Type: application/json" -d '{
result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/images/generations -H "Content-Type: application/json" -d '{
"prompt": "A cute baby llama",
"size": "256x256"
}')
2 changes: 1 addition & 1 deletion .github/workflows/test-kubernetes.yaml
@@ -84,7 +84,7 @@ jobs:
- name: run test
run: |
result=$(curl --fail --retry 5 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
result=$(curl --fail --retry 10 --retry-all-errors http://127.0.0.1:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama-2-7b-chat",
"messages": [{"role": "user", "content": "explain kubernetes in a sentence"}]
}')
61 changes: 40 additions & 21 deletions pkg/aikit2llb/convert.go
@@ -32,7 +32,7 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) {

// install cuda if runtime is nvidia
if c.Runtime == utils.RuntimeNVIDIA {
merge = installCuda(c, state, merge)
state, merge = installCuda(c, state, merge)
}

// install opencv and friends if stable diffusion backend is being used
@@ -43,6 +43,8 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) {
merge = installExllama(c, state, merge)
case utils.BackendStableDiffusion:
merge = installOpenCV(state, merge)
case utils.BackendMamba:
merge = installMamba(state, merge)
}
}

@@ -51,26 +53,14 @@ func Aikit2LLB(c *config.Config) (llb.State, *specs.Image) {
}

func getBaseImage(c *config.Config) llb.State {
for b := range c.Backends {
switch c.Backends[b] {
case utils.BackendExllama:
case utils.BackendExllamaV2:
return llb.Image(debianSlim)
case utils.BackendStableDiffusion:
return llb.Image(debianSlim)
}
if len(c.Backends) > 0 {
return llb.Image(debianSlim)
}
return llb.Image(distrolessBase)
}

func copyModels(c *config.Config, base llb.State, s llb.State) (llb.State, llb.State) {
savedState := s

// create config file if defined
if c.Config != "" {
s = s.Run(shf("echo -n \"%s\" > /config.yaml", c.Config)).Root()
}

for _, model := range c.Models {
var opts []llb.HTTPOption
opts = append(opts, llb.Filename(fileNameFromURL(model.Source)))
@@ -104,6 +94,12 @@ func copyModels(c *config.Config, base llb.State, s llb.State) (llb.State, llb.S
}
}
}

// create config file if defined
if c.Config != "" {
s = s.Run(shf("echo -n \"%s\" > /config.yaml", c.Config)).Root()
}

diff := llb.Diff(savedState, s)
merge := llb.Merge([]llb.State{base, diff})
return s, merge
@@ -117,18 +113,19 @@ func fileNameFromURL(urlString string) string {
return path.Base(parsedURL.Path)
}

func installCuda(c *config.Config, s llb.State, merge llb.State) llb.State {
func installCuda(c *config.Config, s llb.State, merge llb.State) (llb.State, llb.State) {
cudaKeyringURL := "https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb"
cudaKeyring := llb.HTTP(cudaKeyringURL)
s = s.File(
llb.Copy(cudaKeyring, fileNameFromURL(cudaKeyringURL), "/"),
llb.WithCustomName("Copying "+fileNameFromURL(cudaKeyringURL)), //nolint: goconst
)
s = s.Run(sh("dpkg -i cuda-keyring_1.1-1_all.deb && rm cuda-keyring_1.1-1_all.deb")).Root()

savedState := s
// running apt-get update twice due to nvidia repo
s = s.Run(sh("apt-get update && apt-get install -y ca-certificates && apt-get update"), llb.IgnoreCache).Root()

savedState := s
// install cuda libraries
if len(c.Backends) == 0 {
s = s.Run(shf("apt-get install -y --no-install-recommends libcublas-%[1]s cuda-cudart-%[1]s && apt-get clean", cudaVersion)).Root()
@@ -149,20 +146,25 @@ func installCuda(c *config.Config, s llb.State, merge llb.State) llb.State {

s = s.Run(sh(exllamaDeps)).Root()
}

if c.Backends[b] == utils.BackendMamba {
mambaDeps := fmt.Sprintf("apt-get install -y --no-install-recommends cuda-crt-%[1]s cuda-cudart-dev-%[1]s cuda-nvcc-%[1]s && apt-get clean", cudaVersion)
s = s.Run(sh(mambaDeps)).Root()
}
}

diff := llb.Diff(savedState, s)
return llb.Merge([]llb.State{merge, diff})
return s, llb.Merge([]llb.State{merge, diff})
}

func installExllama(c *config.Config, s llb.State, merge llb.State) llb.State {
backend := "exllama"
backend := utils.BackendExllama
exllamaRepo := "https://github.com/turboderp/exllama"
exllamaTag := "master"
for b := range c.Backends {
if c.Backends[b] == utils.BackendExllamaV2 {
exllamaRepo = "https://github.com/turboderp/exllamav2"
backend = "exllama2"
backend = utils.BackendExllamaV2
exllamaTag = "v0.0.11"
}
}
@@ -171,7 +173,7 @@ func installExllama(c *config.Config, s llb.State, merge llb.State) llb.State {
s = s.Run(sh("apt-get update && apt-get install --no-install-recommends -y git ca-certificates python3-pip python3-dev g++ && apt-get clean"), llb.IgnoreCache).Root()

// clone localai exllama backend only
s = s.Run(shf("git clone --filter=blob:none --no-checkout %[1]s /tmp/localai/ && cd /tmp/localai && git sparse-checkout init --cone && git sparse-checkout set backend/python/%[2]s && git checkout %[3]s && rm -rf .git", localAIRepo, backend, localAIVersion)).Root()
s = cloneLocalAI(s, backend)

// clone exllama to localai exllama backend path and install python dependencies
s = s.Run(shf("git clone --depth 1 %[1]s --branch %[2]s /tmp/%[3]s && mv /tmp/%[3]s/* /tmp/localai/backend/python/%[3]s && rm -rf /tmp/%[3]s && cd /tmp/localai/backend/python/%[3]s && rm -rf .git && pip3 install grpcio protobuf typing-extensions sympy mpmath setuptools numpy --break-system-packages && pip3 install -r /tmp/localai/backend/python/%[3]s/requirements.txt --break-system-packages", exllamaRepo, exllamaTag, backend)).Root()
@@ -180,6 +182,19 @@ func installExllama(c *config.Config, s llb.State, merge llb.State) llb.State {
return llb.Merge([]llb.State{merge, diff})
}

func installMamba(s llb.State, merge llb.State) llb.State {
savedState := s
// libexpat1 is a requirement but git is not; however, libexpat1 comes in as a dependency of git
s = s.Run(sh("apt-get install --no-install-recommends -y git python3 python3-dev python3-pip libssl3 openssl && apt-get clean"), llb.IgnoreCache).Root()

s = cloneLocalAI(s, utils.BackendMamba)

s = s.Run(shf("pip3 install packaging numpy torch==2.1.0 grpcio protobuf --break-system-packages && pip3 install causal-conv1d==1.0.0 mamba-ssm==1.0.1 --break-system-packages")).Root()

diff := llb.Diff(savedState, s)
return llb.Merge([]llb.State{merge, diff})
}

func installOpenCV(s llb.State, merge llb.State) llb.State {
savedState := s
// adding debian 11 (bullseye) repo due to opencv 4.5 requirement
@@ -233,6 +248,10 @@ func addLocalAI(c *config.Config, s llb.State, merge llb.State) (llb.State, llb.
return s, llb.Merge([]llb.State{merge, diff})
}

func cloneLocalAI(s llb.State, backend string) llb.State {
return s.Run(shf("git clone --filter=blob:none --no-checkout %[1]s /tmp/localai/ && cd /tmp/localai && git sparse-checkout init --cone && git sparse-checkout set backend/python/%[2]s && git checkout %[3]s && rm -rf .git", localAIRepo, backend, localAIVersion)).Root()
}

func shf(cmd string, v ...interface{}) llb.RunOption {
return llb.Args([]string{"/bin/sh", "-c", fmt.Sprintf(cmd, v...)})
}
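
installMamba follows the same LLB layering pattern as the other install helpers in convert.go: snapshot the builder state, run the install commands, diff the result against the snapshot, and merge only that diff into the image being assembled. A standalone sketch of the pattern using BuildKit's client/llb package (the helper below is illustrative, not the repository's exact code):

    package main

    import (
        "github.com/moby/buildkit/client/llb"
    )

    // installLayer mirrors the snapshot/run/diff/merge shape of installMamba:
    // only the filesystem changes produced by installCmd end up as a new layer
    // on top of the image assembled so far.
    func installLayer(s llb.State, merge llb.State, installCmd string) (llb.State, llb.State) {
        savedState := s

        // Run the install command on top of the builder state.
        s = s.Run(llb.Args([]string{"/bin/sh", "-c", installCmd})).Root()

        // Diff captures only what the command changed relative to savedState.
        diff := llb.Diff(savedState, s)

        // Merge stacks that diff onto the output image.
        return s, llb.Merge([]llb.State{merge, diff})
    }

    func main() {
        base := llb.Image("docker.io/library/debian:bookworm-slim")
        out := base
        _, out = installLayer(base, out, "apt-get update && apt-get install -y --no-install-recommends python3-pip")
        _ = out // in aikit, the resulting state is marshalled by the BuildKit frontend
    }
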
9 changes: 8 additions & 1 deletion pkg/aikit2llb/image.go
@@ -46,12 +46,19 @@ func emptyImage(c *config.Config) *specs.Image {
}

for b := range c.Backends {
if c.Backends[b] == utils.BackendExllama || c.Backends[b] == utils.BackendExllamaV2 {
switch c.Backends[b] {
case utils.BackendExllama, utils.BackendExllamaV2:
exllamaEnv := []string{
"EXTERNAL_GRPC_BACKENDS=exllama:/tmp/localai/backend/python/exllama/exllama.py,exllama2:/tmp/localai/backend/python/exllama2/exllama2_backend.py",
"CUDA_HOME=/usr/local/cuda",
}
img.Config.Env = append(img.Config.Env, exllamaEnv...)
case utils.BackendMamba:
mambaEnv := []string{
"EXTERNAL_GRPC_BACKENDS=mamba:/tmp/localai/backend/python/mamba/backend_mamba.py",
"CUDA_HOME=/usr/local/cuda",
}
img.Config.Env = append(img.Config.Env, mambaEnv...)
}
}

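
EXTERNAL_GRPC_BACKENDS is a comma-separated list of name:path entries that LocalAI uses to register external backends, so the new mamba case points the mamba backend at the backend_mamba.py script cloned by convert.go. An illustrative Go sketch of how such a value splits into a backend map (the parsing code is hypothetical; only the variable's format comes from the diff):

    package main

    import (
        "fmt"
        "strings"
    )

    // parseExternalBackends splits a value such as
    // "mamba:/tmp/localai/backend/python/mamba/backend_mamba.py"
    // into a backend-name -> script-path map.
    func parseExternalBackends(v string) map[string]string {
        backends := map[string]string{}
        for _, entry := range strings.Split(v, ",") {
            name, path, ok := strings.Cut(entry, ":")
            if !ok {
                continue // skip malformed entries
            }
            backends[name] = path
        }
        return backends
    }

    func main() {
        env := "EXTERNAL_GRPC_BACKENDS=mamba:/tmp/localai/backend/python/mamba/backend_mamba.py"
        _, value, _ := strings.Cut(env, "=")
        fmt.Println(parseExternalBackends(value))
    }
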
14 changes: 7 additions & 7 deletions pkg/build/build.go
@@ -128,19 +128,19 @@ func validateConfig(c *config.Config) error {
return errors.New("no models defined")
}

if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
if len(c.Backends) > 1 {
return errors.New("only one backend is supported at this time")
}

if slices.Contains(c.Backends, utils.BackendExllama) && slices.Contains(c.Backends, utils.BackendExllamaV2) {
return errors.New("cannot specify both exllama and exllamav2 at this time")
if slices.Contains(c.Backends, utils.BackendStableDiffusion) && (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) {
return errors.New("cannot specify both stablediffusion with exllama or exllama2 at this time")
}

if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2)) && c.Runtime != utils.RuntimeNVIDIA {
return errors.New("exllama only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
if (slices.Contains(c.Backends, utils.BackendExllama) || slices.Contains(c.Backends, utils.BackendExllamaV2) || slices.Contains(c.Backends, utils.BackendMamba)) && c.Runtime != utils.RuntimeNVIDIA {
return errors.New("exllama and mamba only supports nvidia cuda runtime. please add 'runtime: cuda' to your aikitfile.yaml")
}

backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion}
backends := []string{utils.BackendExllama, utils.BackendExllamaV2, utils.BackendStableDiffusion, utils.BackendMamba}
for _, b := range c.Backends {
if !slices.Contains(backends, b) {
return errors.Errorf("backend %s is not supported", b)
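
validateConfig now allows only a single backend per aikitfile and requires the CUDA runtime for exllama, exllama2, and the new mamba backend. A hedged sketch of how the new rules could be exercised from an in-package test (validateConfig is unexported, and the config.Config / config.Model field names below are assumed from the aikitfile keys rather than read from pkg/config):

    package build

    import (
        "testing"

        "github.com/sozercan/aikit/pkg/config"
        "github.com/sozercan/aikit/pkg/utils"
    )

    func TestValidateMambaConfig(t *testing.T) {
        // Assumed field names: Backends, Runtime, Models (Name/Source).
        c := &config.Config{
            Runtime:  utils.RuntimeNVIDIA,
            Backends: []string{utils.BackendMamba, utils.BackendStableDiffusion},
            Models:   []config.Model{{Name: "mamba-chat", Source: "https://example.com/model.bin"}},
        }

        // More than one backend is rejected.
        if err := validateConfig(c); err == nil {
            t.Fatal("expected an error for multiple backends")
        }

        // Mamba without the cuda runtime is rejected.
        c.Backends = []string{utils.BackendMamba}
        c.Runtime = ""
        if err := validateConfig(c); err == nil {
            t.Fatal("expected an error when runtime is not cuda")
        }
    }
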
1 change: 1 addition & 0 deletions pkg/utils/const.go
@@ -9,6 +9,7 @@ const (
BackendStableDiffusion = "stablediffusion"
BackendExllama = "exllama"
BackendExllamaV2 = "exllama2"
BackendMamba = "mamba"

APIv1alpha1 = "v1alpha1"
)
33 changes: 33 additions & 0 deletions test/aikitfile-mamba.yaml
@@ -0,0 +1,33 @@
#syntax=aikit:test
apiVersion: v1alpha1
debug: true
runtime: cuda
backends:
- mamba
models:
- name: mamba-chat/config.json
source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/config.json
- name: mamba-chat/pytorch_model.bin
source: https://huggingface.co/havenhq/mamba-chat/resolve/d343f8ade4c870d916b362746dd23821aae132dd/pytorch_model.bin
sha256: 6751a8c3888564a90a7f759a620e2ddfc1ab2cc3e919f2cbaf7bfc41cc5f85e7
- name: mamba-chat/tokenizer.json
source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/tokenizer.json
- name: mamba-chat/tokenizer_config.json
source: https://huggingface.co/havenhq/mamba-chat/raw/d343f8ade4c870d916b362746dd23821aae132dd/tokenizer_config.json
config: |
- name: mamba-chat
backend: mamba
parameters:
model: /models/mamba-chat
trimsuffix:
- <|endoftext|>
template:
chat_message: |
{{if eq .RoleName \"assistant\"}}<|assistant|>{{else if eq .RoleName \"system\"}}<|system|>{{else if eq .RoleName \"user\"}}<|user|>{{end}}
{{if .Content}}{{.Content}}{{end}}
</s>
chat: |
{{.Input}}
<|assistant|>
completion: |
{{.Input}}
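
The chat_message template above is standard Go text/template syntax over .RoleName and .Content; its inner quotes are written as \" because copyModels embeds the config string in a shell echo when writing /config.yaml. A minimal sketch of how the template renders (the chatMessage struct is illustrative; LocalAI's real template data carries more fields, and the quotes below are escaped only for the Go string literal):

    package main

    import (
        "os"
        "text/template"
    )

    // chatMessage mirrors the fields the aikitfile template references.
    type chatMessage struct {
        RoleName string
        Content  string
    }

    func main() {
        // Same template as the aikitfile's chat_message block.
        const tmpl = "{{if eq .RoleName \"assistant\"}}<|assistant|>{{else if eq .RoleName \"system\"}}<|system|>{{else if eq .RoleName \"user\"}}<|user|>{{end}}\n" +
            "{{if .Content}}{{.Content}}{{end}}\n" +
            "</s>\n"

        t := template.Must(template.New("chat_message").Parse(tmpl))
        // Renders: <|user|> on one line, the content on the next, then the </s> stop token.
        _ = t.Execute(os.Stdout, chatMessage{RoleName: "user", Content: "explain kubernetes in a sentence"})
    }
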
