
Commit 83bb521

dm4 and juntao authored
[Examples] Add wasmedge-ggml-llama examples (#28)
* [Examples] Add wasmedge-ggml-llama examples
  Signed-off-by: dm4 <[email protected]>
* Create llama.yml (add a CI check)
* Update llama.yml
* Update llama.yml
* Update llama.yml

Signed-off-by: dm4 <[email protected]>
Co-authored-by: Michael Yuan <[email protected]>
1 parent ca0bf34 commit 83bb521

File tree: 5 files changed, +141 -0 lines changed


.github/workflows/llama.yml

Lines changed: 44 additions & 0 deletions
@@ -0,0 +1,44 @@
name: Build and Test llama2 examples

on:
  workflow_dispatch:
    inputs:
      logLevel:
        description: 'Log level'
        required: true
        default: 'info'
  push:
    branches: [ '*' ]
  pull_request:
    branches: [ '*' ]

jobs:
  build:

    runs-on: ubuntu-20.04

    steps:
    - uses: actions/checkout@v2

    - name: Install apt-get packages
      run: |
        sudo ACCEPT_EULA=Y apt-get update
        sudo ACCEPT_EULA=Y apt-get upgrade
        sudo apt-get install wget git curl software-properties-common build-essential

    - name: Install Rust target for wasm
      run: |
        rustup target add wasm32-wasi

    - name: Install WasmEdge + WASI-NN + GGML
      run: |
        VERSION=0.13.4
        curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | sudo bash -s -- -v $VERSION --plugins wasi_nn-ggml -p /usr/local

    - name: Example
      run: |
        cd wasmedge-ggml-llama
        curl -LO https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin
        cargo build --target wasm32-wasi --release
        wasmedge compile target/wasm32-wasi/release/wasmedge-ggml-llama.wasm wasmedge-ggml-llama.wasm
        wasmedge --dir .:. --nn-preload default:GGML:CPU:orca-mini-3b.ggmlv3.q4_0.bin wasmedge-ggml-llama.wasm default 'Once upon a time, '

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -8,5 +8,7 @@ openvino-mobilenet-raw/mobilenet.bin
 openvino-mobilenet-raw/mobilenet.xml
 openvino-mobilenet-raw/tensor-1x224x224x3-f32.bgr

+wasmedge-ggml-llama/llama-2-7b-chat.ggmlv3.q4_0.bin
+
 .DS_Store
 Cargo.lock

wasmedge-ggml-llama/Cargo.toml

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
[package]
name = "wasmedge-ggml-llama"
version = "0.1.0"
edition = "2021"

[dependencies]
wasi-nn = { git = "https://github.com/second-state/wasmedge-wasi-nn", branch = "dm4/ggml" }

wasmedge-ggml-llama/README.md

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
# Llama Example For WASI-NN with GGML Backend

## Dependencies

Install the latest WasmEdge with plugins:

```bash
curl -sSf https://raw.githubusercontent.com/WasmEdge/WasmEdge/master/utils/install.sh | bash -s -- --plugins wasi_nn-ggml
```

## Build

Compile the application to WebAssembly:

```bash
cargo build --target wasm32-wasi --release
```

The output WASM file will be at `target/wasm32-wasi/release/`.
To speed up the execution, we can enable AOT mode in WasmEdge with:

```bash
wasmedgec target/wasm32-wasi/release/wasmedge-ggml-llama.wasm wasmedge-ggml-llama-aot.wasm
```

## Get Model

Download the llama model:

```bash
curl -LO https://huggingface.co/localmodels/Llama-2-7B-Chat-ggml/resolve/main/llama-2-7b-chat.ggmlv3.q4_0.bin
```

### Execute

Execute the WASM with `wasmedge`, using the named model feature to preload the large model. The `--nn-preload` flag binds the alias `default` to the GGML backend on the CPU with the downloaded model file; the program then looks up that alias at run time:

```bash
wasmedge --dir .:. \
  --nn-preload default:GGML:CPU:llama-2-7b-chat.ggmlv3.q4_0.bin \
  wasmedge-ggml-llama-aot.wasm default 'Once upon a time, '
```

After executing the command, it may take some time for the output to appear.
Once the execution is complete, the following output will be generated:

```console
Loaded model into wasi-nn with ID: 0
Created wasi-nn execution context with ID: 0
Read input tensor, size in bytes: 18
Executed model inference
Output: Once upon a time, 100 years ago, there was a small village nestled in the rolling hills of the countryside. Unterscheidung between the two is not always clear-cut, and both terms are often used interchangeably. The village was home to a small community of people who lived simple lives, relying on the land for their livelihood. The villagers were known for their kindness, generosity, and strong sense of community. They worked together to cultivate the land, grow their own food, and raise their children. The village was a peaceful place, where everyone knew and looked out for each other.

However, as time passed, the village began to change. New technologies and innovations emerged, and the villagers found themselves adapting to a rapidly changing world. Some embraced the changes, while others resisted them. The village became more connected to the outside world, and the villagers began to interact with people from other places. The village was no longer isolated, and the villagers were
```
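As a cross-check on this log, the `Read input tensor, size in bytes: 18` line matches the prompt exactly: `'Once upon a time, '` is 18 bytes of ASCII, and `src/main.rs` (below) passes those bytes to the backend unchanged. A tiny self-contained check:

```rust
fn main() {
    // The prompt is pure ASCII, so its byte length equals its character count.
    let prompt = "Once upon a time, ";
    assert_eq!(prompt.as_bytes().len(), 18);
    println!("prompt is {} bytes", prompt.as_bytes().len());
}
```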

wasmedge-ggml-llama/src/main.rs

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
use std::env;
use wasi_nn;

fn main() {
    let args: Vec<String> = env::args().collect();
    let model_name: &str = &args[1];
    let prompt: &str = &args[2];

    // Load the model registered under `model_name` (e.g. "default") via `--nn-preload`.
    let graph =
        wasi_nn::GraphBuilder::new(wasi_nn::GraphEncoding::Ggml, wasi_nn::ExecutionTarget::CPU)
            .build_from_cache(model_name)
            .unwrap();
    println!("Loaded model into wasi-nn with ID: {:?}", graph);

    let mut context = graph.init_execution_context().unwrap();
    println!("Created wasi-nn execution context with ID: {:?}", context);

    // Pass the prompt to the backend as a raw UTF-8 byte tensor.
    let tensor_data = prompt.as_bytes().to_vec();
    println!("Read input tensor, size in bytes: {}", tensor_data.len());
    context
        .set_input(0, wasi_nn::TensorType::U8, &[1], &tensor_data)
        .unwrap();

    // Execute the inference.
    context.compute().unwrap();
    println!("Executed model inference");

    // Retrieve the output into a fixed-size, zero-initialized buffer.
    let mut output_buffer = vec![0u8; 1000];
    context.get_output(0, &mut output_buffer).unwrap();
    let output = String::from_utf8(output_buffer.clone()).unwrap();
    println!("Output: {}", output);
}
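One caveat in the listing above: `output_buffer` is a fixed 1,000-byte, zero-initialized buffer, so the decoded `String` carries any trailing NUL padding along (and longer completions are silently truncated). A minimal sketch of stripping that padding using only the standard library; `trim_output` is a hypothetical helper, not part of this commit:

```rust
fn trim_output(buffer: &[u8]) -> String {
    // Cut at the first NUL byte left over from the zero-initialized buffer,
    // then decode, replacing any invalid UTF-8 sequences with U+FFFD.
    let end = buffer.iter().position(|&b| b == 0).unwrap_or(buffer.len());
    String::from_utf8_lossy(&buffer[..end]).into_owned()
}
```

With such a helper, the decoding line in `main` could read `let output = trim_output(&output_buffer);` before printing.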
