New Sampler #3068

Open
wants to merge 14 commits into base: master

Changes from 5 commits
6 changes: 6 additions & 0 deletions .gitignore
@@ -360,3 +360,9 @@ pymnn_build/

# mnncompress generated
MNN_compression_pb2.py

# model path
model/

# datasets
datasets/
6 changes: 4 additions & 2 deletions CMakeLists.txt
@@ -20,7 +20,9 @@ endif()
project(MNN VERSION ${MNN_VERSION} LANGUAGES C CXX ASM)
# compiler options
set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_STANDARD 11)
IF (NOT (CMAKE_CXX_STANDARD EQUAL 17))
set(CMAKE_CXX_STANDARD 11)
ENDIF()
set(CMAKE_MODULE_PATH
${CMAKE_MODULE_PATH}
"${CMAKE_CURRENT_LIST_DIR}/cmake"
@@ -284,7 +286,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "^Android")
endif()
option(MNN_USE_CPP11 "Enable MNN use c++11" ON)
if (NOT MSVC)
if(MNN_CUDA AND MNN_SUPPORT_TRANSFORMER_FUSE)
if((MNN_CUDA AND MNN_SUPPORT_TRANSFORMER_FUSE) OR (CMAKE_CXX_STANDARD EQUAL 17))
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=gnu99")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
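With this change, a `CMAKE_CXX_STANDARD` value of 17 supplied at configure time is no longer overridden to 11, and the C++17 flags are applied even without `MNN_CUDA` and `MNN_SUPPORT_TRANSFORMER_FUSE`. A minimal sketch of a build that opts in this way (the extra option shown is illustrative, not required by this change):

```bash
# Hypothetical build invocation: CMAKE_CXX_STANDARD=17 passed on the command
# line now survives the top-level check and enables the -std=c++17 flags.
mkdir -p build && cd build
cmake .. -DCMAKE_CXX_STANDARD=17 -DMNN_BUILD_LLM=ON
make -j4
```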
59 changes: 59 additions & 0 deletions TODO.md
@@ -0,0 +1,59 @@
## Change Log
- [x] implement an independent `Sampler` Module.
Collaborator:

Please move this file into transformer/llm.

Author:

Deleted.

- [x] implement 8 individual basic samplers: `greedy`, `temperature`, `topK`, `topP`, `minP`, `tfs`, `typical`, `penalty` (configurable through config.json).
- [x] implement the `mixed` sampler, whose default sampling order is Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temperature; the samplers applied can be changed through the `mixed_samplers` field in config.json.
- [x] implement `PromptLib` to enable `chat_demo` for all LLMs.
- [x] move the `seq_len` control from `Llm` to `Sampler` and higher-level modules, migrating the design complexity upward.
- [x] implement `Chat` to organize the workflow of a chatbot APP.
- [x] change `#define FP16_QSCALE 0.25` in `CPUAttention` to ensure Llama3.2 FP16 correctness.
- [x] `chat_demo` tested on Ubuntu 22.04 and Android (including ARM64, ARM82, and OpenCL backends).
- [ ] `chat_demo` supports visual model tasks (support the Qwen2-VL demo).
- [ ] `transformers/llm/engine/android` provides a text-only chatbot app based on `Qwen2.5-1.5B-Instruct`.
- [ ] `transformers/llm/engine/android` provides a text+image chatbot app based on `Qwen2-VL-2B-Instruct`.

Motivation:
1. Sampler: performance, variety, different user preferences.
2. System Prompt: support history context and memory; few-shot generation (examples); role-play character profiles.

## TODO Lists

### 0. Overall TODO Lists
- [ ] test ShareGPT, VQA, Audio...
- [ ] merge KV cache implementation
- [ ] verify the possibility of heterogeneous computing (CPU + OpenCL/...)
- [ ] KV cache + sampler

### 1. Engineering TODO Lists
- [x] llm-export convert Qwen2.5-1.5B-Instruct, Qwen2.5-3B-Instruct, Qwen2.5-7B-Instruct (Qwen2.5 language series) https://qwenlm.github.io/zh/blog/qwen2.5/ (<7B: 32K/8K, >=7B: 128K/8K)
- [x] llm-export convert Llama-3.2-1B-Instruct, Llama-3.2-3B-Instruct (Llama-3.2 language series) https://ai.meta.com/blog/llama-3-2-connect-2024-vision-edge-mobile-devices/ (128K)
- [ ] (optional) llm-export convert Llama-3.2-11B-Vision-Instruct (Llama-3.2 Vision series)
- [x] (optional) llm-export convert Qwen2-VL-2B-Instruct, Qwen2-VL-7B-Instruct (Qwen2-VL series)
- [ ] (optional) llm-export convert Qwen2-Audio-7B-Instruct (Qwen2 Audio series)
- [x] implement `Chat` application in transformers/llm/app/chat.
- [x] implement `LocalSampler` module.
- [x] implement `PromptLib` module.
- [x] implement `Llm` module. (before StateCacheManager's existence)
- [x] build MNN on PC.
- [x] build MNN and run on Android (fp32 + fp16).
- [ ] build and deploy a Llama-3.2-3B-Instruct chatbot on an Android phone and show the demo.
- [ ] add RAG implementation.

```bash
python llmexport.py --path ../../../model/Qwen2.5-1.5B-Instruct/ --dst_path ../../../model/qwen2_5-1_5b-instruct-mnn/ --export mnn --quant_bit 4 --quant_block 128

python llmexport.py --path ../../../model/Qwen2-VL-2B-Instruct/ --dst_path ../../../model/qwen2-vl-2b-instruct-mnn/ --export mnn --quant_bit 4 --quant_block 128

python llmexport.py --path ../../../model/Llama-3.2-3B-Instruct/ --dst_path ../../../model/llama3_2-3b-instruct-mnn --export mnn --quant_bit 4 --quant_block 128

adb push ../../model/qwen2_5-1_5b-instruct-mnn/ /data/local/tmp/llm
adb push ../../model/qwen2-vl-2b-instruct-mnn/ /data/local/tmp/llm
adb push ../../model/llama3_2-3b-instruct-mnn/ /data/local/tmp/llm

cd build/phone
adb push chat_demo libllm.so libMNN_CL.so libMNN_Express.so libMNN.so tools/cv/libMNNOpenCV.so /data/local/tmp/llm
```


### 2. Experiments TODO Lists
- [ ] test `Chat` on ShareGPT datasets, measuring time and space
- [ ] test VQA
File renamed without changes.
50 changes: 50 additions & 0 deletions backupcode/llm/perplexity.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#ifndef PERPLEXITY_hpp
#define PERPLEXITY_hpp

#include <vector>
#include <memory>
#include <string>
#include <fstream>
#include <sstream>
#include <iostream>
#include <streambuf>
#include <functional>
#include <unordered_map>
#include <utility>

#include <MNN/expr/Expr.hpp>
#include <MNN/expr/Module.hpp>
#include <MNN/expr/MathOp.hpp>
#include <MNN/expr/NeuralNetWorkOp.hpp>
#include <MNN/StateCacheManager.hpp>

namespace MNN {
namespace Transformer {
class Llm;

class MNN_PUBLIC PPLMeasurer {
protected:
Llm* mLlm;
StateCacheManager* mStateCacheManager;
std::vector<std::vector<int>> mPrompts;
std::shared_ptr<StateCacheReference> mCandidate;
int mStride, mMaxLen;
void init(Llm* llm, StateCacheManager* manager, std::vector<std::vector<int>> prompts, int max_len, int stride);
public:
PPLMeasurer(Llm* llm, StateCacheManager* manager, std::vector<std::vector<int>> prompts, int max_len=2048, int stride=0);
PPLMeasurer(Llm* llm, StateCacheManager* manager, std::vector<std::string> prompts, int max_len=2048, int stride=0);
float perplexity_one(const std::vector<int>& prompt);
std::vector<float> perplexity();
// prepare for another round of sampling
    // in the future, reset only this measurer's own state.
void reset();
void reset(int max_len, int stride);
};



} // Transformer
} // MNN


#endif // PERPLEXITY_hpp
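This header only declares the interface; a minimal usage sketch, assuming an `Llm` and a `StateCacheManager` have already been constructed elsewhere (their creation is not part of this diff), might look like:

```cpp
// Minimal sketch (not part of this PR): driving PPLMeasurer once an Llm and a
// StateCacheManager exist. Their construction is assumed, not shown here.
#include <iostream>
#include <string>
#include <vector>
#include "perplexity.hpp"   // include path is illustrative

using namespace MNN;
using namespace MNN::Transformer;

void report_perplexity(Llm* llm, StateCacheManager* manager,
                       const std::vector<std::string>& prompts) {
    // max_len and stride values here are placeholders, not recommendations.
    PPLMeasurer measurer(llm, manager, prompts, /*max_len=*/2048, /*stride=*/512);
    std::vector<float> ppl = measurer.perplexity();  // one value per prompt
    for (size_t i = 0; i < ppl.size(); ++i) {
        std::cout << "prompt " << i << ": perplexity = " << ppl[i] << std::endl;
    }
    measurer.reset();  // prepare for another round of evaluation
}
```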
35 changes: 28 additions & 7 deletions docs/transformers/llm.md
@@ -157,7 +157,7 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
- visual_model: when using a VL model, the actual path of visual_model is `base_dir + visual_model`; defaults to `base_dir + 'visual.mnn'`
- Inference configuration
  - max_new_tokens: maximum number of tokens to generate, defaults to `512`
  - reuse_kv: whether to reuse the `kv cache` of previous turns in multi-turn dialogue, defaults to `false`
  - reuse_kv: whether to reuse the `kv cache` of previous turns in multi-turn dialogue, defaults to `false`; currently only the CPU backend supports setting it to `true`.
  - quant_qkv: whether `query, key, value` in the CPU attention operator are quantized; options are `0, 1, 2, 3, 4`, defaults to `0`, with the following meanings:
    - 0: neither key nor value is quantized
    - 1: key is stored with asymmetric 8-bit quantization
@@ -173,6 +173,19 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
- thread_num: number of hardware threads used for CPU inference, defaults to `4`; use `68` for OpenCL inference
- precision: precision policy for inference, defaults to `"low"`, preferring `fp16`
- memory: memory policy for inference, defaults to `"low"`, enabling runtime quantization
- Sampler configuration
  - sampler_type: which sampler to use; currently the 8 basic samplers `greedy`, `temperature`, `topK`, `topP`, `minP`, `tfs`, `typical`, `penalty` are supported, plus `mixed` (a combined sampler). When `mixed` is selected, the samplers listed in mixed_samplers are executed in order. Defaults to `mixed`.
  - mixed_samplers: effective when `sampler_type` is `mixed`, defaults to `["topK", "tfs", "typical", "topP", "min_p", "temperature"]`
  - temperature: the temperature value used by `temperature`, `topP`, `minP`, `tfsZ`, `typical`, defaults to 1.0
  - topK: the number of top-K candidates kept by `topK`, defaults to 40
  - topP: the top-P value in `topP`, defaults to 0.9
  - minP: the min-P value in `minP`, defaults to 0.1
  - tfsZ: the Z value in `tfs`, defaults to 1.0, i.e. tfs is disabled
  - typical: the p value in `typical`, defaults to 1.0, i.e. typical sampling is disabled
  - penalty: the penalty applied to logits in `penalty`, defaults to 0.0, i.e. no penalty
  - n_gram: the maximum n-gram size tracked by `penalty`, defaults to 8
  - ngram_factor: the extra penalty for repeated n-grams in `penalty`, defaults to 1.0, i.e. no extra penalty
  - penalty_sampler: the sampling strategy applied in the final step of `penalty`, either "greedy" or "temperature", defaults to greedy.

##### Example configuration files
- `config.json`
@@ -184,7 +197,15 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
"backend_type": "cpu",
"thread_num": 4,
"precision": "low",
"memory": "low"
"memory": "low",
"sampler_type": "mixed",
"mixed_samplers": ["topK", "tfs", "typical", "topP", "min_p", "temperature"],
"temperature": 1.0,
"topK": 40,
"topP": 0.9,
"tfsZ": 1.0,
"minP": 0.1,
"reuse_kv": true
}
```
- `llm_config.json`
@@ -207,19 +228,19 @@ sh package_scripts/ios/buildiOS.sh "-DMNN_ARM82=true -DMNN_LOW_MEMORY=true -DMNN
```
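Besides the `mixed` setup shown above, `sampler_type` can also select one of the basic samplers directly; a small illustrative `config.json` fragment (values are placeholders, not recommendations):

```json
{
    "backend_type": "cpu",
    "max_new_tokens": 512,
    "sampler_type": "temperature",
    "temperature": 0.8
}
```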

#### Inference usage
`llm_demo` is used as follows:
`chat_demo` is used as follows:
```
# Use config.json
## Interactive chat
./llm_demo model_dir/config.json
./chat_demo model_dir/config.json
## Reply to each line in the prompt file
./llm_demo model_dir/config.json prompt.txt
./chat_demo model_dir/config.json prompt.txt

# Without config.json, use the default configuration
## Interactive chat
./llm_demo model_dir/llm.mnn
./chat_demo model_dir/llm.mnn
## Reply to each line in the prompt file
./llm_demo model_dir/llm.mnn prompt.txt
./chat_demo model_dir/llm.mnn prompt.txt
```

#### Loading GPTQ weights