m5stack
diff --git a/projects/llm_framework/main_cosy_voice/src/main.cpp b/projects/llm_framework/main_cosy_voice/src/main.cpp
Lines changed: 32 additions & 0 deletions
diff --git a/projects/llm_framework/main_cosy_voice/src/runner/LLM.hpp b/projects/llm_framework/main_cosy_voice/src/runner/LLM.hpp
Lines changed: 1 addition & 9 deletions
diff --git a/projects/llm_framework/main_llm/src/main.cpp b/projects/llm_framework/main_llm/src/main.cpp
Lines changed: 34 additions & 2 deletions
diff --git a/projects/llm_framework/main_llm/src/runner/LLM.hpp b/projects/llm_framework/main_llm/src/runner/LLM.hpp
Lines changed: 7 additions & 14 deletions
@@ -92,6 +92,7 @@ class llm_task {
     std::string kvcache_path;
     int precompute_len = 0;
     std::vector<int> _token_ids;
+    static int ax_init_flage_;
     task_callback_t out_callback_;
     bool enoutput_;
     bool enstream_;
@@ -630,9 +631,38 @@ class llm_task {
         return port;
     }
 
+    void _ax_init()  // Calls AX_SYS_Init and AX_ENGINE_Init when the shared counter is zero, then increments the counter.
+    {
+        if (!ax_init_flage_) {  // counter == 0: no earlier call has run the init sequence
+            int ret = AX_SYS_Init();
+            if (0 != ret) {
+                fprintf(stderr, "AX_SYS_Init failed! ret = 0x%x\n", ret);  // failure is only logged; execution continues
+            }
+            AX_ENGINE_NPU_ATTR_T npu_attr;
+            memset(&npu_attr, 0, sizeof(npu_attr));  // AX_ENGINE_Init receives an all-zero attribute struct
+            ret = AX_ENGINE_Init(&npu_attr);
+            if (0 != ret) {
+                fprintf(stderr, "Init ax-engine failed{0x%8x}.\n", ret);  // failure is only logged; nothing is returned to the caller
+            }
+        }
+        ax_init_flage_++;  // counted on every call, including when an init call above reported an error; NOTE(review): plain static int, no locking visible here
+    }
+
+    void _ax_deinit()  // Decrements the shared counter and, when it reaches zero, calls AX_ENGINE_Deinit then AX_SYS_Deinit.
+    {
+        if (ax_init_flage_ > 0) {  // does nothing when the counter is already zero
+            --ax_init_flage_;
+            if (!ax_init_flage_) {  // this call released the last outstanding count
+                AX_ENGINE_Deinit();  // engine first, system second: the reverse of the order used in _ax_init()
+                AX_SYS_Deinit();
+            }
+        }
+    }
+
     llm_task(const std::string &workid) : tokenizer_server_flage_(false), port_(getNextPort())  // Takes port_ from getNextPort(), runs _ax_init(), then starts the worker thread.
     {
+        _ax_init();  // run the AX init sequence first so the thread started below cannot begin run() ahead of it
         inference_run_ = std::make_unique<std::thread>(std::bind(&llm_task::run, this));  // std::thread begins executing llm_task::run immediately
     }
 
     void start()
@@ -666,10 +696,12 @@ class llm_task {
         if (lToken2Wav_) {
             lToken2Wav_->Deinit();
         }
+        _ax_deinit();
     }
 };
 
 std::atomic<unsigned int> llm_task::next_port_{8070};  // out-of-class definition of the static port counter, starting at 8070
+int llm_task::ax_init_flage_ = 0;  // out-of-class definition of the counter used by _ax_init()/_ax_deinit(); starts at zero
 
 #undef CONFIG_AUTO_SET
 
 
@@ -18,6 +18,7 @@
 #include "cqdm.h"
 #include "timer.hpp"
 #include "ax_sys_api.h"
+#include "ax_engine_api.h"
 #include "utils/sampling.hpp"
 #include "utils/utils.hpp"
 
@@ -463,20 +464,11 @@ class LLM {
 
                 layer.layer.inference(_attr.prefill_grpid);
 
-                auto &input_decoder_k_cache = layer.layer.get_input(decode_grpid, "K_cache");
-                auto &input_decoder_v_cache = layer.layer.get_input(decode_grpid, "V_cache");
-
                 auto &output_k_cache = layer.layer.get_output(_attr.prefill_grpid, "K_cache_out");
                 auto &output_v_cache = layer.layer.get_output(_attr.prefill_grpid, "V_cache_out");
 
                 int kv_offset = (_attr.precompute_len + p * _attr.prefill_token_num) * _attr.kv_cache_size;
 
-                memcpy((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
-                       sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
-
-                memcpy((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr,
-                       sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
-
                 for (int gid = _attr.prefill_grpid + 1; gid < prefill_split_num + 1; gid++) {
                     auto &input_prefill_k_cache = layer.layer.get_input(gid, "K_cache");
                     memcpy((unsigned short *)input_prefill_k_cache.pVirAddr + kv_offset,
 
@@ -67,6 +67,7 @@ class llm_task {
     std::string kvcache_path;
     int precompute_len = 0;
     std::vector<int> _token_ids;
+    static int ax_init_flage_;
     task_callback_t out_callback_;
     bool enoutput_;
     bool enstream_;
@@ -208,7 +209,7 @@ class llm_task {
                     SLOGI("port_=%s model_id=%s content=%s", std::to_string(port_).c_str(),
                           (base_model + std::string("tokenizer")).c_str(), prompt_.c_str());
 
-                    std::this_thread::sleep_for(std::chrono::seconds(15));
+                    std::this_thread::sleep_for(std::chrono::seconds(5));
                 };
 
                 auto process_field = [&](std::string &field, const char *name_for_log) -> bool {
@@ -432,9 +433,38 @@ class llm_task {
         return port;
     }
 
+    void _ax_init()  // Calls AX_SYS_Init and AX_ENGINE_Init when the shared counter is zero, then increments the counter.
+    {
+        if (!ax_init_flage_) {  // counter == 0: no earlier call has run the init sequence
+            int ret = AX_SYS_Init();
+            if (0 != ret) {
+                fprintf(stderr, "AX_SYS_Init failed! ret = 0x%x\n", ret);  // failure is only logged; execution continues
+            }
+            AX_ENGINE_NPU_ATTR_T npu_attr;
+            memset(&npu_attr, 0, sizeof(npu_attr));  // AX_ENGINE_Init receives an all-zero attribute struct
+            ret = AX_ENGINE_Init(&npu_attr);
+            if (0 != ret) {
+                fprintf(stderr, "Init ax-engine failed{0x%8x}.\n", ret);  // failure is only logged; nothing is returned to the caller
+            }
+        }
+        ax_init_flage_++;  // counted on every call, including when an init call above reported an error; NOTE(review): plain static int, no locking visible here
+    }
+
+    void _ax_deinit()  // Decrements the shared counter and, when it reaches zero, calls AX_ENGINE_Deinit then AX_SYS_Deinit.
+    {
+        if (ax_init_flage_ > 0) {  // does nothing when the counter is already zero
+            --ax_init_flage_;
+            if (!ax_init_flage_) {  // this call released the last outstanding count
+                AX_ENGINE_Deinit();  // engine first, system second: the reverse of the order used in _ax_init()
+                AX_SYS_Deinit();
+            }
+        }
+    }
+
     llm_task(const std::string &workid) : tokenizer_server_flage_(false), port_(getNextPort())  // Takes port_ from getNextPort(), runs _ax_init(), then starts the worker thread.
     {
+        _ax_init();  // run the AX init sequence first so the thread started below cannot begin run() ahead of it
         inference_run_ = std::make_unique<std::thread>(std::bind(&llm_task::run, this));  // std::thread begins executing llm_task::run immediately
     }
 
     void start()
@@ -469,10 +499,12 @@ class llm_task {
         if (lLaMa_ctx_) {
             lLaMa_ctx_->Deinit();
         }
+        _ax_deinit();
     }
 };
 
 std::atomic<unsigned int> llm_task::next_port_{8080};  // out-of-class definition of the static port counter, starting at 8080
+int llm_task::ax_init_flage_ = 0;  // out-of-class definition of the counter used by _ax_init()/_ax_deinit(); starts at zero
 
 #undef CONFIG_AUTO_SET
 
@@ -527,7 +559,7 @@ class llm_llm : public StackFlow {
 
     void pause(const std::string &work_id, const std::string &object, const std::string &data) override
     {
-        SLOGI("llm_asr::work:%s", data.c_str());
+        SLOGI("llm_llm::work:%s", data.c_str());
 
         nlohmann::json error_body;
         int work_id_num = sample_get_work_id_num(work_id);
 
@@ -7,12 +7,14 @@
 #include "Tokenizer/Tokenizer.hpp"
 #include "LLMEmbedSelector.hpp"
 #include "ax_model_runner/ax_model_runner_ax650.hpp"
+#include "ax_model_runner/legacy/ax_model_runner_ax650.hpp"
 #include "ax_cmm_utils.hpp"
 #include "cqdm.h"
 #include "timer.hpp"
 #include "LLMPostprocess.hpp"
 
-#include <ax_sys_api.h>
+#include "ax_sys_api.h"
+#include "ax_engine_api.h"
 
 #include <arm_neon.h>
 #define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
@@ -78,14 +80,14 @@ class LLM {
     LLMAttrType _attr;
 
     struct LLMLayer {  // Element type of llama_layers: one model runner plus a file name and two buffer members.
-        ax_runner_ax650 layer;
+        ax::legacy::ax_runner_ax650 layer;  // runner type is now taken from the ax::legacy namespace
         std::string filename;  // NOTE(review): presumably the path of this layer's model file; confirm against Init
         MMap layer_buffer;  // NOTE(review): looks like a memory-mapped view of the layer file; confirm
         std::vector<char> layer_buffer_vec;  // NOTE(review): presumably an in-memory copy used instead of the mapping; confirm
     };
 
     std::vector<LLMLayer> llama_layers;
-    ax_runner_ax650 llama_post;
+    ax::legacy::ax_runner_ax650 llama_post;
 
     int prefill_grpid = 1;
     int decode_grpid  = 0;
@@ -243,9 +245,9 @@ class LLM {
     void Deinit()  // Releases each llama_layers[i].layer, then llama_post, then calls embed_selector.Deinit().
     {
         for (int i = 0; i < _attr.axmodel_num; i++) {  // NOTE(review): bound is _attr.axmodel_num, not llama_layers.size(); confirm the vector always holds that many entries
-            llama_layers[i].layer.deinit();
+            llama_layers[i].layer.release();  // the runner's teardown call is now release() instead of deinit()
         }
-        llama_post.deinit();
+        llama_post.release();  // same rename applied to the llama_post runner
         embed_selector.Deinit();
     }
 
@@ -1245,9 +1247,6 @@ class LLM_CTX {
 
                 layer.layer.inference(_attr.prefill_grpid);
 
-                auto &input_decoder_k_cache = layer.layer.get_input(decode_grpid, "K_cache");
-                auto &input_decoder_v_cache = layer.layer.get_input(decode_grpid, "V_cache");
-
                 auto &input_prefill_k_cache = layer.layer.get_input(_attr.prefill_grpid, "K_cache");
                 auto &input_prefill_v_cache = layer.layer.get_input(_attr.prefill_grpid, "V_cache");
 
@@ -1256,12 +1255,6 @@ class LLM_CTX {
 
                 int kv_offset = (_attr.precompute_len + p * _attr.prefill_token_num) * _attr.kv_cache_size;
 
-                memcpy((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
-                       sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
-
-                memcpy((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr,
-                       sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
-
                 memcpy((unsigned short *)input_prefill_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
                        sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);