Skip to content

Commit cde5921

Browse files
author
LittleMouse
committed
[update] add legacy llm backend
1 parent cc9d1bc commit cde5921

File tree

12 files changed

+1341
-59
lines changed

12 files changed

+1341
-59
lines changed

projects/llm_framework/main_cosy_voice/src/main.cpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ class llm_task {
9292
std::string kvcache_path;
9393
int precompute_len = 0;
9494
std::vector<int> _token_ids;
95+
static int ax_init_flage_;
9596
task_callback_t out_callback_;
9697
bool enoutput_;
9798
bool enstream_;
@@ -630,9 +631,38 @@ class llm_task {
630631
return port;
631632
}
632633

634+
void _ax_init()
635+
{
636+
if (!ax_init_flage_) {
637+
int ret = AX_SYS_Init();
638+
if (0 != ret) {
639+
fprintf(stderr, "AX_SYS_Init failed! ret = 0x%x\n", ret);
640+
}
641+
AX_ENGINE_NPU_ATTR_T npu_attr;
642+
memset(&npu_attr, 0, sizeof(npu_attr));
643+
ret = AX_ENGINE_Init(&npu_attr);
644+
if (0 != ret) {
645+
fprintf(stderr, "Init ax-engine failed{0x%8x}.\n", ret);
646+
}
647+
}
648+
ax_init_flage_++;
649+
}
650+
651+
void _ax_deinit()
652+
{
653+
if (ax_init_flage_ > 0) {
654+
--ax_init_flage_;
655+
if (!ax_init_flage_) {
656+
AX_ENGINE_Deinit();
657+
AX_SYS_Deinit();
658+
}
659+
}
660+
}
661+
633662
llm_task(const std::string &workid) : tokenizer_server_flage_(false), port_(getNextPort())
634663
{
635664
inference_run_ = std::make_unique<std::thread>(std::bind(&llm_task::run, this));
665+
_ax_init();
636666
}
637667

638668
void start()
@@ -666,10 +696,12 @@ class llm_task {
666696
if (lToken2Wav_) {
667697
lToken2Wav_->Deinit();
668698
}
699+
_ax_deinit();
669700
}
670701
};
671702

672703
std::atomic<unsigned int> llm_task::next_port_{8070};
704+
int llm_task::ax_init_flage_ = 0;
673705

674706
#undef CONFIG_AUTO_SET
675707

projects/llm_framework/main_cosy_voice/src/runner/LLM.hpp

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "cqdm.h"
1919
#include "timer.hpp"
2020
#include "ax_sys_api.h"
21+
#include "ax_engine_api.h"
2122
#include "utils/sampling.hpp"
2223
#include "utils/utils.hpp"
2324

@@ -463,20 +464,11 @@ class LLM {
463464

464465
layer.layer.inference(_attr.prefill_grpid);
465466

466-
auto &input_decoder_k_cache = layer.layer.get_input(decode_grpid, "K_cache");
467-
auto &input_decoder_v_cache = layer.layer.get_input(decode_grpid, "V_cache");
468-
469467
auto &output_k_cache = layer.layer.get_output(_attr.prefill_grpid, "K_cache_out");
470468
auto &output_v_cache = layer.layer.get_output(_attr.prefill_grpid, "V_cache_out");
471469

472470
int kv_offset = (_attr.precompute_len + p * _attr.prefill_token_num) * _attr.kv_cache_size;
473471

474-
memcpy((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
475-
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
476-
477-
memcpy((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr,
478-
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
479-
480472
for (int gid = _attr.prefill_grpid + 1; gid < prefill_split_num + 1; gid++) {
481473
auto &input_prefill_k_cache = layer.layer.get_input(gid, "K_cache");
482474
memcpy((unsigned short *)input_prefill_k_cache.pVirAddr + kv_offset,

projects/llm_framework/main_llm/src/main.cpp

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ class llm_task {
6767
std::string kvcache_path;
6868
int precompute_len = 0;
6969
std::vector<int> _token_ids;
70+
static int ax_init_flage_;
7071
task_callback_t out_callback_;
7172
bool enoutput_;
7273
bool enstream_;
@@ -208,7 +209,7 @@ class llm_task {
208209
SLOGI("port_=%s model_id=%s content=%s", std::to_string(port_).c_str(),
209210
(base_model + std::string("tokenizer")).c_str(), prompt_.c_str());
210211

211-
std::this_thread::sleep_for(std::chrono::seconds(15));
212+
std::this_thread::sleep_for(std::chrono::seconds(5));
212213
};
213214

214215
auto process_field = [&](std::string &field, const char *name_for_log) -> bool {
@@ -432,9 +433,38 @@ class llm_task {
432433
return port;
433434
}
434435

436+
void _ax_init()
437+
{
438+
if (!ax_init_flage_) {
439+
int ret = AX_SYS_Init();
440+
if (0 != ret) {
441+
fprintf(stderr, "AX_SYS_Init failed! ret = 0x%x\n", ret);
442+
}
443+
AX_ENGINE_NPU_ATTR_T npu_attr;
444+
memset(&npu_attr, 0, sizeof(npu_attr));
445+
ret = AX_ENGINE_Init(&npu_attr);
446+
if (0 != ret) {
447+
fprintf(stderr, "Init ax-engine failed{0x%8x}.\n", ret);
448+
}
449+
}
450+
ax_init_flage_++;
451+
}
452+
453+
void _ax_deinit()
454+
{
455+
if (ax_init_flage_ > 0) {
456+
--ax_init_flage_;
457+
if (!ax_init_flage_) {
458+
AX_ENGINE_Deinit();
459+
AX_SYS_Deinit();
460+
}
461+
}
462+
}
463+
435464
llm_task(const std::string &workid) : tokenizer_server_flage_(false), port_(getNextPort())
436465
{
437466
inference_run_ = std::make_unique<std::thread>(std::bind(&llm_task::run, this));
467+
_ax_init();
438468
}
439469

440470
void start()
@@ -469,10 +499,12 @@ class llm_task {
469499
if (lLaMa_ctx_) {
470500
lLaMa_ctx_->Deinit();
471501
}
502+
_ax_deinit();
472503
}
473504
};
474505

475506
std::atomic<unsigned int> llm_task::next_port_{8080};
507+
int llm_task::ax_init_flage_ = 0;
476508

477509
#undef CONFIG_AUTO_SET
478510

@@ -527,7 +559,7 @@ class llm_llm : public StackFlow {
527559

528560
void pause(const std::string &work_id, const std::string &object, const std::string &data) override
529561
{
530-
SLOGI("llm_asr::work:%s", data.c_str());
562+
SLOGI("llm_llm::work:%s", data.c_str());
531563

532564
nlohmann::json error_body;
533565
int work_id_num = sample_get_work_id_num(work_id);

projects/llm_framework/main_llm/src/runner/LLM.hpp

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,14 @@
77
#include "Tokenizer/Tokenizer.hpp"
88
#include "LLMEmbedSelector.hpp"
99
#include "ax_model_runner/ax_model_runner_ax650.hpp"
10+
#include "ax_model_runner/legacy/ax_model_runner_ax650.hpp"
1011
#include "ax_cmm_utils.hpp"
1112
#include "cqdm.h"
1213
#include "timer.hpp"
1314
#include "LLMPostprocess.hpp"
1415

15-
#include <ax_sys_api.h>
16+
#include "ax_sys_api.h"
17+
#include "ax_engine_api.h"
1618

1719
#include <arm_neon.h>
1820
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
@@ -78,14 +80,14 @@ class LLM {
7880
LLMAttrType _attr;
7981

8082
struct LLMLayer {
81-
ax_runner_ax650 layer;
83+
ax::legacy::ax_runner_ax650 layer;
8284
std::string filename;
8385
MMap layer_buffer;
8486
std::vector<char> layer_buffer_vec;
8587
};
8688

8789
std::vector<LLMLayer> llama_layers;
88-
ax_runner_ax650 llama_post;
90+
ax::legacy::ax_runner_ax650 llama_post;
8991

9092
int prefill_grpid = 1;
9193
int decode_grpid = 0;
@@ -243,9 +245,9 @@ class LLM {
243245
void Deinit()
244246
{
245247
for (int i = 0; i < _attr.axmodel_num; i++) {
246-
llama_layers[i].layer.deinit();
248+
llama_layers[i].layer.release();
247249
}
248-
llama_post.deinit();
250+
llama_post.release();
249251
embed_selector.Deinit();
250252
}
251253

@@ -1245,9 +1247,6 @@ class LLM_CTX {
12451247

12461248
layer.layer.inference(_attr.prefill_grpid);
12471249

1248-
auto &input_decoder_k_cache = layer.layer.get_input(decode_grpid, "K_cache");
1249-
auto &input_decoder_v_cache = layer.layer.get_input(decode_grpid, "V_cache");
1250-
12511250
auto &input_prefill_k_cache = layer.layer.get_input(_attr.prefill_grpid, "K_cache");
12521251
auto &input_prefill_v_cache = layer.layer.get_input(_attr.prefill_grpid, "V_cache");
12531252

@@ -1256,12 +1255,6 @@ class LLM_CTX {
12561255

12571256
int kv_offset = (_attr.precompute_len + p * _attr.prefill_token_num) * _attr.kv_cache_size;
12581257

1259-
memcpy((unsigned short *)input_decoder_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
1260-
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
1261-
1262-
memcpy((unsigned short *)input_decoder_v_cache.pVirAddr + kv_offset, (void *)output_v_cache.pVirAddr,
1263-
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
1264-
12651258
memcpy((unsigned short *)input_prefill_k_cache.pVirAddr + kv_offset, (void *)output_k_cache.pVirAddr,
12661259
sizeof(unsigned short) * input_num_token * _attr.kv_cache_size);
12671260

0 commit comments

Comments
 (0)