From b7d38eef0c04bacfba6dd0608350073845bfdf72 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 3 Dec 2024 23:37:03 +0100 Subject: [PATCH 01/19] server : (refactoring) reduce usage of json internally --- examples/server/server.cpp | 274 ++++++++----------------------------- examples/server/server.hpp | 191 ++++++++++++++++++++++++++ examples/server/utils.hpp | 24 +--- 3 files changed, 246 insertions(+), 243 deletions(-) create mode 100644 examples/server/server.hpp diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9bca3f30e7574..1482ecbee29df 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,4 +1,5 @@ #include "utils.hpp" +#include "server.hpp" #include "arg.h" #include "common.h" @@ -32,90 +33,6 @@ using json = nlohmann::ordered_json; -enum stop_type { - STOP_TYPE_FULL, - STOP_TYPE_PARTIAL, -}; - -// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 -enum slot_state { - SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future - SLOT_STATE_PROCESSING_PROMPT, - SLOT_STATE_DONE_PROMPT, - SLOT_STATE_GENERATING, -}; - -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded -}; - -enum server_task_type { - SERVER_TASK_TYPE_INFERENCE, - SERVER_TASK_TYPE_CANCEL, - SERVER_TASK_TYPE_NEXT_RESPONSE, - SERVER_TASK_TYPE_METRICS, - SERVER_TASK_TYPE_SLOT_SAVE, - SERVER_TASK_TYPE_SLOT_RESTORE, - SERVER_TASK_TYPE_SLOT_ERASE, - SERVER_TASK_TYPE_SET_LORA, -}; - -enum server_task_inf_type { - SERVER_TASK_INF_TYPE_COMPLETION, - SERVER_TASK_INF_TYPE_EMBEDDING, - SERVER_TASK_INF_TYPE_RERANK, - SERVER_TASK_INF_TYPE_INFILL, -}; - -struct server_task { - int id = -1; // to be filled by server_queue - int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL - - llama_tokens prompt_tokens; - server_task_type type; - json data; - - server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - -struct server_task_result { - int id = -1; - - json data; - - bool stop; - bool error; -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt - - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters - - int64_t t_max_prompt_ms = -1; // TODO: implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit - - std::vector antiprompt; - - struct common_params_sampling sampling; - struct common_params_speculative speculative; -}; - struct server_slot { int id; int id_task = -1; @@ -166,8 +83,6 @@ struct server_slot { bool stopped_word = false; bool stopped_limit = false; - bool timings_per_token = false; - bool oaicompat = false; std::string oaicompat_model; @@ -255,37 +170,39 @@ struct server_slot { } } - json get_formated_timings() const { - return json { - {"prompt_n", 
n_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; + result_timings get_timings() const { + result_timings timings; + timings.prompt_n = n_prompt_tokens_processed; + timings.prompt_ms = t_prompt_processing; + timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed; + timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + + timings.predicted_n = n_decoded; + timings.predicted_ms = t_token_generation; + timings.predicted_per_token_ms = t_token_generation / n_decoded; + timings.predicted_per_second = 1e3 / t_token_generation * n_decoded; + + return timings; } - size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) { + size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) { size_t stop_pos = std::string::npos; for (const std::string & word : params.antiprompt) { size_t pos; - if (type == STOP_TYPE_FULL) { + if (is_full_stop) { const size_t tmp = word.size() + last_token_size; const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; pos = text.find(word, from_pos); } else { + // otherwise, partial stop pos = find_partial_stop_string(word, text); } if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_TYPE_FULL) { + if (is_full_stop) { stopped_word = true; stopping_word = word; has_next_token = false; @@ -1108,14 +1025,14 @@ struct server_context { const std::string str_test = slot.generated_text.substr(pos); bool send_text = true; - size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); + size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); if (stop_pos != std::string::npos) { slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); pos = std::min(slot.n_sent_text, slot.generated_text.size()); } else if (slot.has_next_token) { - stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); + stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); send_text = stop_pos == std::string::npos; } @@ -1229,60 +1146,6 @@ struct server_context { return slot.has_next_token; // continue } - json get_formated_generation(const server_slot & slot) const { - std::vector samplers; - samplers.reserve(slot.params.sampling.samplers.size()); - for (const auto & sampler : slot.params.sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - return json { - {"n_ctx", slot.n_ctx}, - {"n_predict", slot.n_predict}, // Server configured n_predict - {"model", params_base.model_alias}, - {"seed", slot.params.sampling.seed}, - {"seed_cur", slot.smpl ? 
common_sampler_get_seed(slot.smpl) : 0}, - {"temperature", slot.params.sampling.temp}, - {"dynatemp_range", slot.params.sampling.dynatemp_range}, - {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent}, - {"top_k", slot.params.sampling.top_k}, - {"top_p", slot.params.sampling.top_p}, - {"min_p", slot.params.sampling.min_p}, - {"xtc_probability", slot.params.sampling.xtc_probability}, - {"xtc_threshold", slot.params.sampling.xtc_threshold}, - {"typical_p", slot.params.sampling.typ_p}, - {"repeat_last_n", slot.params.sampling.penalty_last_n}, - {"repeat_penalty", slot.params.sampling.penalty_repeat}, - {"presence_penalty", slot.params.sampling.penalty_present}, - {"frequency_penalty", slot.params.sampling.penalty_freq}, - {"dry_multiplier", slot.params.sampling.dry_multiplier}, - {"dry_base", slot.params.sampling.dry_base}, - {"dry_allowed_length", slot.params.sampling.dry_allowed_length}, - {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers}, - {"mirostat", slot.params.sampling.mirostat}, - {"mirostat_tau", slot.params.sampling.mirostat_tau}, - {"mirostat_eta", slot.params.sampling.mirostat_eta}, - {"penalize_nl", slot.params.sampling.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"max_tokens", slot.params.n_predict}, // User configured n_predict - {"n_keep", slot.params.n_keep}, - {"n_discard", slot.params.n_discard}, - {"ignore_eos", slot.params.sampling.ignore_eos}, - {"stream", slot.params.stream}, - //{"logit_bias", slot.params.sampling.logit_bias}, - {"n_probs", slot.params.sampling.n_probs}, - {"min_keep", slot.params.sampling.min_keep}, - {"grammar", slot.params.sampling.grammar}, - {"samplers", samplers}, - {"speculative", slot.can_speculate()}, - {"speculative.n_max", slot.params.speculative.n_max}, - {"speculative.n_min", slot.params.speculative.n_min}, - {"speculative.p_min", slot.params.speculative.p_min}, - {"timings_per_token", slot.timings_per_token}, - }; - } - void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { send_error(task.id, error, type); } @@ -1294,27 +1157,18 @@ struct server_context { void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); - server_task_result res; + server_task_result_error res; res.id = id_task; - res.stop = false; - res.error = true; - res.data = format_error_response(error, type); + res.err_type = type; + res.err_msg = error; queue_results.send(res); } void send_partial_response(server_slot & slot, completion_token_output tkn) { - server_task_result res; + server_task_result_cmpl_partial res; res.id = slot.id_task; - res.error = false; - res.stop = false; - res.data = json { - {"content", tkn.text_to_send}, - {"stop", false}, - {"id_slot", slot.id}, - {"multimodal", false}, - {"index", slot.index}, - }; + res.content = tkn.text_to_send; if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); @@ -1323,30 +1177,35 @@ struct server_context { std::vector probs_output; if (probs_pos < probs_stop_pos) { - probs_output = std::vector( + res.probs_output = std::vector( slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); } - slot.n_sent_token_probs = probs_stop_pos; - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); - } - - if 
(slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; } - if (slot.timings_per_token) { - res.data["timings"] = slot.get_formated_timings(); + if (slot.params.timings_per_token) { + res.timings = slot.get_timings(); } queue_results.send(res); } void send_final_response(const server_slot & slot) { - server_task_result res; - res.id = slot.id_task; + server_task_result_cmpl_final res; + res.id = slot.id_task; + res.id_slot = slot.id; + res.content = slot.generated_text; + + res.n_decoded = slot.n_decoded; + res.n_prompt_tokens = slot.n_prompt_tokens; + res.has_new_line = slot.has_new_line; + res.n_tokens_cached = slot.n_past; + res.content = slot.generated_text; + + res.params = slot.params; // copy the parameters + + + res.error = false; res.stop = true; res.data = json { @@ -1370,36 +1229,27 @@ struct server_context { }; if (slot.params.sampling.n_probs > 0) { - std::vector probs; if (!slot.params.stream && slot.stopped_word) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - probs = std::vector( + res.probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end() - safe_offset); } else { - probs = std::vector( + res.probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end()); } - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs); - } - - if (slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; } queue_results.send(res); } void send_embedding(const server_slot & slot, const llama_batch & batch) { - server_task_result res; + server_task_result_embd res; res.id = slot.id_task; - res.error = false; - res.stop = true; + res.index = slot.index; const int n_embd = llama_n_embd(model); @@ -1418,20 +1268,12 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.data = json { - {"embedding", std::vector(n_embd, 0.0f)}, - {"index", slot.index}, - }; - + res.embedding = std::vector(n_embd, 0.0f); continue; } common_embd_normalize(embd, embd_res.data(), n_embd); - - res.data = json { - {"embedding", embd_res}, - {"index", slot.index}, - }; + res.embedding = embd_res; } SLT_DBG(slot, "%s", "sending embeddings\n"); @@ -1440,10 +1282,9 @@ struct server_context { } void send_rerank(const server_slot & slot, const llama_batch & batch) { - server_task_result res; + server_task_result_rerank res; res.id = slot.id_task; - res.error = false; - res.stop = true; + res.index = slot.index; for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { @@ -1458,21 +1299,14 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.data = json { - {"index", slot.index}, - {"score", -1e6}, - }; - + res.score = -1e6; continue; } - res.data = json { - {"index", slot.index}, - {"score", embd[0]}, - }; + res.score = embd[0]; } - SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str()); + SLT_DBG(slot, "sending rerank result, res.score = %f\n", res.score); queue_results.send(res); } diff --git a/examples/server/server.hpp b/examples/server/server.hpp new file mode 100644 index 0000000000000..a9287bf6dbaaf --- /dev/null +++ 
b/examples/server/server.hpp
@@ -0,0 +1,191 @@
+#pragma once
+
+#include "common.h"
+#include "llama.h"
+#include "sampling.h"
+#include "speculative.h"
+
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
+#include "json.hpp"
+
+#include
+#include
+#include
+
+using json = nlohmann::ordered_json;
+
+enum stop_type {
+    STOP_TYPE_NONE,
+    STOP_TYPE_EOS,
+    STOP_TYPE_WORD,
+    STOP_TYPE_LIMIT,
+};
+
+// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+enum slot_state {
+    SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
+};
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY, // Server is ready and model is loaded
+};
+
+enum server_task_type {
+    SERVER_TASK_TYPE_INFERENCE,
+    SERVER_TASK_TYPE_CANCEL,
+    SERVER_TASK_TYPE_NEXT_RESPONSE,
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
+};
+
+enum server_task_inf_type {
+    SERVER_TASK_INF_TYPE_COMPLETION,
+    SERVER_TASK_INF_TYPE_EMBEDDING,
+    SERVER_TASK_INF_TYPE_RERANK,
+    SERVER_TASK_INF_TYPE_INFILL,
+};
+
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+    ERROR_TYPE_INVALID_REQUEST,
+    ERROR_TYPE_AUTHENTICATION,
+    ERROR_TYPE_SERVER,
+    ERROR_TYPE_NOT_FOUND,
+    ERROR_TYPE_PERMISSION,
+    ERROR_TYPE_UNAVAILABLE, // custom error
+    ERROR_TYPE_NOT_SUPPORTED, // custom error
+};
+
+struct server_task {
+    int id = -1; // to be filled by server_queue
+    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
+
+    llama_tokens prompt_tokens;
+    server_task_type type;
+
+    // TODO @ngxson : we should get rid of json type here
+    json data;
+
+    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
+
+    // utility function
+    static std::unordered_set get_list_id(const std::vector & tasks) {
+        std::unordered_set ids(tasks.size());
+        for (size_t i = 0; i < tasks.size(); i++) {
+            ids.insert(tasks[i].id);
+        }
+        return ids;
+    }
+};
+
+struct result_timings {
+    int32_t prompt_n;
+    double prompt_ms;
+    double prompt_per_token_ms;
+    double prompt_per_second;
+
+    int32_t predicted_n;
+    double predicted_ms;
+    double predicted_per_token_ms;
+    double predicted_per_second;
+};
+
+enum result_type {
+    RESULT_TYPE_CMPL_FINAL,
+    RESULT_TYPE_CMPL_PARTIAL,
+    RESULT_TYPE_EMBD,
+    RESULT_TYPE_RERANK,
+    RESULT_TYPE_ERROR,
+    RESULT_TYPE_UNKNOWN, // will throw an error
+};
+
+struct server_task_result {
+    result_type type = RESULT_TYPE_UNKNOWN;
+    int id = -1;
+    int id_slot = -1;
+};
+
+struct server_task_result_cmpl_final : server_task_result {
+    result_type type = RESULT_TYPE_CMPL_FINAL;
+    int index = 0;
+    std::string content;
+    bool stream;
+    bool timings_per_token;
+    result_timings timings;
+
+    int32_t n_decoded;
+    int32_t n_prompt_tokens;
+    int32_t has_new_line;
+    int32_t stopping_word;
+    int32_t n_tokens_cached;
+    stop_type stop = STOP_TYPE_NONE;
+    std::vector probs_output;
+
+    slot_params params;
+};
+
+struct completion_token_output {
+    llama_token tok;
+    std::string text_to_send;
+    struct token_prob {
+        llama_token tok;
+        float prob;
+    };
+    std::vector probs;
+};
+
+struct server_task_result_cmpl_partial : server_task_result {
+    result_type type = RESULT_TYPE_CMPL_PARTIAL;
+    int index = 0;
+    
std::string content; + stop_type stop = STOP_TYPE_NONE; + std::vector probs_output; + result_timings timings; +}; + +struct server_task_result_embd : server_task_result { + result_type type = RESULT_TYPE_EMBD; + int index = 0; + std::vector embedding; +}; + +struct server_task_result_rerank : server_task_result { + result_type type = RESULT_TYPE_RERANK; + int index = 0; + float score; +}; + +struct server_task_result_error : server_task_result { + result_type type = RESULT_TYPE_ERROR; + int index = 0; + error_type err_type; + std::string err_msg; +}; + +struct slot_params { + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + + std::vector antiprompt; + bool timings_per_token = false; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; +}; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index e4451532c9d0c..d65773addf231 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -3,6 +3,7 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "server.hpp" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -40,17 +41,6 @@ using json = nlohmann::ordered_json; #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_DBG(fmt, ...) 
LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error -}; - template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -485,18 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } -struct completion_token_output { - llama_token tok; - std::string text_to_send; - - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; -}; - // convert a vector of completion_token_output to json static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { json out = json::array(); From 1011a51b8780a1b53ece91201583ad0c756a7e88 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 14:16:01 +0100 Subject: [PATCH 02/19] move all response types to struct --- examples/server/server.cpp | 381 ++++++++++++++++++------------------- examples/server/server.hpp | 365 +++++++++++++++++++++++++++++++---- examples/server/utils.hpp | 1 + 3 files changed, 511 insertions(+), 236 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1482ecbee29df..de073b085dbe9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -33,6 +33,9 @@ using json = nlohmann::ordered_json; +// using shared_ptr for polymorphism of server_task_result +using task_result_ptr = std::unique_ptr; + struct server_slot { int id; int id_task = -1; @@ -79,9 +82,7 @@ struct server_slot { bool has_next_token = true; bool has_new_line = false; bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; + stop_type stop; bool oaicompat = false; @@ -115,9 +116,7 @@ struct server_slot { generated_text = ""; has_new_line = false; truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; + stop = STOP_TYPE_NONE; stopping_word = ""; n_past = 0; n_sent_text = 0; @@ -203,7 +202,7 @@ struct server_slot { if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { if (is_full_stop) { - stopped_word = true; + stop = STOP_TYPE_WORD; stopping_word = word; has_next_token = false; } @@ -428,8 +427,8 @@ struct server_response { // for keeping track of all tasks waiting for the result std::unordered_set waiting_task_ids; - // the main result queue - std::vector queue_results; + // the main result queue (using ptr for polymorphism) + std::vector queue_results; std::mutex mutex_results; std::condition_variable condition_results; @@ -469,7 +468,7 @@ struct server_response { } // This function blocks the thread until there is a response for one of the id_tasks - server_task_result recv(const std::unordered_set & id_tasks) { + task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ @@ -477,8 +476,8 @@ struct server_response { }); for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i].id) != id_tasks.end()) { - server_task_result res = queue_results[i]; + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + task_result_ptr res = std::move(queue_results[i]); 
queue_results.erase(queue_results.begin() + i); return res; } @@ -489,7 +488,7 @@ struct server_response { } // single-task version of recv() - server_task_result recv(int id_task) { + task_result_ptr recv(int id_task) { std::unordered_set id_tasks = {id_task}; return recv(id_tasks); } @@ -501,9 +500,9 @@ struct server_response { std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { if (result.id == id_task) { - SRV_DBG("task id = %d moved to result queue\n", result.id); + SRV_DBG("task id = %d pushed to result queue\n", result.id); - queue_results.push_back(std::move(result)); + queue_results.push_back(std::make_unique(result)); condition_results.notify_all(); return; } @@ -694,7 +693,7 @@ struct server_context { slots.push_back(slot); } - default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props = slots[0].params.to_json(); default_generation_settings_for_props["seed"] = -1; // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens @@ -797,7 +796,7 @@ struct server_context { slot.oaicompat_model = ""; } - slot.timings_per_token = json_value(data, "timings_per_token", false); + slot.params.timings_per_token = json_value(data, "timings_per_token", false); slot.params.stream = json_value(data, "stream", false); slot.params.cache_prompt = json_value(data, "cache_prompt", true); @@ -1056,7 +1055,7 @@ struct server_context { // check the limits if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); @@ -1065,7 +1064,7 @@ struct server_context { if (slot.has_new_line) { // if we have already seen a new line, we stop after a certain time limit if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); @@ -1085,7 +1084,7 @@ struct server_context { } if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; // cut the last line @@ -1114,7 +1113,7 @@ struct server_context { // if context shift is disabled, we stop when it reaches the context limit if (slot.n_past >= slot.n_ctx) { slot.truncated = true; - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n", @@ -1122,7 +1121,7 @@ struct server_context { } if (llama_token_is_eog(model, result.tok)) { - slot.stopped_eos = true; + slot.stop = STOP_TYPE_EOS; slot.has_next_token = false; SLT_DBG(slot, "%s", "stopped by EOS\n"); @@ -1132,7 +1131,7 @@ struct server_context { if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; // stop prediction SLT_WRN(slot, @@ -1201,35 +1200,12 @@ struct server_context { res.has_new_line = slot.has_new_line; res.n_tokens_cached = slot.n_past; res.content = 
slot.generated_text; + res.stop = slot.stop; - res.params = slot.params; // copy the parameters - - - - res.error = false; - res.stop = true; - res.data = json { - {"content", !slot.params.stream ? slot.generated_text : ""}, - {"id_slot", slot.id}, - {"stop", true}, - {"model", params_base.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.n_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", common_detokenize(ctx, slot.prompt_tokens)}, - {"has_new_line", slot.has_new_line}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}, - {"index", slot.index}, - }; + res.generation_params = slot.params; // copy the parameters if (slot.params.sampling.n_probs > 0) { - if (!slot.params.stream && slot.stopped_word) { + if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); @@ -1399,25 +1375,34 @@ struct server_context { } // receive the results from task(s) created by create_tasks_inference - void receive_cmpl_results( + template + void receive_multi_results( const std::unordered_set & id_tasks, - const std::function&)> & result_handler, + const std::function&)> & result_handler, const std::function & error_handler) { - // TODO: currently, there is no way to detect the client has cancelled the request - std::vector results(id_tasks.size()); + std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { - server_task_result result = queue_results.recv(id_tasks); + task_result_ptr result_raw = queue_results.recv(id_tasks); - if (result.error) { - error_handler(result.data); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + error_handler(format_error_response(result.err_msg, result.err_type)); cancel_tasks(id_tasks); return; } - const size_t idx = result.data["index"]; - GGML_ASSERT(idx < results.size() && "index out of range"); - - results[idx] = result; + if ( + result_raw->type == RESULT_TYPE_CMPL_FINAL + || result_raw->type == RESULT_TYPE_EMBD + || result_raw->type == RESULT_TYPE_RERANK + ) { + auto result = T::from_ptr(result_raw); + const size_t idx = result.index; + GGML_ASSERT(idx < results.size() && "index out of range"); + results[idx] = result; + } else { + GGML_ASSERT(false && "unexpected result type"); + } } result_handler(results); } @@ -1425,23 +1410,27 @@ struct server_context { // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const - std::function & result_handler, const + std::function & result_handler, const std::function & error_handler) { size_t n_finished = 0; while (true) { - server_task_result result = queue_results.recv(id_tasks); - if (!result_handler(result)) { + task_result_ptr result_raw = queue_results.recv(id_tasks); + + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + error_handler(format_error_response(result.err_msg, result.err_type)); cancel_tasks(id_tasks); - break; + return; } - if (result.error) { - error_handler(result.data); + GGML_ASSERT(result_raw->type == 
RESULT_TYPE_CMPL_PARTIAL); + auto result = server_task_result_cmpl_partial::from_ptr(result_raw); + if (!result_handler(result)) { cancel_tasks(id_tasks); break; } - if (result.stop) { + if (result.stop != STOP_TYPE_NONE) { if (++n_finished == id_tasks.size()) { break; } @@ -1508,7 +1497,7 @@ struct server_context { int n_processing_slots = 0; for (server_slot & slot : slots) { - json slot_data = get_formated_generation(slot); + json slot_data = slot.params.to_json(); slot_data["id"] = slot.id; slot_data["id_task"] = slot.id_task; slot_data["is_processing"] = slot.is_processing(); @@ -1518,9 +1507,6 @@ struct server_context { {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, {"stopping_word", slot.stopping_word}, }; @@ -1534,34 +1520,28 @@ struct server_context { } SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); - server_task_result res; - res.id = task.id; - res.stop = true; - res.error = false; - res.data = { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", queue_tasks.queue_tasks_deferred.size() }, - { "t_start", metrics.t_start}, + server_task_result_metrics res; + res.id = task.id; + res.n_idle_slots = n_idle_slots; + res.n_processing_slots = n_processing_slots; + res.n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res.t_start = metrics.t_start; - { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, - { "t_tokens_generation_total", metrics.t_tokens_generation_total}, - { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, - { "t_prompt_processing_total", metrics.t_prompt_processing_total}, + res.kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); + res.kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); - { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, - { "t_prompt_processing", metrics.t_prompt_processing}, - { "n_tokens_predicted", metrics.n_tokens_predicted}, - { "t_tokens_generation", metrics.t_tokens_generation}, + res.n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; + res.t_prompt_processing_total = metrics.t_prompt_processing_total; + res.n_tokens_predicted_total = metrics.n_tokens_predicted_total; + res.t_tokens_generation_total = metrics.t_tokens_generation_total; - { "n_decode_total", metrics.n_decode_total}, - { "n_busy_slots_total", metrics.n_busy_slots_total}, + res.n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; + res.t_prompt_processing = metrics.t_prompt_processing; + res.n_tokens_predicted = metrics.n_tokens_predicted; + res.t_tokens_generation = metrics.t_tokens_generation; - { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, - { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, - - { "slots", slots_data }, - }; + res.n_decode_total = metrics.n_decode_total; + res.n_busy_slots_total = metrics.n_busy_slots_total; if (json_value(task.data, "reset_bucket", false)) { metrics.reset_bucket(); @@ -1594,19 +1574,14 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", token_count }, // tokens saved - { "n_written", nwrite }, // bytes 
written - { "timings", { - { "save_ms", t_save_ms } - } } - }; + server_task_result_slot_save_load result; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = true; + result.n_saved = token_count; + result.n_written = nwrite; + result.t_ms = t_save_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: @@ -1642,19 +1617,14 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", token_count }, // tokens restored - { "n_read", nread }, // bytes read - { "timings", { - { "restore_ms", t_restore_ms } - } } - }; + server_task_result_slot_save_load result; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = false; + result.n_saved = token_count; + result.n_read = nread; + result.t_ms = t_restore_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_ERASE: @@ -1677,24 +1647,17 @@ struct server_context { llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "n_erased", n_erased } - }; + server_task_result_slot_erase result; + result.id = task.id; + result.id_slot = id_slot; + result.n_erased = n_erased; queue_results.send(result); } break; case SERVER_TASK_TYPE_SET_LORA: { common_lora_adapters_apply(ctx, loras); - server_task_result result; + server_task_result_apply_lora result; result.id = task.id; - result.stop = true; - result.error = false; - result.data = json{{ "success", true }}; queue_results.send(result); } break; } @@ -2456,19 +2419,26 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); + task_result_ptr result_raw = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); + if (result_raw->type != RESULT_TYPE_METRICS) { + SRV_ERR("Unexpected result type: %d\n", result_raw->type); + res_error(res, format_error_response("Unexpected result type", ERROR_TYPE_SERVER)); + return; + } + + auto result = server_task_result_metrics::from_ptr(result_raw); + // optionally return "fail_on_no_slot" error - const int n_idle_slots = result.data.at("idle"); if (req.has_param("fail_on_no_slot")) { - if (n_idle_slots == 0) { + if (result.n_idle_slots == 0) { res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); return; } } - res_ok(res, result.data.at("slots")); + res_ok(res, result.slots_data); }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { @@ -2488,73 +2458,68 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); + task_result_ptr result_raw = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; + } - json data = result.data; - - const 
uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed"); - const uint64_t t_prompt_processing = data.at("t_prompt_processing"); - - const uint64_t n_tokens_predicted = data.at("n_tokens_predicted"); - const uint64_t t_tokens_generation = data.at("t_tokens_generation"); - - const uint64_t n_decode_total = data.at("n_decode_total"); - const uint64_t n_busy_slots_total = data.at("n_busy_slots_total"); - - const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells"); + GGML_ASSERT(result_raw->type == RESULT_TYPE_METRICS); + auto result = server_task_result_metrics::from_ptr(result_raw); // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names json all_metrics_def = json { {"counter", {{ {"name", "prompt_tokens_total"}, {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")} + {"value", (uint64_t) result.n_prompt_tokens_processed_total} }, { {"name", "prompt_seconds_total"}, {"help", "Prompt process time"}, - {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3} + {"value", (uint64_t) result.t_prompt_processing_total / 1.e3} }, { {"name", "tokens_predicted_total"}, {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) data.at("n_tokens_predicted_total")} + {"value", (uint64_t) result.n_tokens_predicted_total} }, { {"name", "tokens_predicted_seconds_total"}, {"help", "Predict process time"}, - {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3} + {"value", (uint64_t) result.t_tokens_generation_total / 1.e3} }, { {"name", "n_decode_total"}, {"help", "Total number of llama_decode() calls"}, - {"value", n_decode_total} + {"value", result.n_decode_total} }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) n_busy_slots_total / (float) n_decode_total} + {"value", (float) result.n_busy_slots_total / (float) result.n_decode_total} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, {"help", "Average prompt throughput in tokens/s."}, - {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} + {"value", result.n_prompt_tokens_processed ? 1.e3 / result.t_prompt_processing * result.n_prompt_tokens_processed : 0.} },{ {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, - {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} + {"value", result.n_tokens_predicted ? 1.e3 / result.t_tokens_generation * result.n_tokens_predicted : 0.} },{ {"name", "kv_cache_usage_ratio"}, {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. * kv_cache_used_cells / params.n_ctx} + {"value", 1. 
* result.kv_cache_used_cells / params.n_ctx} },{ {"name", "kv_cache_tokens"}, {"help", "KV-cache tokens."}, - {"value", (uint64_t) data.at("kv_cache_tokens_count")} + {"value", (uint64_t) result.kv_cache_tokens_count} },{ {"name", "requests_processing"}, {"help", "Number of request processing."}, - {"value", (uint64_t) data.at("processing")} + {"value", (uint64_t) result.n_processing_slots} },{ {"name", "requests_deferred"}, {"help", "Number of request deferred."}, - {"value", (uint64_t) data.at("deferred")} + {"value", (uint64_t) result.n_tasks_deferred} }}} }; @@ -2575,8 +2540,7 @@ int main(int argc, char ** argv) { } } - const int64_t t_start = data.at("t_start"); - res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); + res.set_header("Process-Start-Time-Unix", std::to_string(result.t_start)); res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.status = 200; // HTTP OK @@ -2602,14 +2566,18 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; } + + GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_SAVE_LOAD); + auto result = server_task_result_slot_save_load::from_ptr(result_raw); + res_ok(res, result.to_json()); }; const auto handle_slots_restore = [&ctx_server, &res_error, &res_ok, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { @@ -2632,14 +2600,18 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; } + + GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_SAVE_LOAD); + auto result = server_task_result_slot_save_load::from_ptr(result_raw); + res_ok(res, result.to_json()); }; const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { @@ -2652,14 +2624,18 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; } + + GGML_ASSERT(result_raw->type 
== RESULT_TYPE_SLOT_ERASE); + auto result = server_task_result_slot_erase::from_ptr(result_raw); + res_ok(res, result.to_json()); }; const auto handle_slots_action = [¶ms, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { @@ -2728,15 +2704,15 @@ int main(int argc, char ** argv) { const auto task_ids = server_task::get_list_id(tasks); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { if (results.size() == 1) { // single result - res_ok(res, results[0].data); + res_ok(res, results[0].to_json()); } else { // multiple results (multitask) json arr = json::array(); - for (const auto & res : results) { - arr.push_back(res.data); + for (auto & res : results) { + arr.push_back(res.to_json()); } res_ok(res, arr); } @@ -2747,8 +2723,8 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { - return server_sent_event(sink, "data", result.data); + ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { + return server_sent_event(sink, "data", result.to_json()); }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); @@ -2837,9 +2813,9 @@ int main(int argc, char ** argv) { const auto completion_id = gen_chatcmplid(); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](const std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose); + json result_oai = format_final_response_oaicompat(data, results[0].to_json(), completion_id, /*.streaming =*/ false, verbose); res_ok(res, result_oai); }, [&](const json & error_data) { res_error(res, error_data); @@ -2848,8 +2824,8 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { - std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); + ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { + std::vector result_array = format_partial_response_oaicompat(result.to_json(), completion_id); for (auto & event_data : result_array) { if (event_data.empty()) { continue; // skip the stop token @@ -2974,9 +2950,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { - for (const auto & res : results) { - responses.push_back(res.data); + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + for (auto & res : results) { + GGML_ASSERT(res.type == RESULT_TYPE_EMBD); + responses.push_back(res.to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3052,9 +3029,10 @@ int main(int argc, char ** argv) { // 
get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { - for (const auto & res : results) { - responses.push_back(res.data); + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + for (auto & res : results) { + GGML_ASSERT(res.type == RESULT_TYPE_RERANK); + responses.push_back(res.to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3110,11 +3088,18 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - res_ok(res, result.data); - res.status = 200; // HTTP OK + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; + } + + GGML_ASSERT(result_raw->type == RESULT_TYPE_APPLY_LORA); + auto result = server_task_result_apply_lora::from_ptr(result_raw); + res_ok(res, result.to_json()); }; // diff --git a/examples/server/server.hpp b/examples/server/server.hpp index a9287bf6dbaaf..081ad2069b05e 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -15,6 +15,8 @@ using json = nlohmann::ordered_json; +#define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())) + enum stop_type { STOP_TYPE_NONE, STOP_TYPE_EOS, @@ -65,6 +67,19 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; +enum result_type { + RESULT_TYPE_CMPL_FINAL, + RESULT_TYPE_CMPL_PARTIAL, + RESULT_TYPE_EMBD, + RESULT_TYPE_RERANK, + RESULT_TYPE_METRICS, + RESULT_TYPE_SLOT_SAVE_LOAD, + RESULT_TYPE_SLOT_ERASE, + RESULT_TYPE_APPLY_LORA, + RESULT_TYPE_ERROR, + RESULT_TYPE_UNKNOWN, // will throw an error +}; + struct server_task { int id = -1; // to be filled by server_queue int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL @@ -87,41 +102,145 @@ struct server_task { } }; +struct slot_params { + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + + std::vector antiprompt; + bool timings_per_token = false; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; + + // params only used in to_json() + int32_t n_ctx; + uint32_t seed_cur; + bool can_speculative; + + json to_json() { + std::vector samplers; + samplers.reserve(sampling.samplers.size()); + for (const auto & sampler : sampling.samplers) { + samplers.emplace_back(common_sampler_type_to_str(sampler)); + } + + return json { + {"n_ctx", n_ctx}, + {"n_predict", n_predict}, // Server configured n_predict + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", 
sampling.min_p}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", sampling.dry_sequence_breakers}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"penalize_nl", sampling.penalize_nl}, + {"stop", antiprompt}, + {"max_tokens", n_predict}, // User configured n_predict + {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + //{"logit_bias", sampling.logit_bias}, + {"n_probs", sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"grammar", sampling.grammar}, + {"samplers", samplers}, + {"speculative", can_speculative}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + }; + } +}; + struct result_timings { - int32_t prompt_n; + int32_t prompt_n = -1; double prompt_ms; double prompt_per_token_ms; double prompt_per_second; - int32_t predicted_n; + int32_t predicted_n = -1; double predicted_ms; double predicted_per_token_ms; double predicted_per_second; -}; -enum result_type { - RESULT_TYPE_CMPL_FINAL, - RESULT_TYPE_CMPL_PARTIAL, - RESULT_TYPE_EMBD, - RESULT_TYPE_RERANK, - RESULT_TYPE_ERROR, - RESULT_TYPE_UNKNOWN, // will throw an error + json to_json() { + return { + {"prompt_n", prompt_n}, + {"prompt_ms", prompt_ms}, + {"prompt_per_token_ms", prompt_per_token_ms}, + {"prompt_per_second", prompt_per_second}, + + {"predicted_n", predicted_n}, + {"predicted_ms", predicted_ms}, + {"predicted_per_token_ms", predicted_per_token_ms}, + {"predicted_per_second", predicted_per_second}, + }; + } }; struct server_task_result { result_type type = RESULT_TYPE_UNKNOWN; int id = -1; int id_slot = -1; + server_task_result() = default; + server_task_result(result_type type) : type(type) {} +}; + +inline std::string stop_type_to_str(stop_type type) { + switch (type) { + case STOP_TYPE_EOS: return "eos"; + case STOP_TYPE_WORD: return "word"; + case STOP_TYPE_LIMIT: return "limit"; + default: return "none"; + } +} + +struct completion_token_output { + llama_token tok; + std::string text_to_send; + struct token_prob { + llama_token tok; + float prob; + }; + std::vector probs; }; struct server_task_result_cmpl_final : server_task_result { - result_type type = RESULT_TYPE_CMPL_FINAL; + server_task_result_cmpl_final() : server_task_result(RESULT_TYPE_CMPL_FINAL) {} int index = 0; std::string content; bool stream; bool timings_per_token; result_timings timings; + std::string model_alias; + std::string prompt; + bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; int32_t has_new_line; @@ -130,62 +249,232 @@ struct server_task_result_cmpl_final : server_task_result { stop_type stop = STOP_TYPE_NONE; std::vector probs_output; - slot_params params; -}; + slot_params generation_params; + + json to_json() { + // non-OAI-compat JSON + return json { + {"index", index}, + {"content", content}, + {"id_slot", id_slot}, + {"stop", true}, + {"model", model_alias}, + {"tokens_predicted", 
n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + {"generation_settings", generation_params.to_json()}, + {"prompt", prompt}, + {"has_new_line", has_new_line}, + {"truncated", truncated}, + {"stop_type", stop_type_to_str(stop)}, + {"stopping_word", stopping_word}, + {"tokens_cached", n_tokens_cached}, + {"timings", timings.to_json()}, + }; + } -struct completion_token_output { - llama_token tok; - std::string text_to_send; - struct token_prob { - llama_token tok; - float prob; - }; - std::vector probs; + static server_task_result_cmpl_final from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_cmpl_final, result_ptr); + } }; struct server_task_result_cmpl_partial : server_task_result { - result_type type = RESULT_TYPE_CMPL_PARTIAL; + server_task_result_cmpl_partial() : server_task_result(RESULT_TYPE_CMPL_PARTIAL) {} int index = 0; std::string content; stop_type stop = STOP_TYPE_NONE; std::vector probs_output; result_timings timings; + + json to_json() { + json res = json { + {"index", index}, + {"content", content}, + {"stop", stop != STOP_TYPE_NONE}, + {"id_slot", id_slot}, + }; + // populate the timings object when timings_per_token is set + if (timings.prompt_n > 0) { + res.push_back({"timings", timings.to_json()}); + } + return res; + } + + static server_task_result_cmpl_partial from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_cmpl_partial, result_ptr); + } }; struct server_task_result_embd : server_task_result { + server_task_result_embd() : server_task_result(RESULT_TYPE_EMBD) {} result_type type = RESULT_TYPE_EMBD; int index = 0; std::vector embedding; + + json to_json() { + return json { + {"index", index}, + {"embedding", embedding}, + }; + } + + static server_task_result_embd from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_embd, result_ptr); + } }; struct server_task_result_rerank : server_task_result { - result_type type = RESULT_TYPE_RERANK; + server_task_result_rerank() : server_task_result(RESULT_TYPE_RERANK) {} int index = 0; float score; + + json to_json() { + return json { + {"index", index}, + {"score", score}, + }; + } + + static server_task_result_rerank from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_rerank, result_ptr); + } }; struct server_task_result_error : server_task_result { - result_type type = RESULT_TYPE_ERROR; + server_task_result_error() : server_task_result(RESULT_TYPE_ERROR) {} int index = 0; - error_type err_type; + error_type err_type = ERROR_TYPE_SERVER; std::string err_msg; + + static server_task_result_error from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_error, result_ptr); + } }; -struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt +struct server_task_result_metrics : server_task_result { + server_task_result_metrics() : server_task_result(RESULT_TYPE_METRICS) {} + int n_idle_slots; + int n_processing_slots; + int n_tasks_deferred; + int64_t t_start; - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + int32_t kv_cache_tokens_count; + int32_t kv_cache_used_cells; - int64_t t_max_prompt_ms = -1; // TODO: 
implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; + uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; - std::vector antiprompt; - bool timings_per_token = false; + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; - struct common_params_sampling sampling; - struct common_params_speculative speculative; + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + uint64_t n_decode_total = 0; + uint64_t n_busy_slots_total = 0; + + // TODO: get rid of this json object and use to_json() instead + json slots_data = json::array(); + + json to_json() { + return json { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", n_tasks_deferred }, + { "t_start", t_start }, + + { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, + { "t_tokens_generation_total", t_tokens_generation_total }, + { "n_tokens_predicted_total", n_tokens_predicted_total }, + { "t_prompt_processing_total", t_prompt_processing_total }, + + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, + { "t_prompt_processing", t_prompt_processing }, + { "n_tokens_predicted", n_tokens_predicted }, + { "t_tokens_generation", t_tokens_generation }, + + { "n_decode_total", n_decode_total }, + { "n_busy_slots_total", n_busy_slots_total }, + + { "kv_cache_tokens_count", kv_cache_tokens_count }, + { "kv_cache_used_cells", kv_cache_used_cells }, + + { "slots", slots_data }, + }; + } + + static server_task_result_metrics from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_metrics, result_ptr); + } +}; + +struct server_task_result_slot_save_load : server_task_result { + server_task_result_slot_save_load() : server_task_result(RESULT_TYPE_SLOT_SAVE_LOAD) {} + std::string filename; + bool is_save; // true = save, false = load + + size_t n_saved; + size_t n_written; + + size_t n_restored; + size_t n_read; + + double t_ms; + + json to_json() { + if (is_save) { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", n_saved }, + { "n_written", n_written }, + { "timings", { + { "save_ms", t_ms } + }}, + }; + } else { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", n_restored }, + { "n_read", n_read }, + { "timings", { + { "restore_ms", t_ms } + }}, + }; + } + } + + static server_task_result_slot_save_load from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_slot_save_load, result_ptr); + } +}; + +struct server_task_result_slot_erase : server_task_result { + server_task_result_slot_erase() : server_task_result(RESULT_TYPE_SLOT_ERASE) {} + size_t n_erased; + + json to_json() { + return json { + { "id_slot", id_slot }, + { "n_erased", n_erased }, + }; + } + + static server_task_result_slot_erase from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_slot_erase, result_ptr); + } +}; + +struct server_task_result_apply_lora : server_task_result { + server_task_result_apply_lora() : server_task_result(RESULT_TYPE_APPLY_LORA) {} + json to_json() { + return json {{ "success", true }}; + } + + static server_task_result_apply_lora from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_apply_lora, result_ptr); + } }; diff --git 
a/examples/server/utils.hpp b/examples/server/utils.hpp index d65773addf231..b01a7757fc259 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" From 0d6485f0f830d9fd3de5680e861f897d6e9312aa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 15:03:37 +0100 Subject: [PATCH 03/19] wip [no ci] --- examples/server/server.cpp | 26 +++++++++----- examples/server/server.hpp | 2 ++ examples/server/utils.hpp | 71 +++++++++++++++++--------------------- 3 files changed, 51 insertions(+), 48 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index de073b085dbe9..a673fb4158540 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -494,7 +494,9 @@ struct server_response { } // Send a new result to a waiting id_task - void send(server_task_result & result) { + template + void send(T & result) { + static_assert(std::is_base_of::value, "T must be derived from server_task_result"); SRV_DBG("sending result for task id = %d\n", result.id); std::unique_lock lock(mutex_results); @@ -502,7 +504,7 @@ struct server_response { if (result.id == id_task) { SRV_DBG("task id = %d pushed to result queue\n", result.id); - queue_results.push_back(std::make_unique(result)); + queue_results.push_back(std::make_unique(std::move(result))); condition_results.notify_all(); return; } @@ -1166,8 +1168,10 @@ struct server_context { void send_partial_response(server_slot & slot, completion_token_output tkn) { server_task_result_cmpl_partial res; - res.id = slot.id_task; - res.content = tkn.text_to_send; + res.id = slot.id_task; + res.n_decoded = slot.n_decoded; + res.n_prompt_tokens = slot.n_prompt_tokens; + res.content = tkn.text_to_send; if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); @@ -1189,7 +1193,11 @@ struct server_context { queue_results.send(res); } - void send_final_response(const server_slot & slot) { + void send_final_response(server_slot & slot) { + if (slot.params.stream) { + return send_partial_response(slot, {0, "", {}}); + } + server_task_result_cmpl_final res; res.id = slot.id_task; res.id_slot = slot.id; @@ -1380,6 +1388,7 @@ struct server_context { const std::unordered_set & id_tasks, const std::function&)> & result_handler, const std::function & error_handler) { + static_assert(std::is_base_of::value, "T must be derived from server_task_result"); std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { task_result_ptr result_raw = queue_results.recv(id_tasks); @@ -2815,7 +2824,7 @@ int main(int argc, char ** argv) { if (!stream) { ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].to_json(), completion_id, /*.streaming =*/ false, verbose); + json result_oai = format_final_response_oaicompat(data, results[0], completion_id, /*.streaming =*/ false, verbose); res_ok(res, result_oai); }, [&](const json & error_data) { res_error(res, error_data); @@ -2823,9 +2832,10 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { + std::string model_name = json_value(data, "model", 
std::string(DEFAULT_OAICOMPAT_MODEL)); + const auto chunked_content_provider = [task_ids, &ctx_server, completion_id, model_name](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { - std::vector result_array = format_partial_response_oaicompat(result.to_json(), completion_id); + std::vector result_array = format_partial_response_oaicompat(model_name, result, completion_id); for (auto & event_data : result_array) { if (event_data.empty()) { continue; // skip the stop token diff --git a/examples/server/server.hpp b/examples/server/server.hpp index 081ad2069b05e..6197ae56519fe 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -281,6 +281,8 @@ struct server_task_result_cmpl_partial : server_task_result { server_task_result_cmpl_partial() : server_task_result(RESULT_TYPE_CMPL_PARTIAL) {} int index = 0; std::string content; + int32_t n_decoded; + int32_t n_prompt_tokens; stop_type stop = STOP_TYPE_NONE; std::vector probs_output; result_timings timings; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index b01a7757fc259..98a777192027c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -583,15 +583,14 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) { - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - +static json format_final_response_oaicompat( + const json & request, + server_task_result_cmpl_final & result, + const std::string & completion_id, + bool streaming = false, + bool verbose = false) { std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { + if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) { finish_reason = "stop"; } @@ -601,7 +600,7 @@ static json format_final_response_oaicompat(const json & request, const json & r {"delta", json::object()}}}) : json::array({json{{"finish_reason", finish_reason}, {"index", 0}, - {"message", json{{"content", content}, + {"message", json{{"content", result.content}, {"role", "assistant"}}}}}); std::time_t t = std::time(0); @@ -613,48 +612,42 @@ static json format_final_response_oaicompat(const json & request, const json & r json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, {"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} + {"completion_tokens", result.n_decoded}, + {"prompt_tokens", result.n_prompt_tokens}, + {"total_tokens", result.n_decoded + result.n_prompt_tokens} }}, {"id", completion_id} }; // extra fields for debugging purposes if (verbose) { - res["__verbose"] = result; + res["__verbose"] = result.to_json(); } - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - } + // TODO: fix this + // if (result.contains("completion_probabilities")) { + // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + // } - if (result.contains("timings")) { - res.push_back({"timings", json_value(result, "timings", json::object())}); + if (result.timings.prompt_n >= 0) { + res.push_back({"timings", result.timings.to_json()}); } return res; } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({result}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); +static std::vector format_partial_response_oaicompat( + std::string modelname, + server_task_result_cmpl_partial & result, + const std::string & completion_id) { + bool first = result.n_decoded == 0; + std::string content = result.content; std::string finish_reason; - if (stopped_word || stopped_eos) { + if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) { finish_reason = "stop"; - } - if (stopped_limit) { + } else if (result.stop == STOP_TYPE_LIMIT) { finish_reason = "length"; } @@ -724,17 +717,15 @@ static std::vector format_partial_response_oaicompat(const json & result, {"object", "chat.completion.chunk"} }; - if (result.contains("timings")) { - ret.push_back({"timings", json_value(result, "timings", json::object())}); + if (result.timings.prompt_n >= 0) { + ret.push_back({"timings", result.timings.to_json()}); } if (!finish_reason.empty()) { - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); ret.push_back({"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} + {"completion_tokens", result.n_decoded}, + {"prompt_tokens", result.n_prompt_tokens}, + {"total_tokens", result.n_decoded + result.n_prompt_tokens} }}); } From d2419b325588e4086819e5be412b274679ee527a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 18:58:16 +0100 Subject: [PATCH 04/19] many fixes --- examples/server/server.cpp | 26 ++++++++++++------- examples/server/server.hpp | 24 ++++++++++++----- examples/server/tests/tests.sh | 4 +++ .../server/tests/unit/test_chat_completion.py | 15 +++++------ 4 
files changed, 45 insertions(+), 24 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a673fb4158540..c26bc08674d58 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1172,6 +1172,8 @@ struct server_context { res.n_decoded = slot.n_decoded; res.n_prompt_tokens = slot.n_prompt_tokens; res.content = tkn.text_to_send; + res.stop = slot.stop; + res.truncated = slot.truncated; if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); @@ -1186,7 +1188,8 @@ struct server_context { } } - if (slot.params.timings_per_token) { + // populate timings if this is final response or timings_per_token is enabled + if (slot.stop != STOP_TYPE_NONE || slot.params.timings_per_token) { res.timings = slot.get_timings(); } @@ -1195,6 +1198,7 @@ struct server_context { void send_final_response(server_slot & slot) { if (slot.params.stream) { + // if in stream mode, send the last partial response return send_partial_response(slot, {0, "", {}}); } @@ -1209,6 +1213,8 @@ struct server_context { res.n_tokens_cached = slot.n_past; res.content = slot.generated_text; res.stop = slot.stop; + res.truncated = slot.truncated; + res.timings = slot.get_timings(); res.generation_params = slot.params; // copy the parameters @@ -1439,6 +1445,8 @@ struct server_context { break; } + SRV_ERR("received partial result, %s\n", result.to_json().dump().c_str()); + if (result.stop != STOP_TYPE_NONE) { if (++n_finished == id_tasks.size()) { break; @@ -1533,7 +1541,7 @@ struct server_context { res.id = task.id; res.n_idle_slots = n_idle_slots; res.n_processing_slots = n_processing_slots; - res.n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res.n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res.t_start = metrics.t_start; res.kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); @@ -1627,13 +1635,13 @@ struct server_context { const double t_restore_ms = (t_end - t_start) / 1000.0; server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = false; - result.n_saved = token_count; - result.n_read = nread; - result.t_ms = t_restore_ms; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = false; + result.n_restored = token_count; + result.n_read = nread; + result.t_ms = t_restore_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_ERASE: diff --git a/examples/server/server.hpp b/examples/server/server.hpp index 6197ae56519fe..3e2fd2f527e28 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -15,6 +15,7 @@ using json = nlohmann::ordered_json; +// cast a shared_ptr to a specific type using copy constructor #define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())) enum stop_type { @@ -281,23 +282,34 @@ struct server_task_result_cmpl_partial : server_task_result { server_task_result_cmpl_partial() : server_task_result(RESULT_TYPE_CMPL_PARTIAL) {} int index = 0; std::string content; + + bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; + stop_type stop = STOP_TYPE_NONE; std::vector probs_output; result_timings timings; json to_json() { + bool is_stop = stop != STOP_TYPE_NONE; + // non-OAI-compat JSON json res = json { - {"index", index}, - {"content", content}, - {"stop", stop != STOP_TYPE_NONE}, - {"id_slot", id_slot}, + {"index", index}, + {"content", content}, + {"stop_type", stop_type_to_str(stop)}, + {"stop", 
is_stop}, + {"id_slot", id_slot}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, }; - // populate the timings object when timings_per_token is set + // populate the timings object when needed (usually for the last response or with timings_per_token enabled) if (timings.prompt_n > 0) { res.push_back({"timings", timings.to_json()}); } + if (is_stop) { + res.push_back({"truncated", truncated}); + } return res; } @@ -464,7 +476,7 @@ struct server_task_result_slot_erase : server_task_result { { "n_erased", n_erased }, }; } - + static server_task_result_slot_erase from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_slot_erase, result_ptr); } diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh index 1e285dcdac14b..1e0777de367fc 100755 --- a/examples/server/tests/tests.sh +++ b/examples/server/tests/tests.sh @@ -1,5 +1,9 @@ #!/bin/bash +# make sure we are in the right directory +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR + set -eu if [ $# -lt 1 ] diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 8a439f9ef0f29..486c1f87a0856 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -12,13 +12,13 @@ def create_server(): @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", + "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), + ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): +def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server server.start() res = server.make_request("POST", "/chat/completions", data={ @@ -35,10 +35,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte choice = res.body["choices"][0] assert "assistant" == choice["message"]["role"] assert match_regex(re_content, choice["message"]["content"]) - if truncated: - assert choice["finish_reason"] == "length" - else: - assert choice["finish_reason"] == "stop" + assert choice["finish_reason"] == finish_reason @pytest.mark.parametrize( @@ -93,7 +90,7 @@ def test_chat_completion_with_openai_library(): temperature=0.8, ) print(res) - assert res.choices[0].finish_reason == "stop" + assert res.choices[0].finish_reason == "length" assert res.choices[0].message.content is not None assert match_regex("(Suddenly)+", res.choices[0].message.content) From ea1be7f8acf7ec0a04327b6ec899d16c9293629b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 19:18:56 +0100 Subject: [PATCH 05/19] add virtual function --- examples/server/server.cpp | 28 ++++++++++++++-------------- examples/server/server.hpp | 35 ++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/examples/server/server.cpp 
b/examples/server/server.cpp index c26bc08674d58..d299d7274e91e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1592,13 +1592,13 @@ struct server_context { const double t_save_ms = (t_end - t_start) / 1000.0; server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = true; - result.n_saved = token_count; - result.n_written = nwrite; - result.t_ms = t_save_ms; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = true; + result.n_tokens = token_count; + result.n_bytes = nwrite; + result.t_ms = t_save_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: @@ -1635,13 +1635,13 @@ struct server_context { const double t_restore_ms = (t_end - t_start) / 1000.0; server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = false; - result.n_restored = token_count; - result.n_read = nread; - result.t_ms = t_restore_ms; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = false; + result.n_tokens = token_count; + result.n_bytes = nread; + result.t_ms = t_restore_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_ERASE: diff --git a/examples/server/server.hpp b/examples/server/server.hpp index 3e2fd2f527e28..e9c94fa56484c 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -16,7 +16,7 @@ using json = nlohmann::ordered_json; // cast a shared_ptr to a specific type using copy constructor -#define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())) +#define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())); enum stop_type { STOP_TYPE_NONE, @@ -210,6 +210,7 @@ struct server_task_result { int id_slot = -1; server_task_result() = default; server_task_result(result_type type) : type(type) {} + virtual ~server_task_result() = default; }; inline std::string stop_type_to_str(stop_type type) { @@ -276,6 +277,8 @@ struct server_task_result_cmpl_final : server_task_result { static server_task_result_cmpl_final from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_cmpl_final, result_ptr); } + + virtual ~server_task_result_cmpl_final() = default; }; struct server_task_result_cmpl_partial : server_task_result { @@ -316,6 +319,8 @@ struct server_task_result_cmpl_partial : server_task_result { static server_task_result_cmpl_partial from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_cmpl_partial, result_ptr); } + + virtual ~server_task_result_cmpl_partial() = default; }; struct server_task_result_embd : server_task_result { @@ -334,6 +339,8 @@ struct server_task_result_embd : server_task_result { static server_task_result_embd from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_embd, result_ptr); } + + virtual ~server_task_result_embd() = default; }; struct server_task_result_rerank : server_task_result { @@ -351,6 +358,8 @@ struct server_task_result_rerank : server_task_result { static server_task_result_rerank from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_rerank, result_ptr); } + + virtual ~server_task_result_rerank() = default; }; struct server_task_result_error : server_task_result { @@ -362,6 +371,8 @@ struct server_task_result_error : server_task_result { static server_task_result_error from_ptr(std::unique_ptr & result_ptr) { return 
copy_cast_ptr(server_task_result_error, result_ptr); } + + virtual ~server_task_result_error() = default; }; struct server_task_result_metrics : server_task_result { @@ -422,6 +433,8 @@ struct server_task_result_metrics : server_task_result { static server_task_result_metrics from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_metrics, result_ptr); } + + virtual ~server_task_result_metrics() = default; }; struct server_task_result_slot_save_load : server_task_result { @@ -429,12 +442,8 @@ struct server_task_result_slot_save_load : server_task_result { std::string filename; bool is_save; // true = save, false = load - size_t n_saved; - size_t n_written; - - size_t n_restored; - size_t n_read; - + size_t n_tokens; + size_t n_bytes; double t_ms; json to_json() { @@ -442,8 +451,8 @@ struct server_task_result_slot_save_load : server_task_result { return json { { "id_slot", id_slot }, { "filename", filename }, - { "n_saved", n_saved }, - { "n_written", n_written }, + { "n_saved", n_tokens }, + { "n_written", n_bytes }, { "timings", { { "save_ms", t_ms } }}, @@ -452,8 +461,8 @@ struct server_task_result_slot_save_load : server_task_result { return json { { "id_slot", id_slot }, { "filename", filename }, - { "n_restored", n_restored }, - { "n_read", n_read }, + { "n_restored", n_tokens }, + { "n_read", n_bytes }, { "timings", { { "restore_ms", t_ms } }}, @@ -464,6 +473,8 @@ struct server_task_result_slot_save_load : server_task_result { static server_task_result_slot_save_load from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_slot_save_load, result_ptr); } + + virtual ~server_task_result_slot_save_load() = default; }; struct server_task_result_slot_erase : server_task_result { @@ -480,6 +491,8 @@ struct server_task_result_slot_erase : server_task_result { static server_task_result_slot_erase from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_slot_erase, result_ptr); } + + virtual ~server_task_result_slot_erase() = default; }; struct server_task_result_apply_lora : server_task_result { From 3b41ad53a3223fa7ace9d1323191f5654fd498e4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 19:26:36 +0100 Subject: [PATCH 06/19] fix index --- examples/server/server.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d299d7274e91e..60947a17f6a77 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1169,6 +1169,7 @@ struct server_context { void send_partial_response(server_slot & slot, completion_token_output tkn) { server_task_result_cmpl_partial res; res.id = slot.id_task; + res.index = slot.index; res.n_decoded = slot.n_decoded; res.n_prompt_tokens = slot.n_prompt_tokens; res.content = tkn.text_to_send; @@ -1205,6 +1206,7 @@ struct server_context { server_task_result_cmpl_final res; res.id = slot.id_task; res.id_slot = slot.id; + res.index = slot.index; res.content = slot.generated_text; res.n_decoded = slot.n_decoded; @@ -1411,7 +1413,7 @@ struct server_context { || result_raw->type == RESULT_TYPE_EMBD || result_raw->type == RESULT_TYPE_RERANK ) { - auto result = T::from_ptr(result_raw); + T result = T::from_ptr(result_raw); const size_t idx = result.index; GGML_ASSERT(idx < results.size() && "index out of range"); results[idx] = result; From 12610861639c30201bf6071fc951fd5954bb2b2e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 19:36:37 +0100 Subject: [PATCH 07/19] minor 
style fix --- examples/server/server.cpp | 23 +++++++++++++++-------- examples/server/server.hpp | 9 +++++---- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 60947a17f6a77..469663b2e2573 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1170,12 +1170,15 @@ struct server_context { server_task_result_cmpl_partial res; res.id = slot.id_task; res.index = slot.index; + res.content = tkn.text_to_send; + + res.truncated = slot.truncated; res.n_decoded = slot.n_decoded; res.n_prompt_tokens = slot.n_prompt_tokens; - res.content = tkn.text_to_send; + res.stop = slot.stop; - res.truncated = slot.truncated; + // populate res.probs_output if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); @@ -1206,20 +1209,22 @@ struct server_context { server_task_result_cmpl_final res; res.id = slot.id_task; res.id_slot = slot.id; + res.index = slot.index; res.content = slot.generated_text; + res.timings = slot.get_timings(); + res.model_alias = slot.oaicompat_model; + res.prompt = common_detokenize(ctx, slot.prompt_tokens, true); + res.truncated = slot.truncated; res.n_decoded = slot.n_decoded; res.n_prompt_tokens = slot.n_prompt_tokens; - res.has_new_line = slot.has_new_line; res.n_tokens_cached = slot.n_past; - res.content = slot.generated_text; + res.has_new_line = slot.has_new_line; + res.stopping_word = slot.stopping_word; res.stop = slot.stop; - res.truncated = slot.truncated; - res.timings = slot.get_timings(); - - res.generation_params = slot.params; // copy the parameters + // populate res.probs_output if (slot.params.sampling.n_probs > 0) { if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); @@ -1235,6 +1240,8 @@ struct server_context { } } + res.generation_params = slot.params; // copy the parameters + queue_results.send(res); } diff --git a/examples/server/server.hpp b/examples/server/server.hpp index e9c94fa56484c..1e65614f62ac7 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -237,7 +237,6 @@ struct server_task_result_cmpl_final : server_task_result { int index = 0; std::string content; bool stream; - bool timings_per_token; result_timings timings; std::string model_alias; std::string prompt; @@ -245,10 +244,11 @@ struct server_task_result_cmpl_final : server_task_result { bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; - int32_t has_new_line; - int32_t stopping_word; int32_t n_tokens_cached; + int32_t has_new_line; + std::string stopping_word; stop_type stop = STOP_TYPE_NONE; + std::vector probs_output; slot_params generation_params; @@ -291,6 +291,7 @@ struct server_task_result_cmpl_partial : server_task_result { int32_t n_prompt_tokens; stop_type stop = STOP_TYPE_NONE; + std::vector probs_output; result_timings timings; @@ -346,7 +347,7 @@ struct server_task_result_embd : server_task_result { struct server_task_result_rerank : server_task_result { server_task_result_rerank() : server_task_result(RESULT_TYPE_RERANK) {} int index = 0; - float score; + float score = -1e6; json to_json() { return json { From eaa12887da2c73d4ca50170eee112d4c847ba4c7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 19:52:28 +0100 Subject: [PATCH 08/19] add std::move --- examples/server/server.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 469663b2e2573..9057c0a4c5d0d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1423,7 +1423,7 @@ struct server_context { T result = T::from_ptr(result_raw); const size_t idx = result.index; GGML_ASSERT(idx < results.size() && "index out of range"); - results[idx] = result; + results[idx] = std::move(result); } else { GGML_ASSERT(false && "unexpected result type"); } From cb666718b1d4fe94de819f4888035d53b73b4133 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 23:53:25 +0100 Subject: [PATCH 09/19] refactor handle_completions_generic --- examples/server/server.cpp | 102 ++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9057c0a4c5d0d..0ab09db22934c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2716,7 +2716,16 @@ int main(int argc, char ** argv) { res_ok(res, {{ "success", true }}); }; - const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { + // handle completion-like requests (completion, chat, infill) + // we can optionally provide a custom format for partial results and final results + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( + server_task_inf_type inf_type, + json & data, + httplib::Response & res, + const std::function(server_task_result_cmpl_partial&)> & format_partial = nullptr, + const std::function&)> & format_final = nullptr, + // wether to send [DONE] event after completion (required for OAI-compat) + bool send_done_event = false) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2731,7 +2740,9 @@ int main(int argc, char ** argv) { if (!stream) { ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - if (results.size() == 1) { + if (format_final) { + res_ok(res, format_final(results)); + } else if (results.size() == 1) { // single result res_ok(res, results[0].to_json()); } else { @@ -2748,12 +2759,25 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { + const auto chunked_content_provider = [task_ids, &ctx_server, format_partial = std::move(format_partial), send_done_event](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { - return server_sent_event(sink, "data", result.to_json()); + if (format_partial) { + for (const auto & res : format_partial(result)) { + if (!server_sent_event(sink, "data", res)) { + return false; + } + } + return true; + } else { + return server_sent_event(sink, "data", result.to_json()); + } }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); + if (send_done_event) { + static const std::string ev_done = "data: [DONE]\n\n"; + sink.write(ev_done.data(), ev_done.size()); + } sink.done(); return false; }; @@ -2768,7 +2792,13 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res); + return handle_completions_generic( + SERVER_TASK_INF_TYPE_COMPLETION, + data, + res, + // TODO: support OAI-compat response via format_partial and format_final + /* format_partial */ nullptr, + /* format_final */ nullptr); }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -2821,8 +2851,7 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); }; - // TODO: maybe merge this function with "handle_completions_generic" - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic, verbose](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2830,53 +2859,20 @@ int main(int argc, char ** argv) { json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - std::vector tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION); - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(tasks); - - bool stream = json_value(data, "stream", false); - const auto task_ids = server_task::get_list_id(tasks); const auto completion_id = gen_chatcmplid(); - - if (!stream) { - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0], completion_id, /*.streaming =*/ false, verbose); - res_ok(res, result_oai); - }, [&](const json & error_data) { - res_error(res, error_data); - }); - - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - } else { - std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - const auto chunked_content_provider = [task_ids, &ctx_server, completion_id, model_name](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { - std::vector result_array = format_partial_response_oaicompat(model_name, result, completion_id); - for (auto & event_data : result_array) { - if (event_data.empty()) { - continue; // skip the stop token - } - if (!server_sent_event(sink, "data", event_data)) { - return false; // connection is closed - } - } - return true; // ok - }, [&](const json & error_data) { - server_sent_event(sink, "error", error_data); - }); - static const std::string ev_done = "data: [DONE]\n\n"; - sink.write(ev_done.data(), ev_done.size()); - sink.done(); - return true; - }; - - auto on_complete = [task_ids, &ctx_server] (bool) { - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } + std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + return handle_completions_generic( + SERVER_TASK_INF_TYPE_COMPLETION, + data, + res, + /* format_partial */ [data, model_name, completion_id](server_task_result_cmpl_partial & result) { + return format_partial_response_oaicompat(model_name, result, completion_id); + }, + /* format_final */ [data, verbose, model_name](std::vector & results) { + return format_final_response_oaicompat(data, results[0], model_name, false, verbose); + }, + /* send_done_event */ true); }; const auto handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { From 8ab173c865e76a4078c9ea461589e15e6e74c631 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 14:44:06 +0100 Subject: [PATCH 10/19] add virtual functions --- examples/server/server.cpp | 437 +++++++++++++++++-------------------- examples/server/server.hpp | 323 ++++++++++++++++++++------- examples/server/utils.hpp | 189 ---------------- 3 files changed, 440 insertions(+), 509 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0ab09db22934c..c8cb48b15c6ba 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -84,9 +84,6 @@ struct server_slot { bool truncated = false; stop_type stop; - bool oaicompat = false; - - std::string oaicompat_model; 
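// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only — not part of this patch. It shows,
// in a heavily simplified and self-contained form, the pattern this series is
// moving toward: results become heap-allocated objects derived from a common
// base with virtual to_json()/is_error(), travel as std::unique_ptr, and the
// consumer inspects the concrete type with dynamic_cast instead of a manual
// type enum. All demo_* names below are invented for this sketch and do not
// exist in the server code; only the nlohmann::ordered_json usage and the
// unique_ptr/dynamic_cast idiom mirror what the patch does.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

struct demo_result {
    int id = -1;
    virtual json to_json() = 0;
    virtual bool is_error() { return false; }
    virtual ~demo_result() = default;
};

struct demo_result_final : demo_result {
    std::string content;
    json to_json() override { return json {{"id", id}, {"content", content}}; }
};

struct demo_result_error : demo_result {
    std::string message;
    bool is_error() override { return true; }
    json to_json() override { return json {{"id", id}, {"error", message}}; }
};

int main() {
    std::vector<std::unique_ptr<demo_result>> queue;

    auto ok = std::make_unique<demo_result_final>();
    ok->id      = 1;
    ok->content = "hello";
    queue.push_back(std::move(ok)); // ownership moves into the queue

    auto err = std::make_unique<demo_result_error>();
    err->id      = 2;
    err->message = "something went wrong";
    queue.push_back(std::move(err));

    for (auto & res : queue) {
        if (res->is_error()) {
            std::cout << "error: " << res->to_json().dump() << "\n";
        } else if (auto * fin = dynamic_cast<demo_result_final *>(res.get())) {
            std::cout << "final: " << fin->to_json().dump() << "\n";
        }
    }
}
// Design choice illustrated: because each result type serializes itself via a
// virtual to_json(), the HTTP layer no longer needs to know the JSON shape of
// every task, and the result_type enum plus the copy_cast_ptr casts can later
// be dropped in favor of plain dynamic_cast checks.
// ---------------------------------------------------------------------------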
std::string stopping_word; // sampling @@ -494,17 +491,15 @@ struct server_response { } // Send a new result to a waiting id_task - template - void send(T & result) { - static_assert(std::is_base_of::value, "T must be derived from server_task_result"); - SRV_DBG("sending result for task id = %d\n", result.id); + void send(task_result_ptr && result) { + SRV_DBG("sending result for task id = %d\n", result->id); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { - if (result.id == id_task) { - SRV_DBG("task id = %d pushed to result queue\n", result.id); + if (result->id == id_task) { + SRV_DBG("task id = %d pushed to result queue\n", result->id); - queue_results.push_back(std::make_unique(std::move(result))); + queue_results.emplace_back(std::move(result)); condition_results.notify_all(); return; } @@ -791,13 +786,16 @@ struct server_context { const auto & data = task.data; if (data.count("__oaicompat") != 0) { - slot.oaicompat = true; - slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot.params.oaicompat = true; + slot.params.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); } else { - slot.oaicompat = false; - slot.oaicompat_model = ""; + slot.params.oaicompat = false; } + + // enabling this will output extra debug information in the HTTP responses from the server + slot.params.verbose = params_base.verbosity > 9; slot.params.timings_per_token = json_value(data, "timings_per_token", false); slot.params.stream = json_value(data, "stream", false); @@ -1158,25 +1156,29 @@ struct server_context { void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); - server_task_result_error res; - res.id = id_task; - res.err_type = type; - res.err_msg = error; + auto res = std::make_unique(); + res->id = id_task; + res->err_type = type; + res->err_msg = error; - queue_results.send(res); + queue_results.send(std::move(res)); } void send_partial_response(server_slot & slot, completion_token_output tkn) { - server_task_result_cmpl_partial res; - res.id = slot.id_task; - res.index = slot.index; - res.content = tkn.text_to_send; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; + res->content = tkn.text_to_send; + + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; - res.truncated = slot.truncated; - res.n_decoded = slot.n_decoded; - res.n_prompt_tokens = slot.n_prompt_tokens; + res->stop = slot.stop; - res.stop = slot.stop; + res->oaicompat_model = slot.params.oaicompat_model; + res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + res->verbose = slot.params.verbose; // populate res.probs_output if (slot.params.sampling.n_probs > 0) { @@ -1186,7 +1188,7 @@ struct server_context { std::vector probs_output; if (probs_pos < probs_stop_pos) { - res.probs_output = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); } @@ -1194,10 +1196,10 @@ struct server_context { // populate timings if this is final response or timings_per_token is enabled if (slot.stop != STOP_TYPE_NONE || slot.params.timings_per_token) { - res.timings = slot.get_timings(); + res->timings = slot.get_timings(); } - queue_results.send(res); + 
queue_results.send(std::move(res)); } void send_final_response(server_slot & slot) { @@ -1206,23 +1208,26 @@ struct server_context { return send_partial_response(slot, {0, "", {}}); } - server_task_result_cmpl_final res; - res.id = slot.id_task; - res.id_slot = slot.id; + auto res = std::make_unique(); + res->id = slot.id_task; + res->id_slot = slot.id; - res.index = slot.index; - res.content = slot.generated_text; - res.timings = slot.get_timings(); - res.model_alias = slot.oaicompat_model; - res.prompt = common_detokenize(ctx, slot.prompt_tokens, true); + res->index = slot.index; + res->content = slot.generated_text; + res->timings = slot.get_timings(); + res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); - res.truncated = slot.truncated; - res.n_decoded = slot.n_decoded; - res.n_prompt_tokens = slot.n_prompt_tokens; - res.n_tokens_cached = slot.n_past; - res.has_new_line = slot.has_new_line; - res.stopping_word = slot.stopping_word; - res.stop = slot.stop; + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + res->n_tokens_cached = slot.n_past; + res->has_new_line = slot.has_new_line; + res->stopping_word = slot.stopping_word; + res->stop = slot.stop; + + res->oaicompat_model = slot.params.oaicompat_model; + res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + res->verbose = slot.params.verbose; // populate res.probs_output if (slot.params.sampling.n_probs > 0) { @@ -1230,25 +1235,25 @@ struct server_context { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - res.probs_output = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end() - safe_offset); } else { - res.probs_output = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end()); } } - res.generation_params = slot.params; // copy the parameters + res->generation_params = slot.params; // copy the parameters - queue_results.send(res); + queue_results.send(std::move(res)); } void send_embedding(const server_slot & slot, const llama_batch & batch) { - server_task_result_embd res; - res.id = slot.id_task; - res.index = slot.index; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; const int n_embd = llama_n_embd(model); @@ -1267,23 +1272,23 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.embedding = std::vector(n_embd, 0.0f); + res->embedding = std::vector(n_embd, 0.0f); continue; } common_embd_normalize(embd, embd_res.data(), n_embd); - res.embedding = embd_res; + res->embedding = embd_res; } SLT_DBG(slot, "%s", "sending embeddings\n"); - queue_results.send(res); + queue_results.send(std::move(res)); } void send_rerank(const server_slot & slot, const llama_batch & batch) { - server_task_result_rerank res; - res.id = slot.id_task; - res.index = slot.index; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { @@ -1298,16 +1303,16 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.score = -1e6; + res->score = -1e6; continue; } - 
res.score = embd[0]; + res->score = embd[0]; } - SLT_DBG(slot, "sending rerank result, res.score = %f\n", res.score); + SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); - queue_results.send(res); + queue_results.send(std::move(res)); } // @@ -1398,35 +1403,28 @@ struct server_context { } // receive the results from task(s) created by create_tasks_inference - template void receive_multi_results( const std::unordered_set & id_tasks, - const std::function&)> & result_handler, + const std::function&)> & result_handler, const std::function & error_handler) { - static_assert(std::is_base_of::value, "T must be derived from server_task_result"); - std::vector results(id_tasks.size()); + std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { - task_result_ptr result_raw = queue_results.recv(id_tasks); + task_result_ptr result = queue_results.recv(id_tasks); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - error_handler(format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + error_handler(result->to_json()); cancel_tasks(id_tasks); return; } - if ( - result_raw->type == RESULT_TYPE_CMPL_FINAL - || result_raw->type == RESULT_TYPE_EMBD - || result_raw->type == RESULT_TYPE_RERANK - ) { - T result = T::from_ptr(result_raw); - const size_t idx = result.index; - GGML_ASSERT(idx < results.size() && "index out of range"); - results[idx] = std::move(result); - } else { - GGML_ASSERT(false && "unexpected result type"); - } + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); + const size_t idx = result->get_index(); + GGML_ASSERT(idx < results.size() && "index out of range"); + results[idx] = std::move(result); } result_handler(results); } @@ -1434,29 +1432,25 @@ struct server_context { // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const - std::function & result_handler, const + std::function & result_handler, const std::function & error_handler) { size_t n_finished = 0; while (true) { - task_result_ptr result_raw = queue_results.recv(id_tasks); + task_result_ptr result = queue_results.recv(id_tasks); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - error_handler(format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + error_handler(result->to_json()); cancel_tasks(id_tasks); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_CMPL_PARTIAL); - auto result = server_task_result_cmpl_partial::from_ptr(result_raw); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); if (!result_handler(result)) { cancel_tasks(id_tasks); break; } - SRV_ERR("received partial result, %s\n", result.to_json().dump().c_str()); - - if (result.stop != STOP_TYPE_NONE) { + if (result->is_stop()) { if (++n_finished == id_tasks.size()) { break; } @@ -1546,33 +1540,33 @@ struct server_context { } SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); - server_task_result_metrics res; - res.id = task.id; - res.n_idle_slots = n_idle_slots; - res.n_processing_slots = n_processing_slots; - res.n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); - res.t_start = metrics.t_start; + auto res = std::make_unique(); + res->id = task.id; + res->n_idle_slots = 
n_idle_slots; + res->n_processing_slots = n_processing_slots; + res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res->t_start = metrics.t_start; - res.kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); - res.kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); + res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); - res.n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; - res.t_prompt_processing_total = metrics.t_prompt_processing_total; - res.n_tokens_predicted_total = metrics.n_tokens_predicted_total; - res.t_tokens_generation_total = metrics.t_tokens_generation_total; + res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; + res->t_prompt_processing_total = metrics.t_prompt_processing_total; + res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; + res->t_tokens_generation_total = metrics.t_tokens_generation_total; - res.n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; - res.t_prompt_processing = metrics.t_prompt_processing; - res.n_tokens_predicted = metrics.n_tokens_predicted; - res.t_tokens_generation = metrics.t_tokens_generation; + res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; + res->t_prompt_processing = metrics.t_prompt_processing; + res->n_tokens_predicted = metrics.n_tokens_predicted; + res->t_tokens_generation = metrics.t_tokens_generation; - res.n_decode_total = metrics.n_decode_total; - res.n_busy_slots_total = metrics.n_busy_slots_total; + res->n_decode_total = metrics.n_decode_total; + res->n_busy_slots_total = metrics.n_busy_slots_total; if (json_value(task.data, "reset_bucket", false)) { metrics.reset_bucket(); } - queue_results.send(res); + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_SAVE: { @@ -1600,15 +1594,15 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; - server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = true; - result.n_tokens = token_count; - result.n_bytes = nwrite; - result.t_ms = t_save_ms; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = true; + res->n_tokens = token_count; + res->n_bytes = nwrite; + res->t_ms = t_save_ms; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { @@ -1643,15 +1637,15 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; - server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = false; - result.n_tokens = token_count; - result.n_bytes = nread; - result.t_ms = t_restore_ms; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = false; + res->n_tokens = token_count; + res->n_bytes = nread; + res->t_ms = t_restore_ms; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_ERASE: { @@ -1673,18 +1667,18 @@ struct server_context { llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); - server_task_result_slot_erase result; - result.id = task.id; - result.id_slot = id_slot; - result.n_erased = n_erased; - 
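// ---------------------------------------------------------------------------
// [Editor's note] Another illustrative sketch, not part of the patch: a
// minimal, self-contained queue that transports results as std::unique_ptr,
// mimicking how the refactored handlers now build results with
// std::make_unique and hand them off via queue_results.send(std::move(res)).
// The names demo_payload and demo_result_queue are invented for this example;
// the real server_response additionally tracks waiting task ids and serves
// multiple consumers.
#include <condition_variable>
#include <deque>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>

struct demo_payload {
    int id = -1;
    std::string text;
};

struct demo_result_queue {
    std::mutex mutex;
    std::condition_variable cond;
    std::deque<std::unique_ptr<demo_payload>> results;

    // takes ownership of the result; no copy of the payload is made
    void send(std::unique_ptr<demo_payload> && res) {
        std::unique_lock<std::mutex> lock(mutex);
        results.emplace_back(std::move(res));
        cond.notify_one();
    }

    // blocks until a result is available, then hands ownership to the caller
    std::unique_ptr<demo_payload> recv() {
        std::unique_lock<std::mutex> lock(mutex);
        cond.wait(lock, [&] { return !results.empty(); });
        auto res = std::move(results.front());
        results.pop_front();
        return res;
    }
};

int main() {
    demo_result_queue queue;

    auto res = std::make_unique<demo_payload>();
    res->id   = 42;
    res->text = "slot erased";
    queue.send(std::move(res)); // ownership is transferred into the queue

    auto received = queue.recv();
    std::cout << received->id << ": " << received->text << "\n";
}
// Storing heterogeneous results behind a single owning pointer type is what
// lets the consumer side either dispatch on the concrete type (dynamic_cast)
// or just call the virtual to_json(), as the surrounding hunks do.
// ---------------------------------------------------------------------------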
queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->n_erased = n_erased; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SET_LORA: { common_lora_adapters_apply(ctx, loras); - server_task_result_apply_lora result; - result.id = task.id; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + queue_results.send(std::move(res)); } break; } } @@ -2250,10 +2244,6 @@ int main(int argc, char ** argv) { common_init(); - // enabling this will output extra debug information in the HTTP responses from the server - // see format_final_response_oaicompat() - const bool verbose = params.verbosity > 9; - // struct that contains llama context and inference server_context ctx_server; @@ -2445,26 +2435,27 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - task_result_ptr result_raw = ctx_server.queue_results.recv(task.id); + task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); - if (result_raw->type != RESULT_TYPE_METRICS) { - SRV_ERR("Unexpected result type: %d\n", result_raw->type); - res_error(res, format_error_response("Unexpected result type", ERROR_TYPE_SERVER)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - auto result = server_task_result_metrics::from_ptr(result_raw); + // TODO: get rid of this dynamic_cast + auto res_metrics = dynamic_cast(result.get()); + GGML_ASSERT(res_metrics != nullptr); // optionally return "fail_on_no_slot" error if (req.has_param("fail_on_no_slot")) { - if (result.n_idle_slots == 0) { + if (res_metrics->n_idle_slots == 0) { res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); return; } } - res_ok(res, result.slots_data); + res_ok(res, res_metrics->slots_data); }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { @@ -2484,68 +2475,69 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - task_result_ptr result_raw = ctx_server.queue_results.recv(task.id); + task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_METRICS); - auto result = server_task_result_metrics::from_ptr(result_raw); + // TODO: get rid of this dynamic_cast + auto res_metrics = dynamic_cast(result.get()); + GGML_ASSERT(res_metrics != nullptr); // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names json all_metrics_def = json { {"counter", {{ {"name", "prompt_tokens_total"}, {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) result.n_prompt_tokens_processed_total} + {"value", (uint64_t) res_metrics->n_prompt_tokens_processed_total} }, { {"name", "prompt_seconds_total"}, {"help", "Prompt process time"}, - {"value", (uint64_t) result.t_prompt_processing_total / 1.e3} + {"value", (uint64_t) res_metrics->t_prompt_processing_total / 1.e3} }, { {"name", "tokens_predicted_total"}, {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) result.n_tokens_predicted_total} + 
{"value", (uint64_t) res_metrics->n_tokens_predicted_total} }, { {"name", "tokens_predicted_seconds_total"}, {"help", "Predict process time"}, - {"value", (uint64_t) result.t_tokens_generation_total / 1.e3} + {"value", (uint64_t) res_metrics->t_tokens_generation_total / 1.e3} }, { {"name", "n_decode_total"}, {"help", "Total number of llama_decode() calls"}, - {"value", result.n_decode_total} + {"value", res_metrics->n_decode_total} }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) result.n_busy_slots_total / (float) result.n_decode_total} + {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, {"help", "Average prompt throughput in tokens/s."}, - {"value", result.n_prompt_tokens_processed ? 1.e3 / result.t_prompt_processing * result.n_prompt_tokens_processed : 0.} + {"value", res_metrics->n_prompt_tokens_processed ? 1.e3 / res_metrics->t_prompt_processing * res_metrics->n_prompt_tokens_processed : 0.} },{ {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, - {"value", result.n_tokens_predicted ? 1.e3 / result.t_tokens_generation * result.n_tokens_predicted : 0.} + {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.} },{ {"name", "kv_cache_usage_ratio"}, {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. * result.kv_cache_used_cells / params.n_ctx} + {"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx} },{ {"name", "kv_cache_tokens"}, {"help", "KV-cache tokens."}, - {"value", (uint64_t) result.kv_cache_tokens_count} + {"value", (uint64_t) res_metrics->kv_cache_tokens_count} },{ {"name", "requests_processing"}, {"help", "Number of request processing."}, - {"value", (uint64_t) result.n_processing_slots} + {"value", (uint64_t) res_metrics->n_processing_slots} },{ {"name", "requests_deferred"}, {"help", "Number of request deferred."}, - {"value", (uint64_t) result.n_tasks_deferred} + {"value", (uint64_t) res_metrics->n_tasks_deferred} }}} }; @@ -2566,7 +2558,7 @@ int main(int argc, char ** argv) { } } - res.set_header("Process-Start-Time-Unix", std::to_string(result.t_start)); + res.set_header("Process-Start-Time-Unix", std::to_string(res_metrics->t_start)); res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.status = 200; // HTTP OK @@ -2592,18 +2584,15 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); + task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_SAVE_LOAD); - auto result = server_task_result_slot_save_load::from_ptr(result_raw); - res_ok(res, result.to_json()); + res_ok(res, result->to_json()); }; const auto handle_slots_restore = [&ctx_server, &res_error, &res_ok, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { @@ -2626,18 +2615,16 @@ int main(int argc, char ** argv) { const int id_task = 
ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); + task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_SAVE_LOAD); - auto result = server_task_result_slot_save_load::from_ptr(result_raw); - res_ok(res, result.to_json()); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { @@ -2650,18 +2637,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); + task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_ERASE); - auto result = server_task_result_slot_erase::from_ptr(result_raw); - res_ok(res, result.to_json()); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; const auto handle_slots_action = [¶ms, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { @@ -2722,15 +2707,13 @@ int main(int argc, char ** argv) { server_task_inf_type inf_type, json & data, httplib::Response & res, - const std::function(server_task_result_cmpl_partial&)> & format_partial = nullptr, - const std::function&)> & format_final = nullptr, - // wether to send [DONE] event after completion (required for OAI-compat) - bool send_done_event = false) { + bool oai_compat = false) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } + data["completion_id"] = gen_chatcmplid(); std::vector tasks = ctx_server.create_tasks_inference(data, inf_type); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -2739,17 +2722,15 @@ int main(int argc, char ** argv) { const auto task_ids = server_task::get_list_id(tasks); if (!stream) { - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - if (format_final) { - res_ok(res, format_final(results)); - } else if (results.size() == 1) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + if (results.size() == 1) { // single result - res_ok(res, results[0].to_json()); + res_ok(res, oai_compat ? results[0]->to_json_oai_compat() : results[0]->to_json()); } else { // multiple results (multitask) json arr = json::array(); for (auto & res : results) { - arr.push_back(res.to_json()); + arr.push_back(oai_compat ? 
res->to_json_oai_compat() : res->to_json()); } res_ok(res, arr); } @@ -2759,22 +2740,23 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server, format_partial = std::move(format_partial), send_done_event](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { - if (format_partial) { - for (const auto & res : format_partial(result)) { + const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { + ctx_server.receive_cmpl_results_stream(task_ids, [&](task_result_ptr & result) -> bool { + json res_json = oai_compat ? result->to_json_oai_compat() : result->to_json(); + if (res_json.is_array()) { + for (const auto & res : res_json) { if (!server_sent_event(sink, "data", res)) { return false; } } return true; } else { - return server_sent_event(sink, "data", result.to_json()); + return server_sent_event(sink, "data", res_json); } }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); - if (send_done_event) { + if (oai_compat) { static const std::string ev_done = "data: [DONE]\n\n"; sink.write(ev_done.data(), ev_done.size()); } @@ -2792,13 +2774,7 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic( - SERVER_TASK_INF_TYPE_COMPLETION, - data, - res, - // TODO: support OAI-compat response via format_partial and format_final - /* format_partial */ nullptr, - /* format_final */ nullptr); + return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res); }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -2851,7 +2827,7 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); }; - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic, verbose](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2859,20 +2835,9 @@ int main(int argc, char ** argv) { json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - const auto completion_id = gen_chatcmplid(); std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - return handle_completions_generic( - SERVER_TASK_INF_TYPE_COMPLETION, - data, - res, - /* format_partial */ [data, model_name, completion_id](server_task_result_cmpl_partial & result) { - return format_partial_response_oaicompat(model_name, result, completion_id); - }, - /* format_final */ [data, verbose, model_name](std::vector & results) { - return format_final_response_oaicompat(data, results[0], model_name, false, verbose); - }, - /* send_done_event */ true); + return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); }; const auto handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { @@ -2973,10 +2938,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { for (auto & res : results) { - GGML_ASSERT(res.type == RESULT_TYPE_EMBD); - responses.push_back(res.to_json()); + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3052,10 +3017,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { for (auto & res : results) { - GGML_ASSERT(res.type == RESULT_TYPE_RERANK); - responses.push_back(res.to_json()); + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3111,18 +3076,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); + task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_APPLY_LORA); - auto result = server_task_result_apply_lora::from_ptr(result_raw); - res_ok(res, result.to_json()); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; // diff --git a/examples/server/server.hpp b/examples/server/server.hpp index 1e65614f62ac7..201f154560e53 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -15,9 +15,6 @@ using json = nlohmann::ordered_json; -// cast a shared_ptr to a specific type using copy constructor -#define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())); - enum stop_type { STOP_TYPE_NONE, STOP_TYPE_EOS, @@ -68,19 +65,6 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; -enum result_type { - 
RESULT_TYPE_CMPL_FINAL, - RESULT_TYPE_CMPL_PARTIAL, - RESULT_TYPE_EMBD, - RESULT_TYPE_RERANK, - RESULT_TYPE_METRICS, - RESULT_TYPE_SLOT_SAVE_LOAD, - RESULT_TYPE_SLOT_ERASE, - RESULT_TYPE_APPLY_LORA, - RESULT_TYPE_ERROR, - RESULT_TYPE_UNKNOWN, // will throw an error -}; - struct server_task { int id = -1; // to be filled by server_queue int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL @@ -126,6 +110,12 @@ struct slot_params { uint32_t seed_cur; bool can_speculative; + // OAI-compat fields + bool oaicompat = false; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + json to_json() { std::vector samplers; samplers.reserve(sampling.samplers.size()); @@ -205,11 +195,24 @@ struct result_timings { }; struct server_task_result { - result_type type = RESULT_TYPE_UNKNOWN; int id = -1; int id_slot = -1; - server_task_result() = default; - server_task_result(result_type type) : type(type) {} + virtual bool is_error() { + // only used by server_task_result_error + return false; + } + virtual bool is_stop() { + // only used by server_task_result_cmpl_partial + return false; + } + virtual int get_index() { + return -1; + } + virtual json to_json() = 0; + virtual json to_json_oai_compat() { + // used by server_task_result_cmpl_final and server_task_result_cmpl_partial + return json(); + } virtual ~server_task_result() = default; }; @@ -233,12 +236,10 @@ struct completion_token_output { }; struct server_task_result_cmpl_final : server_task_result { - server_task_result_cmpl_final() : server_task_result(RESULT_TYPE_CMPL_FINAL) {} int index = 0; std::string content; bool stream; result_timings timings; - std::string model_alias; std::string prompt; bool truncated; @@ -253,14 +254,23 @@ struct server_task_result_cmpl_final : server_task_result { slot_params generation_params; - json to_json() { + // OAI-compat fields + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { // non-OAI-compat JSON return json { {"index", index}, {"content", content}, {"id_slot", id_slot}, {"stop", true}, - {"model", model_alias}, + {"model", oaicompat_model}, {"tokens_predicted", n_decoded}, {"tokens_evaluated", n_prompt_tokens}, {"generation_settings", generation_params.to_json()}, @@ -274,15 +284,55 @@ struct server_task_result_cmpl_final : server_task_result { }; } - static server_task_result_cmpl_final from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_cmpl_final, result_ptr); - } + virtual json to_json_oai_compat() override { + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + + json choices = json::array({json{ + {"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{ + {"content", content}, + {"role", "assistant"} + } + }}}); + + std::time_t t = std::time(0); + + json res = json { + {"choices", choices}, + {"created", t}, + {"model", oaicompat_model}, + {"object", "chat.completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json(); + } + + // TODO: fix this + // if (result.contains("completion_probabilities")) { + // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); 
+ // } + + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } - virtual ~server_task_result_cmpl_final() = default; + return res; + } }; struct server_task_result_cmpl_partial : server_task_result { - server_task_result_cmpl_partial() : server_task_result(RESULT_TYPE_CMPL_PARTIAL) {} int index = 0; std::string content; @@ -295,7 +345,20 @@ struct server_task_result_cmpl_partial : server_task_result { std::vector probs_output; result_timings timings; - json to_json() { + // OAI-compat fields + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return stop != STOP_TYPE_NONE; + } + + virtual json to_json() override { bool is_stop = stop != STOP_TYPE_NONE; // non-OAI-compat JSON json res = json { @@ -317,67 +380,186 @@ struct server_task_result_cmpl_partial : server_task_result { return res; } - static server_task_result_cmpl_partial from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_cmpl_partial, result_ptr); - } + virtual json to_json_oai_compat() override { + bool first = n_decoded == 0; + + std::string finish_reason; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } else if (stop == STOP_TYPE_LIMIT) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; - virtual ~server_task_result_cmpl_partial() = default; + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. 
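// Illustrative only: for a streamed OAI-compatible request the handler ends up
// emitting an SSE sequence shaped roughly like this (field values invented,
// unrelated fields elided):
//
//     data: {"choices":[{"finish_reason":null,"index":0,"delta":{"role":"assistant"}}],"object":"chat.completion.chunk",...}
//     data: {"choices":[{"finish_reason":null,"index":0,"delta":{"content":"Hello"}}],"object":"chat.completion.chunk",...}
//     data: {"choices":[{"finish_reason":"stop","index":0,"delta":{}}],"usage":{...},"object":"chat.completion.chunk",...}
//     data: [DONE]
//
// The very first token may be split into two chunks (a role delta followed by a
// content delta), which is why this method can return an array with more than
// one chunk.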
+ if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"} + }; + + if (timings.prompt_n >= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + if (!finish_reason.empty()) { + ret.push_back({"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}); + } + + return std::vector({ret}); + } }; struct server_task_result_embd : server_task_result { - server_task_result_embd() : server_task_result(RESULT_TYPE_EMBD) {} - result_type type = RESULT_TYPE_EMBD; int index = 0; std::vector embedding; - json to_json() { + virtual int get_index() override { + return index; + } + + virtual json to_json() override { return json { {"index", index}, {"embedding", embedding}, }; } - - static server_task_result_embd from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_embd, result_ptr); - } - - virtual ~server_task_result_embd() = default; }; struct server_task_result_rerank : server_task_result { - server_task_result_rerank() : server_task_result(RESULT_TYPE_RERANK) {} int index = 0; float score = -1e6; - json to_json() { + virtual int get_index() override { + return index; + } + + virtual json to_json() override { return json { {"index", index}, {"score", score}, }; } +}; - static server_task_result_rerank from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_rerank, result_ptr); +// this function maybe used outside of server_task_result_error +static json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; } - - virtual ~server_task_result_rerank() = default; -}; + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; +} struct server_task_result_error : server_task_result { - server_task_result_error() : server_task_result(RESULT_TYPE_ERROR) {} int index = 0; error_type err_type = ERROR_TYPE_SERVER; std::string err_msg; - static server_task_result_error from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_error, result_ptr); + virtual bool is_error() override { + return true; } - virtual ~server_task_result_error() = default; + virtual json to_json() override { + return format_error_response(err_msg, err_type); + } }; struct server_task_result_metrics : server_task_result { - server_task_result_metrics() : server_task_result(RESULT_TYPE_METRICS) {} int n_idle_slots; int n_processing_slots; int n_tasks_deferred; @@ -404,7 +586,7 @@ struct server_task_result_metrics : server_task_result { // 
TODO: get rid of this json object and use to_json() instead json slots_data = json::array(); - json to_json() { + virtual json to_json() override { return json { { "idle", n_idle_slots }, { "processing", n_processing_slots }, @@ -430,16 +612,9 @@ struct server_task_result_metrics : server_task_result { { "slots", slots_data }, }; } - - static server_task_result_metrics from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_metrics, result_ptr); - } - - virtual ~server_task_result_metrics() = default; }; struct server_task_result_slot_save_load : server_task_result { - server_task_result_slot_save_load() : server_task_result(RESULT_TYPE_SLOT_SAVE_LOAD) {} std::string filename; bool is_save; // true = save, false = load @@ -447,7 +622,7 @@ struct server_task_result_slot_save_load : server_task_result { size_t n_bytes; double t_ms; - json to_json() { + virtual json to_json() override { if (is_save) { return json { { "id_slot", id_slot }, @@ -470,39 +645,21 @@ struct server_task_result_slot_save_load : server_task_result { }; } } - - static server_task_result_slot_save_load from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_slot_save_load, result_ptr); - } - - virtual ~server_task_result_slot_save_load() = default; }; struct server_task_result_slot_erase : server_task_result { - server_task_result_slot_erase() : server_task_result(RESULT_TYPE_SLOT_ERASE) {} size_t n_erased; - json to_json() { + virtual json to_json() override { return json { { "id_slot", id_slot }, { "n_erased", n_erased }, }; } - - static server_task_result_slot_erase from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_slot_erase, result_ptr); - } - - virtual ~server_task_result_slot_erase() = default; }; struct server_task_result_apply_lora : server_task_result { - server_task_result_apply_lora() : server_task_result(RESULT_TYPE_APPLY_LORA) {} - json to_json() { + virtual json to_json() override { return json {{ "success", true }}; } - - static server_task_result_apply_lora from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_apply_lora, result_ptr); - } }; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 98a777192027c..8a8d9f8f7e894 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -583,155 +583,6 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat( - const json & request, - server_task_result_cmpl_final & result, - const std::string & completion_id, - bool streaming = false, - bool verbose = false) { - std::string finish_reason = "length"; - if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", result.content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = json { - {"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", json { - {"completion_tokens", result.n_decoded}, - {"prompt_tokens", result.n_prompt_tokens}, - {"total_tokens", result.n_decoded + result.n_prompt_tokens} - }}, - {"id", completion_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = result.to_json(); - } - - // TODO: fix this - // if (result.contains("completion_probabilities")) { - // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - // } - - if (result.timings.prompt_n >= 0) { - res.push_back({"timings", result.timings.to_json()}); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat( - std::string modelname, - server_task_result_cmpl_partial & result, - const std::string & completion_id) { - bool first = result.n_decoded == 0; - std::string content = result.content; - - std::string finish_reason; - if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } else if (result.stop == STOP_TYPE_LIMIT) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"} - }; - - if (result.timings.prompt_n >= 0) { - ret.push_back({"timings", result.timings.to_json()}); - } - - if (!finish_reason.empty()) { - ret.push_back({"usage", json { - {"completion_tokens", result.n_decoded}, - {"prompt_tokens", result.n_prompt_tokens}, - {"total_tokens", result.n_decoded + result.n_prompt_tokens} - }}); - } - - return std::vector({ret}); -} - static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { json data = json::array(); int i = 0; @@ -823,43 +674,3 @@ static json format_detokenized_response(const std::string & content) { {"content", content} }; } - -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = "permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} From 1cf769be673932e33791372ca1156503c51759b8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 16:04:36 +0100 Subject: [PATCH 11/19] remove server.hpp --- examples/server/server.cpp | 679 ++++++++++++++++++++++++++++++++++++- examples/server/server.hpp | 665 ------------------------------------ examples/server/utils.hpp | 26 -- 3 files changed, 675 insertions(+), 695 deletions(-) delete mode 100644 examples/server/server.hpp diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c8cb48b15c6ba..44e6ead3ae897 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,5 +1,4 @@ #include "utils.hpp" -#include "server.hpp" #include "arg.h" #include "common.h" @@ -33,9 +32,682 @@ using json = nlohmann::ordered_json; +enum stop_type { + STOP_TYPE_NONE, + STOP_TYPE_EOS, + STOP_TYPE_WORD, + STOP_TYPE_LIMIT, +}; + +// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 +enum slot_state { + SLOT_STATE_IDLE, + SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future + SLOT_STATE_PROCESSING_PROMPT, + SLOT_STATE_DONE_PROMPT, + SLOT_STATE_GENERATING, +}; + +enum server_state { + SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet + SERVER_STATE_READY, // Server is ready and model is loaded +}; + +enum server_task_type { + SERVER_TASK_TYPE_INFERENCE, + SERVER_TASK_TYPE_CANCEL, + SERVER_TASK_TYPE_NEXT_RESPONSE, + SERVER_TASK_TYPE_METRICS, + SERVER_TASK_TYPE_SLOT_SAVE, + SERVER_TASK_TYPE_SLOT_RESTORE, + SERVER_TASK_TYPE_SLOT_ERASE, + SERVER_TASK_TYPE_SET_LORA, +}; + +enum server_task_inf_type { 
+ SERVER_TASK_INF_TYPE_COMPLETION, + SERVER_TASK_INF_TYPE_EMBEDDING, + SERVER_TASK_INF_TYPE_RERANK, + SERVER_TASK_INF_TYPE_INFILL, +}; + +// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 +enum error_type { + ERROR_TYPE_INVALID_REQUEST, + ERROR_TYPE_AUTHENTICATION, + ERROR_TYPE_SERVER, + ERROR_TYPE_NOT_FOUND, + ERROR_TYPE_PERMISSION, + ERROR_TYPE_UNAVAILABLE, // custom error + ERROR_TYPE_NOT_SUPPORTED, // custom error +}; + +struct server_task { + int id = -1; // to be filled by server_queue + int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL + + llama_tokens prompt_tokens; + server_task_type type; + + // TODO @ngxson : we should get rid of json type here + json data; + + server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; + + // utility function + static std::unordered_set get_list_id(const std::vector & tasks) { + std::unordered_set ids(tasks.size()); + for (size_t i = 0; i < tasks.size(); i++) { + ids.insert(tasks[i].id); + } + return ids; + } +}; + +struct slot_params { + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + + std::vector antiprompt; + bool timings_per_token = false; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; + + // params only used in to_json() + int32_t n_ctx; + uint32_t seed_cur; + bool can_speculative; + + // OAI-compat fields + bool oaicompat = false; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + json to_json() { + std::vector samplers; + samplers.reserve(sampling.samplers.size()); + for (const auto & sampler : sampling.samplers) { + samplers.emplace_back(common_sampler_type_to_str(sampler)); + } + + return json { + {"n_ctx", n_ctx}, + {"n_predict", n_predict}, // Server configured n_predict + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", sampling.min_p}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", sampling.dry_sequence_breakers}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"penalize_nl", sampling.penalize_nl}, + {"stop", antiprompt}, + {"max_tokens", n_predict}, // User configured n_predict + {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + //{"logit_bias", sampling.logit_bias}, + {"n_probs", 
sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"grammar", sampling.grammar}, + {"samplers", samplers}, + {"speculative", can_speculative}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + }; + } +}; + +struct result_timings { + int32_t prompt_n = -1; + double prompt_ms; + double prompt_per_token_ms; + double prompt_per_second; + + int32_t predicted_n = -1; + double predicted_ms; + double predicted_per_token_ms; + double predicted_per_second; + + json to_json() { + return { + {"prompt_n", prompt_n}, + {"prompt_ms", prompt_ms}, + {"prompt_per_token_ms", prompt_per_token_ms}, + {"prompt_per_second", prompt_per_second}, + + {"predicted_n", predicted_n}, + {"predicted_ms", predicted_ms}, + {"predicted_per_token_ms", predicted_per_token_ms}, + {"predicted_per_second", predicted_per_second}, + }; + } +}; + +struct server_task_result { + int id = -1; + int id_slot = -1; + virtual bool is_error() { + // only used by server_task_result_error + return false; + } + virtual bool is_stop() { + // only used by server_task_result_cmpl_partial + return false; + } + virtual int get_index() { + return -1; + } + virtual json to_json() = 0; + virtual json to_json_oai_compat() { + // used by server_task_result_cmpl_final and server_task_result_cmpl_partial + return json(); + } + virtual ~server_task_result() = default; +}; + // using shared_ptr for polymorphism of server_task_result using task_result_ptr = std::unique_ptr; +inline std::string stop_type_to_str(stop_type type) { + switch (type) { + case STOP_TYPE_EOS: return "eos"; + case STOP_TYPE_WORD: return "word"; + case STOP_TYPE_LIMIT: return "limit"; + default: return "none"; + } +} + +struct completion_token_output { + llama_token tok; + std::string text_to_send; + struct token_prob { + llama_token tok; + float prob; + }; + std::vector probs; + + json to_json(const llama_context * ctx) const { + json probs_for_token = json::array(); + for (const auto & p : probs) { + const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json { + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + return probs_for_token; + } + + static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { + json out = json::array(); + for (const auto & prob : probs) { + const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json { + {"content", tok_str}, + {"probs", prob.to_json(ctx)}, + }); + } + return out; + } +}; + +struct server_task_result_cmpl_final : server_task_result { + int index = 0; + std::string content; + bool stream; + result_timings timings; + std::string prompt; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + int32_t n_tokens_cached; + int32_t has_new_line; + std::string stopping_word; + stop_type stop = STOP_TYPE_NONE; + + std::vector probs_output; + + slot_params generation_params; + + // OAI-compat fields + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + // non-OAI-compat JSON + return json { + {"index", index}, + {"content", content}, + {"id_slot", id_slot}, + {"stop", true}, + {"model", oaicompat_model}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + {"generation_settings", generation_params.to_json()}, + {"prompt", prompt}, + 
{"has_new_line", has_new_line}, + {"truncated", truncated}, + {"stop_type", stop_type_to_str(stop)}, + {"stopping_word", stopping_word}, + {"tokens_cached", n_tokens_cached}, + {"timings", timings.to_json()}, + }; + } + + virtual json to_json_oai_compat() override { + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + + json choices = json::array({json{ + {"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{ + {"content", content}, + {"role", "assistant"} + } + }}}); + + std::time_t t = std::time(0); + + json res = json { + {"choices", choices}, + {"created", t}, + {"model", oaicompat_model}, + {"object", "chat.completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json(); + } + + // TODO: fix this + // if (result.contains("completion_probabilities")) { + // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + // } + + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; + } +}; + +struct server_task_result_cmpl_partial : server_task_result { + int index = 0; + std::string content; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + + stop_type stop = STOP_TYPE_NONE; + + std::vector probs_output; + result_timings timings; + + // OAI-compat fields + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return stop != STOP_TYPE_NONE; + } + + virtual json to_json() override { + bool is_stop = stop != STOP_TYPE_NONE; + // non-OAI-compat JSON + json res = json { + {"index", index}, + {"content", content}, + {"stop_type", stop_type_to_str(stop)}, + {"stop", is_stop}, + {"id_slot", id_slot}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + }; + // populate the timings object when needed (usually for the last response or with timings_per_token enabled) + if (timings.prompt_n > 0) { + res.push_back({"timings", timings.to_json()}); + } + if (is_stop) { + res.push_back({"truncated", truncated}); + } + return res; + } + + virtual json to_json_oai_compat() override { + bool first = n_decoded == 0; + + std::string finish_reason; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } else if (stop == STOP_TYPE_LIMIT) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", 
content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. + if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"} + }; + + if (timings.prompt_n >= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + if (!finish_reason.empty()) { + ret.push_back({"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}); + } + + return std::vector({ret}); + } +}; + +struct server_task_result_embd : server_task_result { + int index = 0; + std::vector embedding; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return json { + {"index", index}, + {"embedding", embedding}, + }; + } +}; + +struct server_task_result_rerank : server_task_result { + int index = 0; + float score = -1e6; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return json { + {"index", index}, + {"score", score}, + }; + } +}; + +// this function maybe used outside of server_task_result_error +static json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; + } + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; +} + +struct server_task_result_error : server_task_result { + int index = 0; + error_type err_type = ERROR_TYPE_SERVER; + std::string err_msg; + + virtual bool is_error() override { + return true; + } + + virtual json to_json() override { + return format_error_response(err_msg, err_type); + } +}; + +struct server_task_result_metrics : server_task_result { + int n_idle_slots; + int n_processing_slots; + int n_tasks_deferred; + int64_t t_start; + + int32_t kv_cache_tokens_count; + int32_t kv_cache_used_cells; + + // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; + uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + uint64_t n_decode_total = 0; + uint64_t n_busy_slots_total = 0; + + // TODO: get rid of this json object and 
use to_json() instead + json slots_data = json::array(); + + virtual json to_json() override { + return json { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", n_tasks_deferred }, + { "t_start", t_start }, + + { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, + { "t_tokens_generation_total", t_tokens_generation_total }, + { "n_tokens_predicted_total", n_tokens_predicted_total }, + { "t_prompt_processing_total", t_prompt_processing_total }, + + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, + { "t_prompt_processing", t_prompt_processing }, + { "n_tokens_predicted", n_tokens_predicted }, + { "t_tokens_generation", t_tokens_generation }, + + { "n_decode_total", n_decode_total }, + { "n_busy_slots_total", n_busy_slots_total }, + + { "kv_cache_tokens_count", kv_cache_tokens_count }, + { "kv_cache_used_cells", kv_cache_used_cells }, + + { "slots", slots_data }, + }; + } +}; + +struct server_task_result_slot_save_load : server_task_result { + std::string filename; + bool is_save; // true = save, false = load + + size_t n_tokens; + size_t n_bytes; + double t_ms; + + virtual json to_json() override { + if (is_save) { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", n_tokens }, + { "n_written", n_bytes }, + { "timings", { + { "save_ms", t_ms } + }}, + }; + } else { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", n_tokens }, + { "n_read", n_bytes }, + { "timings", { + { "restore_ms", t_ms } + }}, + }; + } + } +}; + +struct server_task_result_slot_erase : server_task_result { + size_t n_erased; + + virtual json to_json() override { + return json { + { "id_slot", id_slot }, + { "n_erased", n_erased }, + }; + } +}; + +struct server_task_result_apply_lora : server_task_result { + virtual json to_json() override { + return json {{ "success", true }}; + } +}; + struct server_slot { int id; int id_task = -1; @@ -786,8 +1458,9 @@ struct server_context { const auto & data = task.data; if (data.count("__oaicompat") != 0) { + std::string model_name = params_base.model_alias.empty() ? 
DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; slot.params.oaicompat = true; - slot.params.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot.params.oaicompat_model = json_value(data, "model", model_name); slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); } else { slot.params.oaicompat = false; @@ -2835,8 +3508,6 @@ int main(int argc, char ** argv) { json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); }; diff --git a/examples/server/server.hpp b/examples/server/server.hpp deleted file mode 100644 index 201f154560e53..0000000000000 --- a/examples/server/server.hpp +++ /dev/null @@ -1,665 +0,0 @@ -#pragma once - -#include "common.h" -#include "llama.h" -#include "sampling.h" -#include "speculative.h" - -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" - -#include -#include -#include - -using json = nlohmann::ordered_json; - -enum stop_type { - STOP_TYPE_NONE, - STOP_TYPE_EOS, - STOP_TYPE_WORD, - STOP_TYPE_LIMIT, -}; - -// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 -enum slot_state { - SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future - SLOT_STATE_PROCESSING_PROMPT, - SLOT_STATE_DONE_PROMPT, - SLOT_STATE_GENERATING, -}; - -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded -}; - -enum server_task_type { - SERVER_TASK_TYPE_INFERENCE, - SERVER_TASK_TYPE_CANCEL, - SERVER_TASK_TYPE_NEXT_RESPONSE, - SERVER_TASK_TYPE_METRICS, - SERVER_TASK_TYPE_SLOT_SAVE, - SERVER_TASK_TYPE_SLOT_RESTORE, - SERVER_TASK_TYPE_SLOT_ERASE, - SERVER_TASK_TYPE_SET_LORA, -}; - -enum server_task_inf_type { - SERVER_TASK_INF_TYPE_COMPLETION, - SERVER_TASK_INF_TYPE_EMBEDDING, - SERVER_TASK_INF_TYPE_RERANK, - SERVER_TASK_INF_TYPE_INFILL, -}; - -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error -}; - -struct server_task { - int id = -1; // to be filled by server_queue - int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL - - llama_tokens prompt_tokens; - server_task_type type; - - // TODO @ngxson : we should get rid of json type here - json data; - - server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt - - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // mininum 
line indentation for the generated text in number of whitespace characters - - int64_t t_max_prompt_ms = -1; // TODO: implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit - - std::vector antiprompt; - bool timings_per_token = false; - - struct common_params_sampling sampling; - struct common_params_speculative speculative; - - // params only used in to_json() - int32_t n_ctx; - uint32_t seed_cur; - bool can_speculative; - - // OAI-compat fields - bool oaicompat = false; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - bool verbose = false; - - json to_json() { - std::vector samplers; - samplers.reserve(sampling.samplers.size()); - for (const auto & sampler : sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - return json { - {"n_ctx", n_ctx}, - {"n_predict", n_predict}, // Server configured n_predict - {"temperature", sampling.temp}, - {"dynatemp_range", sampling.dynatemp_range}, - {"dynatemp_exponent", sampling.dynatemp_exponent}, - {"top_k", sampling.top_k}, - {"top_p", sampling.top_p}, - {"min_p", sampling.min_p}, - {"xtc_probability", sampling.xtc_probability}, - {"xtc_threshold", sampling.xtc_threshold}, - {"typical_p", sampling.typ_p}, - {"repeat_last_n", sampling.penalty_last_n}, - {"repeat_penalty", sampling.penalty_repeat}, - {"presence_penalty", sampling.penalty_present}, - {"frequency_penalty", sampling.penalty_freq}, - {"dry_multiplier", sampling.dry_multiplier}, - {"dry_base", sampling.dry_base}, - {"dry_allowed_length", sampling.dry_allowed_length}, - {"dry_penalty_last_n", sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", sampling.dry_sequence_breakers}, - {"mirostat", sampling.mirostat}, - {"mirostat_tau", sampling.mirostat_tau}, - {"mirostat_eta", sampling.mirostat_eta}, - {"penalize_nl", sampling.penalize_nl}, - {"stop", antiprompt}, - {"max_tokens", n_predict}, // User configured n_predict - {"n_keep", n_keep}, - {"n_discard", n_discard}, - {"ignore_eos", sampling.ignore_eos}, - {"stream", stream}, - //{"logit_bias", sampling.logit_bias}, - {"n_probs", sampling.n_probs}, - {"min_keep", sampling.min_keep}, - {"grammar", sampling.grammar}, - {"samplers", samplers}, - {"speculative", can_speculative}, - {"speculative.n_max", speculative.n_max}, - {"speculative.n_min", speculative.n_min}, - {"speculative.p_min", speculative.p_min}, - {"timings_per_token", timings_per_token}, - }; - } -}; - -struct result_timings { - int32_t prompt_n = -1; - double prompt_ms; - double prompt_per_token_ms; - double prompt_per_second; - - int32_t predicted_n = -1; - double predicted_ms; - double predicted_per_token_ms; - double predicted_per_second; - - json to_json() { - return { - {"prompt_n", prompt_n}, - {"prompt_ms", prompt_ms}, - {"prompt_per_token_ms", prompt_per_token_ms}, - {"prompt_per_second", prompt_per_second}, - - {"predicted_n", predicted_n}, - {"predicted_ms", predicted_ms}, - {"predicted_per_token_ms", predicted_per_token_ms}, - {"predicted_per_second", predicted_per_second}, - }; - } -}; - -struct server_task_result { - int id = -1; - int id_slot = -1; - virtual bool is_error() { - // only used by server_task_result_error - return false; - } - virtual bool is_stop() { - // only used by server_task_result_cmpl_partial - return false; - } - virtual int get_index() { - return -1; - } - virtual json to_json() = 0; - virtual json to_json_oai_compat() { - // used by server_task_result_cmpl_final and server_task_result_cmpl_partial - return json(); - } - virtual 
~server_task_result() = default; -}; - -inline std::string stop_type_to_str(stop_type type) { - switch (type) { - case STOP_TYPE_EOS: return "eos"; - case STOP_TYPE_WORD: return "word"; - case STOP_TYPE_LIMIT: return "limit"; - default: return "none"; - } -} - -struct completion_token_output { - llama_token tok; - std::string text_to_send; - struct token_prob { - llama_token tok; - float prob; - }; - std::vector probs; -}; - -struct server_task_result_cmpl_final : server_task_result { - int index = 0; - std::string content; - bool stream; - result_timings timings; - std::string prompt; - - bool truncated; - int32_t n_decoded; - int32_t n_prompt_tokens; - int32_t n_tokens_cached; - int32_t has_new_line; - std::string stopping_word; - stop_type stop = STOP_TYPE_NONE; - - std::vector probs_output; - - slot_params generation_params; - - // OAI-compat fields - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - bool verbose = false; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - // non-OAI-compat JSON - return json { - {"index", index}, - {"content", content}, - {"id_slot", id_slot}, - {"stop", true}, - {"model", oaicompat_model}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - {"generation_settings", generation_params.to_json()}, - {"prompt", prompt}, - {"has_new_line", has_new_line}, - {"truncated", truncated}, - {"stop_type", stop_type_to_str(stop)}, - {"stopping_word", stopping_word}, - {"tokens_cached", n_tokens_cached}, - {"timings", timings.to_json()}, - }; - } - - virtual json to_json_oai_compat() override { - std::string finish_reason = "length"; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } - - json choices = json::array({json{ - {"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{ - {"content", content}, - {"role", "assistant"} - } - }}}); - - std::time_t t = std::time(0); - - json res = json { - {"choices", choices}, - {"created", t}, - {"model", oaicompat_model}, - {"object", "chat.completion"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens} - }}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json(); - } - - // TODO: fix this - // if (result.contains("completion_probabilities")) { - // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - // } - - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } -}; - -struct server_task_result_cmpl_partial : server_task_result { - int index = 0; - std::string content; - - bool truncated; - int32_t n_decoded; - int32_t n_prompt_tokens; - - stop_type stop = STOP_TYPE_NONE; - - std::vector probs_output; - result_timings timings; - - // OAI-compat fields - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - bool verbose = false; - - virtual int get_index() override { - return index; - } - - virtual bool is_stop() override { - return stop != STOP_TYPE_NONE; - } - - virtual json to_json() override { - bool is_stop = stop != STOP_TYPE_NONE; - // non-OAI-compat JSON - json res = json { - {"index", index}, - {"content", content}, - {"stop_type", stop_type_to_str(stop)}, - {"stop", is_stop}, - {"id_slot", id_slot}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - }; - // populate the timings object when needed 
(usually for the last response or with timings_per_token enabled) - if (timings.prompt_n > 0) { - res.push_back({"timings", timings.to_json()}); - } - if (is_stop) { - res.push_back({"truncated", truncated}); - } - return res; - } - - virtual json to_json_oai_compat() override { - bool first = n_decoded == 0; - - std::string finish_reason; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } else if (stop == STOP_TYPE_LIMIT) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. - if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"} - }; - - if (timings.prompt_n >= 0) { - ret.push_back({"timings", timings.to_json()}); - } - - if (!finish_reason.empty()) { - ret.push_back({"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens}, - }}); - } - - return std::vector({ret}); - } -}; - -struct server_task_result_embd : server_task_result { - int index = 0; - std::vector embedding; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return json { - {"index", index}, - {"embedding", embedding}, - }; - } -}; - -struct server_task_result_rerank : server_task_result { - int index = 0; - float score = -1e6; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return json { - {"index", index}, - {"score", score}, - }; - } -}; - -// this function maybe used outside of server_task_result_error -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = 
"permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} - -struct server_task_result_error : server_task_result { - int index = 0; - error_type err_type = ERROR_TYPE_SERVER; - std::string err_msg; - - virtual bool is_error() override { - return true; - } - - virtual json to_json() override { - return format_error_response(err_msg, err_type); - } -}; - -struct server_task_result_metrics : server_task_result { - int n_idle_slots; - int n_processing_slots; - int n_tasks_deferred; - int64_t t_start; - - int32_t kv_cache_tokens_count; - int32_t kv_cache_used_cells; - - // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t t_prompt_processing_total = 0; - uint64_t n_tokens_predicted_total = 0; - uint64_t t_tokens_generation_total = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - uint64_t n_decode_total = 0; - uint64_t n_busy_slots_total = 0; - - // TODO: get rid of this json object and use to_json() instead - json slots_data = json::array(); - - virtual json to_json() override { - return json { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", n_tasks_deferred }, - { "t_start", t_start }, - - { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, - { "t_tokens_generation_total", t_tokens_generation_total }, - { "n_tokens_predicted_total", n_tokens_predicted_total }, - { "t_prompt_processing_total", t_prompt_processing_total }, - - { "n_prompt_tokens_processed", n_prompt_tokens_processed }, - { "t_prompt_processing", t_prompt_processing }, - { "n_tokens_predicted", n_tokens_predicted }, - { "t_tokens_generation", t_tokens_generation }, - - { "n_decode_total", n_decode_total }, - { "n_busy_slots_total", n_busy_slots_total }, - - { "kv_cache_tokens_count", kv_cache_tokens_count }, - { "kv_cache_used_cells", kv_cache_used_cells }, - - { "slots", slots_data }, - }; - } -}; - -struct server_task_result_slot_save_load : server_task_result { - std::string filename; - bool is_save; // true = save, false = load - - size_t n_tokens; - size_t n_bytes; - double t_ms; - - virtual json to_json() override { - if (is_save) { - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", n_tokens }, - { "n_written", n_bytes }, - { "timings", { - { "save_ms", t_ms } - }}, - }; - } else { - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", n_tokens }, - { "n_read", n_bytes }, - { "timings", { - { "restore_ms", t_ms } - }}, - }; - } - } -}; - -struct server_task_result_slot_erase : server_task_result { - size_t n_erased; - - virtual json to_json() override { - return json { - { "id_slot", id_slot }, - { "n_erased", n_erased }, - }; - } -}; - -struct server_task_result_apply_lora : server_task_result { - virtual json to_json() override { - return json {{ "success", true }}; - } -}; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8a8d9f8f7e894..70bcaf17c272c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -3,7 +3,6 @@ #include "common.h" #include "log.h" #include "llama.h" -#include "server.hpp" #ifndef 
NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -476,31 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } -// convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { - json out = json::array(); - - for (const auto & prob : probs) { - json probs_for_token = json::array(); - - for (const auto & p : prob.probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json { - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - - const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json { - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - - return out; -} - static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) { const std::string str = std::string(event) + ": " + From 2e560f90ff06e9a9b5d12b9ddf3498ab9e1e9b44 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 16:13:52 +0100 Subject: [PATCH 12/19] clarify server_sent_event RFC specs --- examples/server/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 70bcaf17c272c..a96116ac36caa 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -479,7 +479,7 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons const std::string str = std::string(event) + ": " + data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain) + "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). 
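    // Illustrative sketch only, not part of the patch: the same framing as `str` above,
    // expressed as a local lambda so the wire format is explicit. It assumes this file's
    // `json` alias; the name `sse_frame` is hypothetical and exists only for this example.
    auto sse_frame = [](const std::string & ev, const json & d) {
        // dump(-1, ...) keeps the payload on a single line, so the trailing "\n\n" is the
        // only blank line the client sees, i.e. the SSE message terminator noted above
        return ev + ": " + d.dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n";
    };
    // e.g. sse_frame("data", json{{"content", "Hi"}}) yields:
    //   data: {"content":"Hi"}
    //   <blank line>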
LOG_DBG("data stream, to_send: %s", str.c_str()); From a43e1dc66c911804483dfb67b675ff99034229d8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 22:35:07 +0100 Subject: [PATCH 13/19] apply review comments --- examples/server/server.cpp | 46 +++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 44e6ead3ae897..b58f1018628b2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -234,7 +234,7 @@ struct server_task_result { }; // using shared_ptr for polymorphism of server_task_result -using task_result_ptr = std::unique_ptr; +using server_task_result_ptr = std::unique_ptr; inline std::string stop_type_to_str(stop_type type) { switch (type) { @@ -1097,7 +1097,7 @@ struct server_response { std::unordered_set waiting_task_ids; // the main result queue (using ptr for polymorphism) - std::vector queue_results; + std::vector queue_results; std::mutex mutex_results; std::condition_variable condition_results; @@ -1137,7 +1137,7 @@ struct server_response { } // This function blocks the thread until there is a response for one of the id_tasks - task_result_ptr recv(const std::unordered_set & id_tasks) { + server_task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ @@ -1146,7 +1146,7 @@ struct server_response { for (int i = 0; i < (int) queue_results.size(); i++) { if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - task_result_ptr res = std::move(queue_results[i]); + server_task_result_ptr res = std::move(queue_results[i]); queue_results.erase(queue_results.begin() + i); return res; } @@ -1157,13 +1157,13 @@ struct server_response { } // single-task version of recv() - task_result_ptr recv(int id_task) { + server_task_result_ptr recv(int id_task) { std::unordered_set id_tasks = {id_task}; return recv(id_tasks); } // Send a new result to a waiting id_task - void send(task_result_ptr && result) { + void send(server_task_result_ptr && result) { SRV_DBG("sending result for task id = %d\n", result->id); std::unique_lock lock(mutex_results); @@ -2078,11 +2078,11 @@ struct server_context { // receive the results from task(s) created by create_tasks_inference void receive_multi_results( const std::unordered_set & id_tasks, - const std::function&)> & result_handler, + const std::function&)> & result_handler, const std::function & error_handler) { - std::vector results(id_tasks.size()); + std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { - task_result_ptr result = queue_results.recv(id_tasks); + server_task_result_ptr result = queue_results.recv(id_tasks); if (result->is_error()) { error_handler(result->to_json()); @@ -2104,12 +2104,12 @@ struct server_context { // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( - const std::unordered_set & id_tasks, const - std::function & result_handler, const - std::function & error_handler) { + const std::unordered_set & id_tasks, + const std::function & result_handler, + const std::function & error_handler) { size_t n_finished = 0; while (true) { - task_result_ptr result = queue_results.recv(id_tasks); + server_task_result_ptr result = queue_results.recv(id_tasks); if (result->is_error()) { error_handler(result->to_json()); @@ -3108,7 +3108,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); 
// high-priority task // get the result - task_result_ptr result = ctx_server.queue_results.recv(task.id); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { @@ -3148,7 +3148,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - task_result_ptr result = ctx_server.queue_results.recv(task.id); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { @@ -3257,7 +3257,7 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (result->is_error()) { @@ -3288,7 +3288,7 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (result->is_error()) { @@ -3310,7 +3310,7 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (result->is_error()) { @@ -3395,7 +3395,7 @@ int main(int argc, char ** argv) { const auto task_ids = server_task::get_list_id(tasks); if (!stream) { - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { if (results.size() == 1) { // single result res_ok(res, oai_compat ? results[0]->to_json_oai_compat() : results[0]->to_json()); @@ -3414,7 +3414,7 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](task_result_ptr & result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { json res_json = oai_compat ? 
result->to_json_oai_compat() : result->to_json(); if (res_json.is_array()) { for (const auto & res : res_json) { @@ -3609,7 +3609,7 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { for (auto & res : results) { GGML_ASSERT(dynamic_cast(res.get()) != nullptr); responses.push_back(res->to_json()); @@ -3688,7 +3688,7 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { for (auto & res : results) { GGML_ASSERT(dynamic_cast(res.get()) != nullptr); responses.push_back(res->to_json()); @@ -3747,7 +3747,7 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (result->is_error()) { From fb4b9be6025a4c934d19f47307adc919fbceee4e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:13:06 +0100 Subject: [PATCH 14/19] fix model_alias and completion_probabilities --- common/common.h | 2 +- examples/server/server.cpp | 37 +++++++++--------- examples/server/tests/README.md | 6 +++ .../server/tests/unit/test_chat_completion.py | 20 +++++----- examples/server/tests/unit/test_completion.py | 39 +++++++++++++++++++ 5 files changed, 73 insertions(+), 31 deletions(-) diff --git a/common/common.h b/common/common.h index 0373fd3ead49e..95d20401d2a9a 100644 --- a/common/common.h +++ b/common/common.h @@ -215,7 +215,7 @@ struct common_params { struct common_params_speculative speculative; std::string model = ""; // model path // NOLINT - std::string model_alias = "unknown"; // model alias // NOLINT + std::string model_alias = ""; // model alias // NOLINT std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string hf_repo = ""; // HF repo // NOLINT diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b58f1018628b2..95d4bfd37f5c0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -250,29 +250,29 @@ struct completion_token_output { std::string text_to_send; struct token_prob { llama_token tok; + std::string tok_str; float prob; }; std::vector probs; - json to_json(const llama_context * ctx) const { + json to_json() const { json probs_for_token = json::array(); for (const auto & p : probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); probs_for_token.push_back(json { - {"tok_str", tok_str}, + {"tok_str", p.tok_str}, {"prob", p.prob}, }); } return probs_for_token; } - static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { + static json probs_vector_to_json(const std::vector & probs) { json out = json::array(); for (const auto & prob : probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + const std::string tok_str = prob.text_to_send; out.push_back(json { {"content", tok_str}, - {"probs", prob.to_json(ctx)}, + {"probs", prob.to_json()}, }); } return out; @@ -309,7 +309,7 @@ 
struct server_task_result_cmpl_final : server_task_result { virtual json to_json() override { // non-OAI-compat JSON - return json { + json res = json { {"index", index}, {"content", content}, {"id_slot", id_slot}, @@ -326,6 +326,10 @@ struct server_task_result_cmpl_final : server_task_result { {"tokens_cached", n_tokens_cached}, {"timings", timings.to_json()}, }; + if (!probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + } + return res; } virtual json to_json_oai_compat() override { @@ -362,12 +366,6 @@ struct server_task_result_cmpl_final : server_task_result { if (verbose) { res["__verbose"] = to_json(); } - - // TODO: fix this - // if (result.contains("completion_probabilities")) { - // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - // } - if (timings.prompt_n >= 0) { res.push_back({"timings", timings.to_json()}); } @@ -418,6 +416,9 @@ struct server_task_result_cmpl_partial : server_task_result { if (timings.prompt_n > 0) { res.push_back({"timings", timings.to_json()}); } + if (!probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + } if (is_stop) { res.push_back({"truncated", truncated}); } @@ -2786,9 +2787,11 @@ struct server_context { const auto * cur_p = common_sampler_get_candidates(slot.smpl); for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { + auto tok_id = cur_p->data[i].id; result.probs.push_back({ - cur_p->data[i].id, - i >= cur_p->size ? 0.0f : cur_p->data[i].p, + tok_id, + tokens_to_output_formatted_string(ctx, tok_id), + i >= cur_p->size ? 0.0f : cur_p->data[i].p, }); } @@ -2920,10 +2923,6 @@ int main(int argc, char ** argv) { // struct that contains llama context and inference server_context ctx_server; - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 2930a2e0dea0f..fa3d0a2f5ff66 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -44,4 +44,10 @@ To run with stdout/stderr display in real time (verbose output, but useful for d DEBUG=1 ./tests.sh -s -v -x ``` +Hint: You can compile and run test in single command, useful for local developement: + +```shell +cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh +``` + To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html) diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 486c1f87a0856..11bf712b6fe72 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -14,7 +14,7 @@ def create_server(): @pytest.mark.parametrize( "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) @@ -30,6 +30,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte ], }) assert res.status_code == 200 + assert res.body["model"] == model if 
model is not None else server.model_alias assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["completion_tokens"] == n_predicted choice = res.body["choices"][0] @@ -39,17 +40,17 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", + "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), + ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): +def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server + server.model_alias = None server.start() res = server.make_stream_request("POST", "/chat/completions", data={ - "model": model, "max_tokens": max_tokens, "messages": [ {"role": "system", "content": system_prompt}, @@ -60,16 +61,13 @@ def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, r content = "" for data in res: choice = data["choices"][0] + assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future if choice["finish_reason"] in ["stop", "length"]: assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["completion_tokens"] == n_predicted assert "content" not in choice["delta"] assert match_regex(re_content, content) - # FIXME: not sure why this is incorrect in stream mode - # if truncated: - # assert choice["finish_reason"] == "length" - # else: - # assert choice["finish_reason"] == "stop" + assert choice["finish_reason"] == finish_reason else: assert choice["finish_reason"] is None content += choice["delta"]["content"] diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 2fa30dd033431..1c3aa77de5bba 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -51,6 +51,24 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp content += data["content"] +def test_completion_stream_vs_non_stream(): + global server + server.start() + res_stream = server.make_stream_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "I believe the meaning of life is", + "stream": True, + }) + res_non_stream = server.make_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "I believe the meaning of life is", + }) + content_stream = "" + for data in res_stream: + content_stream += data["content"] + assert content_stream == res_non_stream.body["content"] + + @pytest.mark.parametrize("n_slots", [1, 2]) def test_consistent_result_same_seed(n_slots: int): global server @@ -221,3 +239,24 @@ def check_slots_status(): assert len(res.body["content"]) > 10 # FIXME: the result is not deterministic when using other slot than slot 0 # assert match_regex(re_content, res.body["content"]) + + +def test_n_probs(): + global server + server.start() + res = 
server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "n_probs": 10, + "temperature": 0.0, + "n_predict": 5, + }) + assert res.status_code == 200 + assert "completion_probabilities" in res.body + assert len(res.body["completion_probabilities"]) == 5 + for tok in res.body["completion_probabilities"]: + assert "probs" in tok + assert len(tok["probs"]) == 10 + for prob in tok["probs"]: + assert "prob" in prob + assert "tok_str" in prob + assert 0.0 <= prob["prob"] <= 1.0 From 4c3d2580b28e566affe917740462a242b2b283c8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:16:27 +0100 Subject: [PATCH 15/19] small clean up --- examples/server/tests/unit/test_chat_completion.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 11bf712b6fe72..f13c6c4ca4bd3 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -40,15 +40,15 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", + "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), + ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): +def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server - server.model_alias = None + server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL server.start() res = server.make_stream_request("POST", "/chat/completions", data={ "max_tokens": max_tokens, From ffc4441b1d9c03a8c5b65ee53bdc961d4dfe0de0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:29:27 +0100 Subject: [PATCH 16/19] remove virtual for to_json_oai_compat() --- examples/server/server.cpp | 46 ++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 95d4bfd37f5c0..3685df0d99767 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -128,10 +128,11 @@ struct slot_params { bool can_speculative; // OAI-compat fields - bool oaicompat = false; + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; std::string oaicompat_model; std::string oaicompat_cmpl_id; - bool verbose = false; json to_json() { std::vector samplers; @@ -226,10 +227,6 @@ struct server_task_result { return -1; } virtual json to_json() = 0; - virtual json to_json_oai_compat() { - // used by server_task_result_cmpl_final and server_task_result_cmpl_partial - return json(); - } virtual ~server_task_result() = default; }; @@ -299,16 +296,21 @@ struct server_task_result_cmpl_final : server_task_result { slot_params generation_params; // OAI-compat fields + bool verbose = false; + bool oaicompat = 
false; + bool oaicompat_chat = true; // TODO: support oaicompat for non-chat std::string oaicompat_model; std::string oaicompat_cmpl_id; - bool verbose = false; virtual int get_index() override { return index; } virtual json to_json() override { - // non-OAI-compat JSON + if (oaicompat) { + return to_json_oai_compat(); + } + // otherwise, non-OAI-compat JSON json res = json { {"index", index}, {"content", content}, @@ -332,7 +334,7 @@ struct server_task_result_cmpl_final : server_task_result { return res; } - virtual json to_json_oai_compat() override { + json to_json_oai_compat() { std::string finish_reason = "length"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { finish_reason = "stop"; @@ -388,9 +390,11 @@ struct server_task_result_cmpl_partial : server_task_result { result_timings timings; // OAI-compat fields + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; // TODO: support oaicompat for non-chat std::string oaicompat_model; std::string oaicompat_cmpl_id; - bool verbose = false; virtual int get_index() override { return index; @@ -401,6 +405,9 @@ struct server_task_result_cmpl_partial : server_task_result { } virtual json to_json() override { + if (oaicompat) { + return to_json_oai_compat(); + } bool is_stop = stop != STOP_TYPE_NONE; // non-OAI-compat JSON json res = json { @@ -425,7 +432,7 @@ struct server_task_result_cmpl_partial : server_task_result { return res; } - virtual json to_json_oai_compat() override { + json to_json_oai_compat() { bool first = n_decoded == 0; std::string finish_reason; @@ -1461,6 +1468,7 @@ struct server_context { if (data.count("__oaicompat") != 0) { std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; slot.params.oaicompat = true; + slot.params.oaicompat_chat = json_value(data, "__oaicompat_chat", false); slot.params.oaicompat_model = json_value(data, "model", model_name); slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); } else { @@ -1850,9 +1858,11 @@ struct server_context { res->stop = slot.stop; + res->verbose = slot.params.verbose; + res->oaicompat = slot.params.oaicompat; + res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; - res->verbose = slot.params.verbose; // populate res.probs_output if (slot.params.sampling.n_probs > 0) { @@ -1899,9 +1909,11 @@ struct server_context { res->stopping_word = slot.stopping_word; res->stop = slot.stop; + res->verbose = slot.params.verbose; + res->oaicompat = slot.params.oaicompat; + res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; - res->verbose = slot.params.verbose; // populate res.probs_output if (slot.params.sampling.n_probs > 0) { @@ -3397,12 +3409,12 @@ int main(int argc, char ** argv) { ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { if (results.size() == 1) { // single result - res_ok(res, oai_compat ? results[0]->to_json_oai_compat() : results[0]->to_json()); + res_ok(res, results[0]->to_json()); } else { // multiple results (multitask) json arr = json::array(); for (auto & res : results) { - arr.push_back(oai_compat ? 
res->to_json_oai_compat() : res->to_json()); + arr.push_back(res->to_json()); } res_ok(res, arr); } @@ -3414,7 +3426,7 @@ int main(int argc, char ** argv) { } else { const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { - json res_json = oai_compat ? result->to_json_oai_compat() : result->to_json(); + json res_json = result->to_json(); if (res_json.is_array()) { for (const auto & res : res_json) { if (!server_sent_event(sink, "data", res)) { @@ -3506,7 +3518,7 @@ int main(int argc, char ** argv) { } json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - + data["__oaicompat_chat"] = true; return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); }; From db66153d921c8bf59227d0e2efa7f010bf65ec2e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:34:24 +0100 Subject: [PATCH 17/19] naming oai_compat --> oaicompat --- examples/server/server.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3685df0d99767..50415c6a0ce7a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -308,7 +308,7 @@ struct server_task_result_cmpl_final : server_task_result { virtual json to_json() override { if (oaicompat) { - return to_json_oai_compat(); + return to_json_oaicompat(); } // otherwise, non-OAI-compat JSON json res = json { @@ -334,7 +334,7 @@ struct server_task_result_cmpl_final : server_task_result { return res; } - json to_json_oai_compat() { + json to_json_oaicompat() { std::string finish_reason = "length"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { finish_reason = "stop"; @@ -406,7 +406,7 @@ struct server_task_result_cmpl_partial : server_task_result { virtual json to_json() override { if (oaicompat) { - return to_json_oai_compat(); + return to_json_oaicompat(); } bool is_stop = stop != STOP_TYPE_NONE; // non-OAI-compat JSON @@ -432,7 +432,7 @@ struct server_task_result_cmpl_partial : server_task_result { return res; } - json to_json_oai_compat() { + json to_json_oaicompat() { bool first = n_decoded == 0; std::string finish_reason; From dfa59b908f1c1f47fd79679af5a414f13beed1c0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:43:48 +0100 Subject: [PATCH 18/19] fix unwanted recursive call --- examples/server/server.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 50415c6a0ce7a..881a7b902ba91 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -307,10 +307,10 @@ struct server_task_result_cmpl_final : server_task_result { } virtual json to_json() override { - if (oaicompat) { - return to_json_oaicompat(); - } - // otherwise, non-OAI-compat JSON + return oaicompat ? 
to_json_oaicompat_chat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { json res = json { {"index", index}, {"content", content}, @@ -334,7 +334,7 @@ struct server_task_result_cmpl_final : server_task_result { return res; } - json to_json_oaicompat() { + json to_json_oaicompat_chat() { std::string finish_reason = "length"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { finish_reason = "stop"; @@ -366,7 +366,7 @@ struct server_task_result_cmpl_final : server_task_result { // extra fields for debugging purposes if (verbose) { - res["__verbose"] = to_json(); + res["__verbose"] = to_json_non_oaicompat(); } if (timings.prompt_n >= 0) { res.push_back({"timings", timings.to_json()}); @@ -3594,12 +3594,12 @@ int main(int argc, char ** argv) { const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const json body = json::parse(req.body); - bool is_openai = false; + bool oaicompat = false; // an input prompt can be a string or a list of tokens (integer) json prompt; if (body.count("input") != 0) { - is_openai = true; + oaicompat = true; prompt = body.at("input"); } else if (body.count("content") != 0) { // with "content", we only support single prompt @@ -3638,7 +3638,7 @@ int main(int argc, char ** argv) { } // write JSON response - json root = is_openai + json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : responses[0]; res_ok(res, root); From 25be4ccc89459b10fb60f6e1b87cd12f4d59928d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Dec 2024 10:47:52 +0100 Subject: [PATCH 19/19] update docs --- examples/server/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index b2dd7b65a990c..8dbed2626a444 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -473,9 +473,11 @@ Notice that each `probs` is an array of length `n_probs`. - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). - `model`: The path to the model loaded with `-m` - `prompt`: The provided `prompt` -- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token -- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered -- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided +- `stop_type`: Indicating whether the completion has stopped. Possible values are: + - `none`: Generating (not stopped) + - `eos`: Stopped because it encountered the EOS token + - `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered + - `word`: Stopped due to encountering a stopping word from `stop` JSON array provided - `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word) - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
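For illustration, here is a minimal client-side sketch of how these fields might be consumed from a non-streaming `/completion` response. It assumes nlohmann::json is available; the response values are made up for the example and are not actual server output.

```cpp
#include <iostream>
#include <string>

#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // hypothetical final response body using the fields documented above
    json res = {
        {"content",       " Blue."},
        {"stop_type",     "word"},  // one of: none, eos, limit, word
        {"stopping_word", "\n"},
        {"tokens_cached", 7},
        {"timings",       {{"predicted_per_second", 42.0}}},
    };

    const std::string stop_type = res.at("stop_type").get<std::string>();
    if (stop_type == "none") {
        std::cout << "still generating\n";
    } else if (stop_type == "limit") {
        std::cout << "stopped: n_predict limit reached\n";
    } else { // "eos" or "word"
        std::cout << "stopped: " << stop_type
                  << ", stopping_word='" << res.value("stopping_word", "") << "'\n";
    }
}
```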