From b7d38eef0c04bacfba6dd0608350073845bfdf72 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Tue, 3 Dec 2024 23:37:03 +0100 Subject: [PATCH 01/19] server : (refactoring) reduce usage of json internally --- examples/server/server.cpp | 274 ++++++++----------------------------- examples/server/server.hpp | 191 ++++++++++++++++++++++++++ examples/server/utils.hpp | 24 +--- 3 files changed, 246 insertions(+), 243 deletions(-) create mode 100644 examples/server/server.hpp diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9bca3f30e7574..1482ecbee29df 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,4 +1,5 @@ #include "utils.hpp" +#include "server.hpp" #include "arg.h" #include "common.h" @@ -32,90 +33,6 @@ using json = nlohmann::ordered_json; -enum stop_type { - STOP_TYPE_FULL, - STOP_TYPE_PARTIAL, -}; - -// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 -enum slot_state { - SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future - SLOT_STATE_PROCESSING_PROMPT, - SLOT_STATE_DONE_PROMPT, - SLOT_STATE_GENERATING, -}; - -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded -}; - -enum server_task_type { - SERVER_TASK_TYPE_INFERENCE, - SERVER_TASK_TYPE_CANCEL, - SERVER_TASK_TYPE_NEXT_RESPONSE, - SERVER_TASK_TYPE_METRICS, - SERVER_TASK_TYPE_SLOT_SAVE, - SERVER_TASK_TYPE_SLOT_RESTORE, - SERVER_TASK_TYPE_SLOT_ERASE, - SERVER_TASK_TYPE_SET_LORA, -}; - -enum server_task_inf_type { - SERVER_TASK_INF_TYPE_COMPLETION, - SERVER_TASK_INF_TYPE_EMBEDDING, - SERVER_TASK_INF_TYPE_RERANK, - SERVER_TASK_INF_TYPE_INFILL, -}; - -struct server_task { - int id = -1; // to be filled by server_queue - int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL - - llama_tokens prompt_tokens; - server_task_type type; - json data; - - server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - -struct server_task_result { - int id = -1; - - json data; - - bool stop; - bool error; -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt - - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters - - int64_t t_max_prompt_ms = -1; // TODO: implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit - - std::vector antiprompt; - - struct common_params_sampling sampling; - struct common_params_speculative speculative; -}; - struct server_slot { int id; int id_task = -1; @@ -166,8 +83,6 @@ struct server_slot { bool stopped_word = false; bool stopped_limit = false; - bool timings_per_token = false; - bool oaicompat = false; std::string oaicompat_model; @@ -255,37 +170,39 @@ struct server_slot { } } - json get_formated_timings() const { - return json { - {"prompt_n", 
n_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; + result_timings get_timings() const { + result_timings timings; + timings.prompt_n = n_prompt_tokens_processed; + timings.prompt_ms = t_prompt_processing; + timings.prompt_per_token_ms = t_prompt_processing / n_prompt_tokens_processed; + timings.prompt_per_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; + + timings.predicted_n = n_decoded; + timings.predicted_ms = t_token_generation; + timings.predicted_per_token_ms = t_token_generation / n_decoded; + timings.predicted_per_second = 1e3 / t_token_generation * n_decoded; + + return timings; } - size_t find_stopping_strings(const std::string & text, const size_t last_token_size, const stop_type type) { + size_t find_stopping_strings(const std::string & text, const size_t last_token_size, bool is_full_stop) { size_t stop_pos = std::string::npos; for (const std::string & word : params.antiprompt) { size_t pos; - if (type == STOP_TYPE_FULL) { + if (is_full_stop) { const size_t tmp = word.size() + last_token_size; const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; pos = text.find(word, from_pos); } else { + // otherwise, partial stop pos = find_partial_stop_string(word, text); } if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_TYPE_FULL) { + if (is_full_stop) { stopped_word = true; stopping_word = word; has_next_token = false; @@ -1108,14 +1025,14 @@ struct server_context { const std::string str_test = slot.generated_text.substr(pos); bool send_text = true; - size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_FULL); + size_t stop_pos = slot.find_stopping_strings(str_test, token_str.size(), true); if (stop_pos != std::string::npos) { slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); pos = std::min(slot.n_sent_text, slot.generated_text.size()); } else if (slot.has_next_token) { - stop_pos = slot.find_stopping_strings(str_test, token_str.size(), STOP_TYPE_PARTIAL); + stop_pos = slot.find_stopping_strings(str_test, token_str.size(), false); send_text = stop_pos == std::string::npos; } @@ -1229,60 +1146,6 @@ struct server_context { return slot.has_next_token; // continue } - json get_formated_generation(const server_slot & slot) const { - std::vector samplers; - samplers.reserve(slot.params.sampling.samplers.size()); - for (const auto & sampler : slot.params.sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - return json { - {"n_ctx", slot.n_ctx}, - {"n_predict", slot.n_predict}, // Server configured n_predict - {"model", params_base.model_alias}, - {"seed", slot.params.sampling.seed}, - {"seed_cur", slot.smpl ? 
common_sampler_get_seed(slot.smpl) : 0}, - {"temperature", slot.params.sampling.temp}, - {"dynatemp_range", slot.params.sampling.dynatemp_range}, - {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent}, - {"top_k", slot.params.sampling.top_k}, - {"top_p", slot.params.sampling.top_p}, - {"min_p", slot.params.sampling.min_p}, - {"xtc_probability", slot.params.sampling.xtc_probability}, - {"xtc_threshold", slot.params.sampling.xtc_threshold}, - {"typical_p", slot.params.sampling.typ_p}, - {"repeat_last_n", slot.params.sampling.penalty_last_n}, - {"repeat_penalty", slot.params.sampling.penalty_repeat}, - {"presence_penalty", slot.params.sampling.penalty_present}, - {"frequency_penalty", slot.params.sampling.penalty_freq}, - {"dry_multiplier", slot.params.sampling.dry_multiplier}, - {"dry_base", slot.params.sampling.dry_base}, - {"dry_allowed_length", slot.params.sampling.dry_allowed_length}, - {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers}, - {"mirostat", slot.params.sampling.mirostat}, - {"mirostat_tau", slot.params.sampling.mirostat_tau}, - {"mirostat_eta", slot.params.sampling.mirostat_eta}, - {"penalize_nl", slot.params.sampling.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"max_tokens", slot.params.n_predict}, // User configured n_predict - {"n_keep", slot.params.n_keep}, - {"n_discard", slot.params.n_discard}, - {"ignore_eos", slot.params.sampling.ignore_eos}, - {"stream", slot.params.stream}, - //{"logit_bias", slot.params.sampling.logit_bias}, - {"n_probs", slot.params.sampling.n_probs}, - {"min_keep", slot.params.sampling.min_keep}, - {"grammar", slot.params.sampling.grammar}, - {"samplers", samplers}, - {"speculative", slot.can_speculate()}, - {"speculative.n_max", slot.params.speculative.n_max}, - {"speculative.n_min", slot.params.speculative.n_min}, - {"speculative.p_min", slot.params.speculative.p_min}, - {"timings_per_token", slot.timings_per_token}, - }; - } - void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { send_error(task.id, error, type); } @@ -1294,27 +1157,18 @@ struct server_context { void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); - server_task_result res; + server_task_result_error res; res.id = id_task; - res.stop = false; - res.error = true; - res.data = format_error_response(error, type); + res.err_type = type; + res.err_msg = error; queue_results.send(res); } void send_partial_response(server_slot & slot, completion_token_output tkn) { - server_task_result res; + server_task_result_cmpl_partial res; res.id = slot.id_task; - res.error = false; - res.stop = false; - res.data = json { - {"content", tkn.text_to_send}, - {"stop", false}, - {"id_slot", slot.id}, - {"multimodal", false}, - {"index", slot.index}, - }; + res.content = tkn.text_to_send; if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); @@ -1323,30 +1177,35 @@ struct server_context { std::vector probs_output; if (probs_pos < probs_stop_pos) { - probs_output = std::vector( + res.probs_output = std::vector( slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); } - slot.n_sent_token_probs = probs_stop_pos; - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); - } - - if 
(slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; } - if (slot.timings_per_token) { - res.data["timings"] = slot.get_formated_timings(); + if (slot.params.timings_per_token) { + res.timings = slot.get_timings(); } queue_results.send(res); } void send_final_response(const server_slot & slot) { - server_task_result res; - res.id = slot.id_task; + server_task_result_cmpl_final res; + res.id = slot.id_task; + res.id_slot = slot.id; + res.content = slot.generated_text; + + res.n_decoded = slot.n_decoded; + res.n_prompt_tokens = slot.n_prompt_tokens; + res.has_new_line = slot.has_new_line; + res.n_tokens_cached = slot.n_past; + res.content = slot.generated_text; + + res.params = slot.params; // copy the parameters + + + res.error = false; res.stop = true; res.data = json { @@ -1370,36 +1229,27 @@ struct server_context { }; if (slot.params.sampling.n_probs > 0) { - std::vector probs; if (!slot.params.stream && slot.stopped_word) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - probs = std::vector( + res.probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end() - safe_offset); } else { - probs = std::vector( + res.probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end()); } - - res.data["completion_probabilities"] = probs_vector_to_json(ctx, probs); - } - - if (slot.oaicompat) { - res.data["oaicompat_token_ctr"] = slot.n_decoded; - res.data["model"] = slot.oaicompat_model; } queue_results.send(res); } void send_embedding(const server_slot & slot, const llama_batch & batch) { - server_task_result res; + server_task_result_embd res; res.id = slot.id_task; - res.error = false; - res.stop = true; + res.index = slot.index; const int n_embd = llama_n_embd(model); @@ -1418,20 +1268,12 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.data = json { - {"embedding", std::vector(n_embd, 0.0f)}, - {"index", slot.index}, - }; - + res.embedding = std::vector(n_embd, 0.0f); continue; } common_embd_normalize(embd, embd_res.data(), n_embd); - - res.data = json { - {"embedding", embd_res}, - {"index", slot.index}, - }; + res.embedding = embd_res; } SLT_DBG(slot, "%s", "sending embeddings\n"); @@ -1440,10 +1282,9 @@ struct server_context { } void send_rerank(const server_slot & slot, const llama_batch & batch) { - server_task_result res; + server_task_result_rerank res; res.id = slot.id_task; - res.error = false; - res.stop = true; + res.index = slot.index; for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { @@ -1458,21 +1299,14 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.data = json { - {"index", slot.index}, - {"score", -1e6}, - }; - + res.score = -1e6; continue; } - res.data = json { - {"index", slot.index}, - {"score", embd[0]}, - }; + res.score = embd[0]; } - SLT_DBG(slot, "sending rerank result, res = '%s'\n", res.data.dump().c_str()); + SLT_DBG(slot, "sending rerank result, res.score = %f\n", res.score); queue_results.send(res); } diff --git a/examples/server/server.hpp b/examples/server/server.hpp new file mode 100644 index 0000000000000..a9287bf6dbaaf --- /dev/null +++ 
b/examples/server/server.hpp
@@ -0,0 +1,191 @@
+#pragma once
+
+#include "common.h"
+#include "llama.h"
+#include "sampling.h"
+#include "speculative.h"
+
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
+#include "json.hpp"
+
+#include
+#include
+#include
+
+using json = nlohmann::ordered_json;
+
+enum stop_type {
+    STOP_TYPE_NONE,
+    STOP_TYPE_EOS,
+    STOP_TYPE_WORD,
+    STOP_TYPE_LIMIT,
+};
+
+// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+enum slot_state {
+    SLOT_STATE_IDLE,
+    SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
+    SLOT_STATE_PROCESSING_PROMPT,
+    SLOT_STATE_DONE_PROMPT,
+    SLOT_STATE_GENERATING,
+};
+
+enum server_state {
+    SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY, // Server is ready and model is loaded
+};
+
+enum server_task_type {
+    SERVER_TASK_TYPE_INFERENCE,
+    SERVER_TASK_TYPE_CANCEL,
+    SERVER_TASK_TYPE_NEXT_RESPONSE,
+    SERVER_TASK_TYPE_METRICS,
+    SERVER_TASK_TYPE_SLOT_SAVE,
+    SERVER_TASK_TYPE_SLOT_RESTORE,
+    SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
+};
+
+enum server_task_inf_type {
+    SERVER_TASK_INF_TYPE_COMPLETION,
+    SERVER_TASK_INF_TYPE_EMBEDDING,
+    SERVER_TASK_INF_TYPE_RERANK,
+    SERVER_TASK_INF_TYPE_INFILL,
+};
+
+// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11
+enum error_type {
+    ERROR_TYPE_INVALID_REQUEST,
+    ERROR_TYPE_AUTHENTICATION,
+    ERROR_TYPE_SERVER,
+    ERROR_TYPE_NOT_FOUND,
+    ERROR_TYPE_PERMISSION,
+    ERROR_TYPE_UNAVAILABLE, // custom error
+    ERROR_TYPE_NOT_SUPPORTED, // custom error
+};
+
+struct server_task {
+    int id = -1; // to be filled by server_queue
+    int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL
+
+    llama_tokens prompt_tokens;
+    server_task_type type;
+
+    // TODO @ngxson : we should get rid of json type here
+    json data;
+
+    server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION;
+
+    // utility function
+    static std::unordered_set get_list_id(const std::vector & tasks) {
+        std::unordered_set ids(tasks.size());
+        for (size_t i = 0; i < tasks.size(); i++) {
+            ids.insert(tasks[i].id);
+        }
+        return ids;
+    }
+};
+
+struct result_timings {
+    int32_t prompt_n;
+    double prompt_ms;
+    double prompt_per_token_ms;
+    double prompt_per_second;
+
+    int32_t predicted_n;
+    double predicted_ms;
+    double predicted_per_token_ms;
+    double predicted_per_second;
+};
+
+enum result_type {
+    RESULT_TYPE_CMPL_FINAL,
+    RESULT_TYPE_CMPL_PARTIAL,
+    RESULT_TYPE_EMBD,
+    RESULT_TYPE_RERANK,
+    RESULT_TYPE_ERROR,
+    RESULT_TYPE_UNKNOWN, // will throw an error
+};
+
+struct server_task_result {
+    result_type type = RESULT_TYPE_UNKNOWN;
+    int id = -1;
+    int id_slot = -1;
+};
+
+struct server_task_result_cmpl_final : server_task_result {
+    result_type type = RESULT_TYPE_CMPL_FINAL;
+    int index = 0;
+    std::string content;
+    bool stream;
+    bool timings_per_token;
+    result_timings timings;
+
+    int32_t n_decoded;
+    int32_t n_prompt_tokens;
+    int32_t has_new_line;
+    int32_t stopping_word;
+    int32_t n_tokens_cached;
+    stop_type stop = STOP_TYPE_NONE;
+    std::vector probs_output;
+
+    slot_params params;
+};
+
+struct completion_token_output {
+    llama_token tok;
+    std::string text_to_send;
+    struct token_prob {
+        llama_token tok;
+        float prob;
+    };
+    std::vector probs;
+};
+
+struct server_task_result_cmpl_partial : server_task_result {
+    result_type type = RESULT_TYPE_CMPL_PARTIAL;
+    int index = 0;
+    
std::string content; + stop_type stop = STOP_TYPE_NONE; + std::vector probs_output; + result_timings timings; +}; + +struct server_task_result_embd : server_task_result { + result_type type = RESULT_TYPE_EMBD; + int index = 0; + std::vector embedding; +}; + +struct server_task_result_rerank : server_task_result { + result_type type = RESULT_TYPE_RERANK; + int index = 0; + float score; +}; + +struct server_task_result_error : server_task_result { + result_type type = RESULT_TYPE_ERROR; + int index = 0; + error_type err_type; + std::string err_msg; +}; + +struct slot_params { + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + + std::vector antiprompt; + bool timings_per_token = false; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; +}; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index e4451532c9d0c..d65773addf231 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -3,6 +3,7 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "server.hpp" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -40,17 +41,6 @@ using json = nlohmann::ordered_json; #define QUE_ERR(fmt, ...) LOG_ERR("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) #define QUE_DBG(fmt, ...) 
LOG_DBG("que %12.*s: " fmt, 12, __func__, __VA_ARGS__) -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error -}; - template static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value @@ -485,18 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } -struct completion_token_output { - llama_token tok; - std::string text_to_send; - - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; -}; - // convert a vector of completion_token_output to json static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { json out = json::array(); From 1011a51b8780a1b53ece91201583ad0c756a7e88 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 14:16:01 +0100 Subject: [PATCH 02/19] move all response types to struct --- examples/server/server.cpp | 381 ++++++++++++++++++------------------- examples/server/server.hpp | 365 +++++++++++++++++++++++++++++++---- examples/server/utils.hpp | 1 + 3 files changed, 511 insertions(+), 236 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1482ecbee29df..de073b085dbe9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -33,6 +33,9 @@ using json = nlohmann::ordered_json; +// using shared_ptr for polymorphism of server_task_result +using task_result_ptr = std::unique_ptr; + struct server_slot { int id; int id_task = -1; @@ -79,9 +82,7 @@ struct server_slot { bool has_next_token = true; bool has_new_line = false; bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; + stop_type stop; bool oaicompat = false; @@ -115,9 +116,7 @@ struct server_slot { generated_text = ""; has_new_line = false; truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; + stop = STOP_TYPE_NONE; stopping_word = ""; n_past = 0; n_sent_text = 0; @@ -203,7 +202,7 @@ struct server_slot { if (pos != std::string::npos && (stop_pos == std::string::npos || pos < stop_pos)) { if (is_full_stop) { - stopped_word = true; + stop = STOP_TYPE_WORD; stopping_word = word; has_next_token = false; } @@ -428,8 +427,8 @@ struct server_response { // for keeping track of all tasks waiting for the result std::unordered_set waiting_task_ids; - // the main result queue - std::vector queue_results; + // the main result queue (using ptr for polymorphism) + std::vector queue_results; std::mutex mutex_results; std::condition_variable condition_results; @@ -469,7 +468,7 @@ struct server_response { } // This function blocks the thread until there is a response for one of the id_tasks - server_task_result recv(const std::unordered_set & id_tasks) { + task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ @@ -477,8 +476,8 @@ struct server_response { }); for (int i = 0; i < (int) queue_results.size(); i++) { - if (id_tasks.find(queue_results[i].id) != id_tasks.end()) { - server_task_result res = queue_results[i]; + if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { + task_result_ptr res = std::move(queue_results[i]); 
queue_results.erase(queue_results.begin() + i); return res; } @@ -489,7 +488,7 @@ struct server_response { } // single-task version of recv() - server_task_result recv(int id_task) { + task_result_ptr recv(int id_task) { std::unordered_set id_tasks = {id_task}; return recv(id_tasks); } @@ -501,9 +500,9 @@ struct server_response { std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { if (result.id == id_task) { - SRV_DBG("task id = %d moved to result queue\n", result.id); + SRV_DBG("task id = %d pushed to result queue\n", result.id); - queue_results.push_back(std::move(result)); + queue_results.push_back(std::make_unique(result)); condition_results.notify_all(); return; } @@ -694,7 +693,7 @@ struct server_context { slots.push_back(slot); } - default_generation_settings_for_props = get_formated_generation(slots.front()); + default_generation_settings_for_props = slots[0].params.to_json(); default_generation_settings_for_props["seed"] = -1; // the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens @@ -797,7 +796,7 @@ struct server_context { slot.oaicompat_model = ""; } - slot.timings_per_token = json_value(data, "timings_per_token", false); + slot.params.timings_per_token = json_value(data, "timings_per_token", false); slot.params.stream = json_value(data, "stream", false); slot.params.cache_prompt = json_value(data, "cache_prompt", true); @@ -1056,7 +1055,7 @@ struct server_context { // check the limits if (slot.n_decoded > 0 && slot.has_next_token && !slot.has_budget(params_base)) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict); @@ -1065,7 +1064,7 @@ struct server_context { if (slot.has_new_line) { // if we have already seen a new line, we stop after a certain time limit if (slot.params.t_max_predict_ms > 0 && (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms); @@ -1085,7 +1084,7 @@ struct server_context { } if (pos < slot.generated_text.size() && n_indent < slot.params.n_indent) { - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; // cut the last line @@ -1114,7 +1113,7 @@ struct server_context { // if context shift is disabled, we stop when it reaches the context limit if (slot.n_past >= slot.n_ctx) { slot.truncated = true; - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n", @@ -1122,7 +1121,7 @@ struct server_context { } if (llama_token_is_eog(model, result.tok)) { - slot.stopped_eos = true; + slot.stop = STOP_TYPE_EOS; slot.has_next_token = false; SLT_DBG(slot, "%s", "stopped by EOS\n"); @@ -1132,7 +1131,7 @@ struct server_context { if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) { slot.truncated = true; - slot.stopped_limit = true; + slot.stop = STOP_TYPE_LIMIT; slot.has_next_token = false; // stop prediction SLT_WRN(slot, @@ -1201,35 +1200,12 @@ struct server_context { res.has_new_line = slot.has_new_line; res.n_tokens_cached = slot.n_past; res.content = 
slot.generated_text; + res.stop = slot.stop; - res.params = slot.params; // copy the parameters - - - - res.error = false; - res.stop = true; - res.data = json { - {"content", !slot.params.stream ? slot.generated_text : ""}, - {"id_slot", slot.id}, - {"stop", true}, - {"model", params_base.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.n_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", common_detokenize(ctx, slot.prompt_tokens)}, - {"has_new_line", slot.has_new_line}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}, - {"index", slot.index}, - }; + res.generation_params = slot.params; // copy the parameters if (slot.params.sampling.n_probs > 0) { - if (!slot.params.stream && slot.stopped_word) { + if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); @@ -1399,25 +1375,34 @@ struct server_context { } // receive the results from task(s) created by create_tasks_inference - void receive_cmpl_results( + template + void receive_multi_results( const std::unordered_set & id_tasks, - const std::function&)> & result_handler, + const std::function&)> & result_handler, const std::function & error_handler) { - // TODO: currently, there is no way to detect the client has cancelled the request - std::vector results(id_tasks.size()); + std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { - server_task_result result = queue_results.recv(id_tasks); + task_result_ptr result_raw = queue_results.recv(id_tasks); - if (result.error) { - error_handler(result.data); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + error_handler(format_error_response(result.err_msg, result.err_type)); cancel_tasks(id_tasks); return; } - const size_t idx = result.data["index"]; - GGML_ASSERT(idx < results.size() && "index out of range"); - - results[idx] = result; + if ( + result_raw->type == RESULT_TYPE_CMPL_FINAL + || result_raw->type == RESULT_TYPE_EMBD + || result_raw->type == RESULT_TYPE_RERANK + ) { + auto result = T::from_ptr(result_raw); + const size_t idx = result.index; + GGML_ASSERT(idx < results.size() && "index out of range"); + results[idx] = result; + } else { + GGML_ASSERT(false && "unexpected result type"); + } } result_handler(results); } @@ -1425,23 +1410,27 @@ struct server_context { // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const - std::function & result_handler, const + std::function & result_handler, const std::function & error_handler) { size_t n_finished = 0; while (true) { - server_task_result result = queue_results.recv(id_tasks); - if (!result_handler(result)) { + task_result_ptr result_raw = queue_results.recv(id_tasks); + + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + error_handler(format_error_response(result.err_msg, result.err_type)); cancel_tasks(id_tasks); - break; + return; } - if (result.error) { - error_handler(result.data); + GGML_ASSERT(result_raw->type == 
RESULT_TYPE_CMPL_PARTIAL); + auto result = server_task_result_cmpl_partial::from_ptr(result_raw); + if (!result_handler(result)) { cancel_tasks(id_tasks); break; } - if (result.stop) { + if (result.stop != STOP_TYPE_NONE) { if (++n_finished == id_tasks.size()) { break; } @@ -1508,7 +1497,7 @@ struct server_context { int n_processing_slots = 0; for (server_slot & slot : slots) { - json slot_data = get_formated_generation(slot); + json slot_data = slot.params.to_json(); slot_data["id"] = slot.id; slot_data["id_task"] = slot.id_task; slot_data["is_processing"] = slot.is_processing(); @@ -1518,9 +1507,6 @@ struct server_context { {"has_new_line", slot.has_new_line}, {"n_remain", slot.n_remaining}, {"n_decoded", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, {"stopping_word", slot.stopping_word}, }; @@ -1534,34 +1520,28 @@ struct server_context { } SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); - server_task_result res; - res.id = task.id; - res.stop = true; - res.error = false; - res.data = { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", queue_tasks.queue_tasks_deferred.size() }, - { "t_start", metrics.t_start}, + server_task_result_metrics res; + res.id = task.id; + res.n_idle_slots = n_idle_slots; + res.n_processing_slots = n_processing_slots; + res.n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res.t_start = metrics.t_start; - { "n_prompt_tokens_processed_total", metrics.n_prompt_tokens_processed_total}, - { "t_tokens_generation_total", metrics.t_tokens_generation_total}, - { "n_tokens_predicted_total", metrics.n_tokens_predicted_total}, - { "t_prompt_processing_total", metrics.t_prompt_processing_total}, + res.kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); + res.kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); - { "n_prompt_tokens_processed", metrics.n_prompt_tokens_processed}, - { "t_prompt_processing", metrics.t_prompt_processing}, - { "n_tokens_predicted", metrics.n_tokens_predicted}, - { "t_tokens_generation", metrics.t_tokens_generation}, + res.n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; + res.t_prompt_processing_total = metrics.t_prompt_processing_total; + res.n_tokens_predicted_total = metrics.n_tokens_predicted_total; + res.t_tokens_generation_total = metrics.t_tokens_generation_total; - { "n_decode_total", metrics.n_decode_total}, - { "n_busy_slots_total", metrics.n_busy_slots_total}, + res.n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; + res.t_prompt_processing = metrics.t_prompt_processing; + res.n_tokens_predicted = metrics.n_tokens_predicted; + res.t_tokens_generation = metrics.t_tokens_generation; - { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, - { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, - - { "slots", slots_data }, - }; + res.n_decode_total = metrics.n_decode_total; + res.n_busy_slots_total = metrics.n_busy_slots_total; if (json_value(task.data, "reset_bucket", false)) { metrics.reset_bucket(); @@ -1594,19 +1574,14 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", token_count }, // tokens saved - { "n_written", nwrite }, // bytes 
written - { "timings", { - { "save_ms", t_save_ms } - } } - }; + server_task_result_slot_save_load result; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = true; + result.n_saved = token_count; + result.n_written = nwrite; + result.t_ms = t_save_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: @@ -1642,19 +1617,14 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", token_count }, // tokens restored - { "n_read", nread }, // bytes read - { "timings", { - { "restore_ms", t_restore_ms } - } } - }; + server_task_result_slot_save_load result; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = false; + result.n_saved = token_count; + result.n_read = nread; + result.t_ms = t_restore_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_ERASE: @@ -1677,24 +1647,17 @@ struct server_context { llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); - server_task_result result; - result.id = task.id; - result.stop = true; - result.error = false; - result.data = json { - { "id_slot", id_slot }, - { "n_erased", n_erased } - }; + server_task_result_slot_erase result; + result.id = task.id; + result.id_slot = id_slot; + result.n_erased = n_erased; queue_results.send(result); } break; case SERVER_TASK_TYPE_SET_LORA: { common_lora_adapters_apply(ctx, loras); - server_task_result result; + server_task_result_apply_lora result; result.id = task.id; - result.stop = true; - result.error = false; - result.data = json{{ "success", true }}; queue_results.send(result); } break; } @@ -2456,19 +2419,26 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); + task_result_ptr result_raw = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); + if (result_raw->type != RESULT_TYPE_METRICS) { + SRV_ERR("Unexpected result type: %d\n", result_raw->type); + res_error(res, format_error_response("Unexpected result type", ERROR_TYPE_SERVER)); + return; + } + + auto result = server_task_result_metrics::from_ptr(result_raw); + // optionally return "fail_on_no_slot" error - const int n_idle_slots = result.data.at("idle"); if (req.has_param("fail_on_no_slot")) { - if (n_idle_slots == 0) { + if (result.n_idle_slots == 0) { res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); return; } } - res_ok(res, result.data.at("slots")); + res_ok(res, result.slots_data); }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { @@ -2488,73 +2458,68 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - server_task_result result = ctx_server.queue_results.recv(task.id); + task_result_ptr result_raw = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; + } - json data = result.data; - - const 
uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed"); - const uint64_t t_prompt_processing = data.at("t_prompt_processing"); - - const uint64_t n_tokens_predicted = data.at("n_tokens_predicted"); - const uint64_t t_tokens_generation = data.at("t_tokens_generation"); - - const uint64_t n_decode_total = data.at("n_decode_total"); - const uint64_t n_busy_slots_total = data.at("n_busy_slots_total"); - - const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells"); + GGML_ASSERT(result_raw->type == RESULT_TYPE_METRICS); + auto result = server_task_result_metrics::from_ptr(result_raw); // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names json all_metrics_def = json { {"counter", {{ {"name", "prompt_tokens_total"}, {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) data.at("n_prompt_tokens_processed_total")} + {"value", (uint64_t) result.n_prompt_tokens_processed_total} }, { {"name", "prompt_seconds_total"}, {"help", "Prompt process time"}, - {"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3} + {"value", (uint64_t) result.t_prompt_processing_total / 1.e3} }, { {"name", "tokens_predicted_total"}, {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) data.at("n_tokens_predicted_total")} + {"value", (uint64_t) result.n_tokens_predicted_total} }, { {"name", "tokens_predicted_seconds_total"}, {"help", "Predict process time"}, - {"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3} + {"value", (uint64_t) result.t_tokens_generation_total / 1.e3} }, { {"name", "n_decode_total"}, {"help", "Total number of llama_decode() calls"}, - {"value", n_decode_total} + {"value", result.n_decode_total} }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) n_busy_slots_total / (float) n_decode_total} + {"value", (float) result.n_busy_slots_total / (float) result.n_decode_total} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, {"help", "Average prompt throughput in tokens/s."}, - {"value", n_prompt_tokens_processed ? 1.e3 / t_prompt_processing * n_prompt_tokens_processed : 0.} + {"value", result.n_prompt_tokens_processed ? 1.e3 / result.t_prompt_processing * result.n_prompt_tokens_processed : 0.} },{ {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, - {"value", n_tokens_predicted ? 1.e3 / t_tokens_generation * n_tokens_predicted : 0.} + {"value", result.n_tokens_predicted ? 1.e3 / result.t_tokens_generation * result.n_tokens_predicted : 0.} },{ {"name", "kv_cache_usage_ratio"}, {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. * kv_cache_used_cells / params.n_ctx} + {"value", 1. 
* result.kv_cache_used_cells / params.n_ctx} },{ {"name", "kv_cache_tokens"}, {"help", "KV-cache tokens."}, - {"value", (uint64_t) data.at("kv_cache_tokens_count")} + {"value", (uint64_t) result.kv_cache_tokens_count} },{ {"name", "requests_processing"}, {"help", "Number of request processing."}, - {"value", (uint64_t) data.at("processing")} + {"value", (uint64_t) result.n_processing_slots} },{ {"name", "requests_deferred"}, {"help", "Number of request deferred."}, - {"value", (uint64_t) data.at("deferred")} + {"value", (uint64_t) result.n_tasks_deferred} }}} }; @@ -2575,8 +2540,7 @@ int main(int argc, char ** argv) { } } - const int64_t t_start = data.at("t_start"); - res.set_header("Process-Start-Time-Unix", std::to_string(t_start)); + res.set_header("Process-Start-Time-Unix", std::to_string(result.t_start)); res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.status = 200; // HTTP OK @@ -2602,14 +2566,18 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; } + + GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_SAVE_LOAD); + auto result = server_task_result_slot_save_load::from_ptr(result_raw); + res_ok(res, result.to_json()); }; const auto handle_slots_restore = [&ctx_server, &res_error, &res_ok, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { @@ -2632,14 +2600,18 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; } + + GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_SAVE_LOAD); + auto result = server_task_result_slot_save_load::from_ptr(result_raw); + res_ok(res, result.to_json()); }; const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { @@ -2652,14 +2624,18 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result.error) { - res_error(res, result.data); - } else { - res_ok(res, result.data); + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; } + + GGML_ASSERT(result_raw->type 
== RESULT_TYPE_SLOT_ERASE); + auto result = server_task_result_slot_erase::from_ptr(result_raw); + res_ok(res, result.to_json()); }; const auto handle_slots_action = [¶ms, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { @@ -2728,15 +2704,15 @@ int main(int argc, char ** argv) { const auto task_ids = server_task::get_list_id(tasks); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { if (results.size() == 1) { // single result - res_ok(res, results[0].data); + res_ok(res, results[0].to_json()); } else { // multiple results (multitask) json arr = json::array(); - for (const auto & res : results) { - arr.push_back(res.data); + for (auto & res : results) { + arr.push_back(res.to_json()); } res_ok(res, arr); } @@ -2747,8 +2723,8 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { - return server_sent_event(sink, "data", result.data); + ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { + return server_sent_event(sink, "data", result.to_json()); }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); @@ -2837,9 +2813,9 @@ int main(int argc, char ** argv) { const auto completion_id = gen_chatcmplid(); if (!stream) { - ctx_server.receive_cmpl_results(task_ids, [&](const std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].data, completion_id, /*.streaming =*/ false, verbose); + json result_oai = format_final_response_oaicompat(data, results[0].to_json(), completion_id, /*.streaming =*/ false, verbose); res_ok(res, result_oai); }, [&](const json & error_data) { res_error(res, error_data); @@ -2848,8 +2824,8 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](const server_task_result & result) -> bool { - std::vector result_array = format_partial_response_oaicompat(result.data, completion_id); + ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { + std::vector result_array = format_partial_response_oaicompat(result.to_json(), completion_id); for (auto & event_data : result_array) { if (event_data.empty()) { continue; // skip the stop token @@ -2974,9 +2950,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { - for (const auto & res : results) { - responses.push_back(res.data); + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + for (auto & res : results) { + GGML_ASSERT(res.type == RESULT_TYPE_EMBD); + responses.push_back(res.to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3052,9 +3029,10 @@ int main(int argc, char ** argv) { // 
get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_cmpl_results(task_ids, [&](std::vector & results) { - for (const auto & res : results) { - responses.push_back(res.data); + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + for (auto & res : results) { + GGML_ASSERT(res.type == RESULT_TYPE_RERANK); + responses.push_back(res.to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3110,11 +3088,18 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - server_task_result result = ctx_server.queue_results.recv(id_task); + task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - res_ok(res, result.data); - res.status = 200; // HTTP OK + if (result_raw->type == RESULT_TYPE_ERROR) { + auto result = server_task_result_error::from_ptr(result_raw); + res_error(res, format_error_response(result.err_msg, result.err_type)); + return; + } + + GGML_ASSERT(result_raw->type == RESULT_TYPE_APPLY_LORA); + auto result = server_task_result_apply_lora::from_ptr(result_raw); + res_ok(res, result.to_json()); }; // diff --git a/examples/server/server.hpp b/examples/server/server.hpp index a9287bf6dbaaf..081ad2069b05e 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -15,6 +15,8 @@ using json = nlohmann::ordered_json; +#define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())) + enum stop_type { STOP_TYPE_NONE, STOP_TYPE_EOS, @@ -65,6 +67,19 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; +enum result_type { + RESULT_TYPE_CMPL_FINAL, + RESULT_TYPE_CMPL_PARTIAL, + RESULT_TYPE_EMBD, + RESULT_TYPE_RERANK, + RESULT_TYPE_METRICS, + RESULT_TYPE_SLOT_SAVE_LOAD, + RESULT_TYPE_SLOT_ERASE, + RESULT_TYPE_APPLY_LORA, + RESULT_TYPE_ERROR, + RESULT_TYPE_UNKNOWN, // will throw an error +}; + struct server_task { int id = -1; // to be filled by server_queue int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL @@ -87,41 +102,145 @@ struct server_task { } }; +struct slot_params { + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + + std::vector antiprompt; + bool timings_per_token = false; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; + + // params only used in to_json() + int32_t n_ctx; + uint32_t seed_cur; + bool can_speculative; + + json to_json() { + std::vector samplers; + samplers.reserve(sampling.samplers.size()); + for (const auto & sampler : sampling.samplers) { + samplers.emplace_back(common_sampler_type_to_str(sampler)); + } + + return json { + {"n_ctx", n_ctx}, + {"n_predict", n_predict}, // Server configured n_predict + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", 
sampling.min_p}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", sampling.dry_sequence_breakers}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"penalize_nl", sampling.penalize_nl}, + {"stop", antiprompt}, + {"max_tokens", n_predict}, // User configured n_predict + {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + //{"logit_bias", sampling.logit_bias}, + {"n_probs", sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"grammar", sampling.grammar}, + {"samplers", samplers}, + {"speculative", can_speculative}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + }; + } +}; + struct result_timings { - int32_t prompt_n; + int32_t prompt_n = -1; double prompt_ms; double prompt_per_token_ms; double prompt_per_second; - int32_t predicted_n; + int32_t predicted_n = -1; double predicted_ms; double predicted_per_token_ms; double predicted_per_second; -}; -enum result_type { - RESULT_TYPE_CMPL_FINAL, - RESULT_TYPE_CMPL_PARTIAL, - RESULT_TYPE_EMBD, - RESULT_TYPE_RERANK, - RESULT_TYPE_ERROR, - RESULT_TYPE_UNKNOWN, // will throw an error + json to_json() { + return { + {"prompt_n", prompt_n}, + {"prompt_ms", prompt_ms}, + {"prompt_per_token_ms", prompt_per_token_ms}, + {"prompt_per_second", prompt_per_second}, + + {"predicted_n", predicted_n}, + {"predicted_ms", predicted_ms}, + {"predicted_per_token_ms", predicted_per_token_ms}, + {"predicted_per_second", predicted_per_second}, + }; + } }; struct server_task_result { result_type type = RESULT_TYPE_UNKNOWN; int id = -1; int id_slot = -1; + server_task_result() = default; + server_task_result(result_type type) : type(type) {} +}; + +inline std::string stop_type_to_str(stop_type type) { + switch (type) { + case STOP_TYPE_EOS: return "eos"; + case STOP_TYPE_WORD: return "word"; + case STOP_TYPE_LIMIT: return "limit"; + default: return "none"; + } +} + +struct completion_token_output { + llama_token tok; + std::string text_to_send; + struct token_prob { + llama_token tok; + float prob; + }; + std::vector probs; }; struct server_task_result_cmpl_final : server_task_result { - result_type type = RESULT_TYPE_CMPL_FINAL; + server_task_result_cmpl_final() : server_task_result(RESULT_TYPE_CMPL_FINAL) {} int index = 0; std::string content; bool stream; bool timings_per_token; result_timings timings; + std::string model_alias; + std::string prompt; + bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; int32_t has_new_line; @@ -130,62 +249,232 @@ struct server_task_result_cmpl_final : server_task_result { stop_type stop = STOP_TYPE_NONE; std::vector probs_output; - slot_params params; -}; + slot_params generation_params; + + json to_json() { + // non-OAI-compat JSON + return json { + {"index", index}, + {"content", content}, + {"id_slot", id_slot}, + {"stop", true}, + {"model", model_alias}, + {"tokens_predicted", 
n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + {"generation_settings", generation_params.to_json()}, + {"prompt", prompt}, + {"has_new_line", has_new_line}, + {"truncated", truncated}, + {"stop_type", stop_type_to_str(stop)}, + {"stopping_word", stopping_word}, + {"tokens_cached", n_tokens_cached}, + {"timings", timings.to_json()}, + }; + } -struct completion_token_output { - llama_token tok; - std::string text_to_send; - struct token_prob { - llama_token tok; - float prob; - }; - std::vector probs; + static server_task_result_cmpl_final from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_cmpl_final, result_ptr); + } }; struct server_task_result_cmpl_partial : server_task_result { - result_type type = RESULT_TYPE_CMPL_PARTIAL; + server_task_result_cmpl_partial() : server_task_result(RESULT_TYPE_CMPL_PARTIAL) {} int index = 0; std::string content; stop_type stop = STOP_TYPE_NONE; std::vector probs_output; result_timings timings; + + json to_json() { + json res = json { + {"index", index}, + {"content", content}, + {"stop", stop != STOP_TYPE_NONE}, + {"id_slot", id_slot}, + }; + // populate the timings object when timings_per_token is set + if (timings.prompt_n > 0) { + res.push_back({"timings", timings.to_json()}); + } + return res; + } + + static server_task_result_cmpl_partial from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_cmpl_partial, result_ptr); + } }; struct server_task_result_embd : server_task_result { + server_task_result_embd() : server_task_result(RESULT_TYPE_EMBD) {} result_type type = RESULT_TYPE_EMBD; int index = 0; std::vector embedding; + + json to_json() { + return json { + {"index", index}, + {"embedding", embedding}, + }; + } + + static server_task_result_embd from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_embd, result_ptr); + } }; struct server_task_result_rerank : server_task_result { - result_type type = RESULT_TYPE_RERANK; + server_task_result_rerank() : server_task_result(RESULT_TYPE_RERANK) {} int index = 0; float score; + + json to_json() { + return json { + {"index", index}, + {"score", score}, + }; + } + + static server_task_result_rerank from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_rerank, result_ptr); + } }; struct server_task_result_error : server_task_result { - result_type type = RESULT_TYPE_ERROR; + server_task_result_error() : server_task_result(RESULT_TYPE_ERROR) {} int index = 0; - error_type err_type; + error_type err_type = ERROR_TYPE_SERVER; std::string err_msg; + + static server_task_result_error from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_error, result_ptr); + } }; -struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt +struct server_task_result_metrics : server_task_result { + server_task_result_metrics() : server_task_result(RESULT_TYPE_METRICS) {} + int n_idle_slots; + int n_processing_slots; + int n_tasks_deferred; + int64_t t_start; - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + int32_t kv_cache_tokens_count; + int32_t kv_cache_used_cells; - int64_t t_max_prompt_ms = -1; // TODO: 
implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; + uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; - std::vector antiprompt; - bool timings_per_token = false; + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; - struct common_params_sampling sampling; - struct common_params_speculative speculative; + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + uint64_t n_decode_total = 0; + uint64_t n_busy_slots_total = 0; + + // TODO: get rid of this json object and use to_json() instead + json slots_data = json::array(); + + json to_json() { + return json { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", n_tasks_deferred }, + { "t_start", t_start }, + + { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, + { "t_tokens_generation_total", t_tokens_generation_total }, + { "n_tokens_predicted_total", n_tokens_predicted_total }, + { "t_prompt_processing_total", t_prompt_processing_total }, + + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, + { "t_prompt_processing", t_prompt_processing }, + { "n_tokens_predicted", n_tokens_predicted }, + { "t_tokens_generation", t_tokens_generation }, + + { "n_decode_total", n_decode_total }, + { "n_busy_slots_total", n_busy_slots_total }, + + { "kv_cache_tokens_count", kv_cache_tokens_count }, + { "kv_cache_used_cells", kv_cache_used_cells }, + + { "slots", slots_data }, + }; + } + + static server_task_result_metrics from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_metrics, result_ptr); + } +}; + +struct server_task_result_slot_save_load : server_task_result { + server_task_result_slot_save_load() : server_task_result(RESULT_TYPE_SLOT_SAVE_LOAD) {} + std::string filename; + bool is_save; // true = save, false = load + + size_t n_saved; + size_t n_written; + + size_t n_restored; + size_t n_read; + + double t_ms; + + json to_json() { + if (is_save) { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", n_saved }, + { "n_written", n_written }, + { "timings", { + { "save_ms", t_ms } + }}, + }; + } else { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", n_restored }, + { "n_read", n_read }, + { "timings", { + { "restore_ms", t_ms } + }}, + }; + } + } + + static server_task_result_slot_save_load from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_slot_save_load, result_ptr); + } +}; + +struct server_task_result_slot_erase : server_task_result { + server_task_result_slot_erase() : server_task_result(RESULT_TYPE_SLOT_ERASE) {} + size_t n_erased; + + json to_json() { + return json { + { "id_slot", id_slot }, + { "n_erased", n_erased }, + }; + } + + static server_task_result_slot_erase from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_slot_erase, result_ptr); + } +}; + +struct server_task_result_apply_lora : server_task_result { + server_task_result_apply_lora() : server_task_result(RESULT_TYPE_APPLY_LORA) {} + json to_json() { + return json {{ "success", true }}; + } + + static server_task_result_apply_lora from_ptr(std::unique_ptr & result_ptr) { + return copy_cast_ptr(server_task_result_apply_lora, result_ptr); + } }; diff --git 
a/examples/server/utils.hpp b/examples/server/utils.hpp index d65773addf231..b01a7757fc259 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" From 0d6485f0f830d9fd3de5680e861f897d6e9312aa Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 15:03:37 +0100 Subject: [PATCH 03/19] wip [no ci] --- examples/server/server.cpp | 26 +++++++++----- examples/server/server.hpp | 2 ++ examples/server/utils.hpp | 71 +++++++++++++++++--------------------- 3 files changed, 51 insertions(+), 48 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index de073b085dbe9..a673fb4158540 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -494,7 +494,9 @@ struct server_response { } // Send a new result to a waiting id_task - void send(server_task_result & result) { + template + void send(T & result) { + static_assert(std::is_base_of::value, "T must be derived from server_task_result"); SRV_DBG("sending result for task id = %d\n", result.id); std::unique_lock lock(mutex_results); @@ -502,7 +504,7 @@ struct server_response { if (result.id == id_task) { SRV_DBG("task id = %d pushed to result queue\n", result.id); - queue_results.push_back(std::make_unique(result)); + queue_results.push_back(std::make_unique(std::move(result))); condition_results.notify_all(); return; } @@ -1166,8 +1168,10 @@ struct server_context { void send_partial_response(server_slot & slot, completion_token_output tkn) { server_task_result_cmpl_partial res; - res.id = slot.id_task; - res.content = tkn.text_to_send; + res.id = slot.id_task; + res.n_decoded = slot.n_decoded; + res.n_prompt_tokens = slot.n_prompt_tokens; + res.content = tkn.text_to_send; if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); @@ -1189,7 +1193,11 @@ struct server_context { queue_results.send(res); } - void send_final_response(const server_slot & slot) { + void send_final_response(server_slot & slot) { + if (slot.params.stream) { + return send_partial_response(slot, {0, "", {}}); + } + server_task_result_cmpl_final res; res.id = slot.id_task; res.id_slot = slot.id; @@ -1380,6 +1388,7 @@ struct server_context { const std::unordered_set & id_tasks, const std::function&)> & result_handler, const std::function & error_handler) { + static_assert(std::is_base_of::value, "T must be derived from server_task_result"); std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { task_result_ptr result_raw = queue_results.recv(id_tasks); @@ -2815,7 +2824,7 @@ int main(int argc, char ** argv) { if (!stream) { ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0].to_json(), completion_id, /*.streaming =*/ false, verbose); + json result_oai = format_final_response_oaicompat(data, results[0], completion_id, /*.streaming =*/ false, verbose); res_ok(res, result_oai); }, [&](const json & error_data) { res_error(res, error_data); @@ -2823,9 +2832,10 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server, completion_id](size_t, httplib::DataSink & sink) { + std::string model_name = json_value(data, "model", 
std::string(DEFAULT_OAICOMPAT_MODEL)); + const auto chunked_content_provider = [task_ids, &ctx_server, completion_id, model_name](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { - std::vector result_array = format_partial_response_oaicompat(result.to_json(), completion_id); + std::vector result_array = format_partial_response_oaicompat(model_name, result, completion_id); for (auto & event_data : result_array) { if (event_data.empty()) { continue; // skip the stop token diff --git a/examples/server/server.hpp b/examples/server/server.hpp index 081ad2069b05e..6197ae56519fe 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -281,6 +281,8 @@ struct server_task_result_cmpl_partial : server_task_result { server_task_result_cmpl_partial() : server_task_result(RESULT_TYPE_CMPL_PARTIAL) {} int index = 0; std::string content; + int32_t n_decoded; + int32_t n_prompt_tokens; stop_type stop = STOP_TYPE_NONE; std::vector probs_output; result_timings timings; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index b01a7757fc259..98a777192027c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -583,15 +583,14 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat(const json & request, const json & result, const std::string & completion_id, bool streaming = false, bool verbose = false) { - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - +static json format_final_response_oaicompat( + const json & request, + server_task_result_cmpl_final & result, + const std::string & completion_id, + bool streaming = false, + bool verbose = false) { std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { + if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) { finish_reason = "stop"; } @@ -601,7 +600,7 @@ static json format_final_response_oaicompat(const json & request, const json & r {"delta", json::object()}}}) : json::array({json{{"finish_reason", finish_reason}, {"index", 0}, - {"message", json{{"content", content}, + {"message", json{{"content", result.content}, {"role", "assistant"}}}}}); std::time_t t = std::time(0); @@ -613,48 +612,42 @@ static json format_final_response_oaicompat(const json & request, const json & r json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, {"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} + {"completion_tokens", result.n_decoded}, + {"prompt_tokens", result.n_prompt_tokens}, + {"total_tokens", result.n_decoded + result.n_prompt_tokens} }}, {"id", completion_id} }; // extra fields for debugging purposes if (verbose) { - res["__verbose"] = result; + res["__verbose"] = result.to_json(); } - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - } + // TODO: fix this + // if (result.contains("completion_probabilities")) { + // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + // } - if (result.contains("timings")) { - res.push_back({"timings", json_value(result, "timings", json::object())}); + if (result.timings.prompt_n >= 0) { + res.push_back({"timings", result.timings.to_json()}); } return res; } // return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat(const json & result, const std::string & completion_id) { - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({result}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); +static std::vector format_partial_response_oaicompat( + std::string modelname, + server_task_result_cmpl_partial & result, + const std::string & completion_id) { + bool first = result.n_decoded == 0; + std::string content = result.content; std::string finish_reason; - if (stopped_word || stopped_eos) { + if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) { finish_reason = "stop"; - } - if (stopped_limit) { + } else if (result.stop == STOP_TYPE_LIMIT) { finish_reason = "length"; } @@ -724,17 +717,15 @@ static std::vector format_partial_response_oaicompat(const json & result, {"object", "chat.completion.chunk"} }; - if (result.contains("timings")) { - ret.push_back({"timings", json_value(result, "timings", json::object())}); + if (result.timings.prompt_n >= 0) { + ret.push_back({"timings", result.timings.to_json()}); } if (!finish_reason.empty()) { - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); ret.push_back({"usage", json { - {"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens} + {"completion_tokens", result.n_decoded}, + {"prompt_tokens", result.n_prompt_tokens}, + {"total_tokens", result.n_decoded + result.n_prompt_tokens} }}); } From d2419b325588e4086819e5be412b274679ee527a Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 18:58:16 +0100 Subject: [PATCH 04/19] many fixes --- examples/server/server.cpp | 26 ++++++++++++------- examples/server/server.hpp | 24 ++++++++++++----- examples/server/tests/tests.sh | 4 +++ .../server/tests/unit/test_chat_completion.py | 15 +++++------ 4 
files changed, 45 insertions(+), 24 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a673fb4158540..c26bc08674d58 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1172,6 +1172,8 @@ struct server_context { res.n_decoded = slot.n_decoded; res.n_prompt_tokens = slot.n_prompt_tokens; res.content = tkn.text_to_send; + res.stop = slot.stop; + res.truncated = slot.truncated; if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); @@ -1186,7 +1188,8 @@ struct server_context { } } - if (slot.params.timings_per_token) { + // populate timings if this is final response or timings_per_token is enabled + if (slot.stop != STOP_TYPE_NONE || slot.params.timings_per_token) { res.timings = slot.get_timings(); } @@ -1195,6 +1198,7 @@ struct server_context { void send_final_response(server_slot & slot) { if (slot.params.stream) { + // if in stream mode, send the last partial response return send_partial_response(slot, {0, "", {}}); } @@ -1209,6 +1213,8 @@ struct server_context { res.n_tokens_cached = slot.n_past; res.content = slot.generated_text; res.stop = slot.stop; + res.truncated = slot.truncated; + res.timings = slot.get_timings(); res.generation_params = slot.params; // copy the parameters @@ -1439,6 +1445,8 @@ struct server_context { break; } + SRV_ERR("received partial result, %s\n", result.to_json().dump().c_str()); + if (result.stop != STOP_TYPE_NONE) { if (++n_finished == id_tasks.size()) { break; @@ -1533,7 +1541,7 @@ struct server_context { res.id = task.id; res.n_idle_slots = n_idle_slots; res.n_processing_slots = n_processing_slots; - res.n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res.n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); res.t_start = metrics.t_start; res.kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); @@ -1627,13 +1635,13 @@ struct server_context { const double t_restore_ms = (t_end - t_start) / 1000.0; server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = false; - result.n_saved = token_count; - result.n_read = nread; - result.t_ms = t_restore_ms; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = false; + result.n_restored = token_count; + result.n_read = nread; + result.t_ms = t_restore_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_ERASE: diff --git a/examples/server/server.hpp b/examples/server/server.hpp index 6197ae56519fe..3e2fd2f527e28 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -15,6 +15,7 @@ using json = nlohmann::ordered_json; +// cast a shared_ptr to a specific type using copy constructor #define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())) enum stop_type { @@ -281,23 +282,34 @@ struct server_task_result_cmpl_partial : server_task_result { server_task_result_cmpl_partial() : server_task_result(RESULT_TYPE_CMPL_PARTIAL) {} int index = 0; std::string content; + + bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; + stop_type stop = STOP_TYPE_NONE; std::vector probs_output; result_timings timings; json to_json() { + bool is_stop = stop != STOP_TYPE_NONE; + // non-OAI-compat JSON json res = json { - {"index", index}, - {"content", content}, - {"stop", stop != STOP_TYPE_NONE}, - {"id_slot", id_slot}, + {"index", index}, + {"content", content}, + {"stop_type", stop_type_to_str(stop)}, + {"stop", 
is_stop}, + {"id_slot", id_slot}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, }; - // populate the timings object when timings_per_token is set + // populate the timings object when needed (usually for the last response or with timings_per_token enabled) if (timings.prompt_n > 0) { res.push_back({"timings", timings.to_json()}); } + if (is_stop) { + res.push_back({"truncated", truncated}); + } return res; } @@ -464,7 +476,7 @@ struct server_task_result_slot_erase : server_task_result { { "n_erased", n_erased }, }; } - + static server_task_result_slot_erase from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_slot_erase, result_ptr); } diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh index 1e285dcdac14b..1e0777de367fc 100755 --- a/examples/server/tests/tests.sh +++ b/examples/server/tests/tests.sh @@ -1,5 +1,9 @@ #!/bin/bash +# make sure we are in the right directory +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd $SCRIPT_DIR + set -eu if [ $# -lt 1 ] diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 8a439f9ef0f29..486c1f87a0856 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -12,13 +12,13 @@ def create_server(): @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", + "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), + ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): +def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server server.start() res = server.make_request("POST", "/chat/completions", data={ @@ -35,10 +35,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte choice = res.body["choices"][0] assert "assistant" == choice["message"]["role"] assert match_regex(re_content, choice["message"]["content"]) - if truncated: - assert choice["finish_reason"] == "length" - else: - assert choice["finish_reason"] == "stop" + assert choice["finish_reason"] == finish_reason @pytest.mark.parametrize( @@ -93,7 +90,7 @@ def test_chat_completion_with_openai_library(): temperature=0.8, ) print(res) - assert res.choices[0].finish_reason == "stop" + assert res.choices[0].finish_reason == "length" assert res.choices[0].message.content is not None assert match_regex("(Suddenly)+", res.choices[0].message.content) From ea1be7f8acf7ec0a04327b6ec899d16c9293629b Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 19:18:56 +0100 Subject: [PATCH 05/19] add virtual function --- examples/server/server.cpp | 28 ++++++++++++++-------------- examples/server/server.hpp | 35 ++++++++++++++++++++++++----------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/examples/server/server.cpp 
b/examples/server/server.cpp index c26bc08674d58..d299d7274e91e 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1592,13 +1592,13 @@ struct server_context { const double t_save_ms = (t_end - t_start) / 1000.0; server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = true; - result.n_saved = token_count; - result.n_written = nwrite; - result.t_ms = t_save_ms; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = true; + result.n_tokens = token_count; + result.n_bytes = nwrite; + result.t_ms = t_save_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: @@ -1635,13 +1635,13 @@ struct server_context { const double t_restore_ms = (t_end - t_start) / 1000.0; server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = false; - result.n_restored = token_count; - result.n_read = nread; - result.t_ms = t_restore_ms; + result.id = task.id; + result.id_slot = id_slot; + result.filename = filename; + result.is_save = false; + result.n_tokens = token_count; + result.n_bytes = nread; + result.t_ms = t_restore_ms; queue_results.send(result); } break; case SERVER_TASK_TYPE_SLOT_ERASE: diff --git a/examples/server/server.hpp b/examples/server/server.hpp index 3e2fd2f527e28..e9c94fa56484c 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -16,7 +16,7 @@ using json = nlohmann::ordered_json; // cast a shared_ptr to a specific type using copy constructor -#define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())) +#define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())); enum stop_type { STOP_TYPE_NONE, @@ -210,6 +210,7 @@ struct server_task_result { int id_slot = -1; server_task_result() = default; server_task_result(result_type type) : type(type) {} + virtual ~server_task_result() = default; }; inline std::string stop_type_to_str(stop_type type) { @@ -276,6 +277,8 @@ struct server_task_result_cmpl_final : server_task_result { static server_task_result_cmpl_final from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_cmpl_final, result_ptr); } + + virtual ~server_task_result_cmpl_final() = default; }; struct server_task_result_cmpl_partial : server_task_result { @@ -316,6 +319,8 @@ struct server_task_result_cmpl_partial : server_task_result { static server_task_result_cmpl_partial from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_cmpl_partial, result_ptr); } + + virtual ~server_task_result_cmpl_partial() = default; }; struct server_task_result_embd : server_task_result { @@ -334,6 +339,8 @@ struct server_task_result_embd : server_task_result { static server_task_result_embd from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_embd, result_ptr); } + + virtual ~server_task_result_embd() = default; }; struct server_task_result_rerank : server_task_result { @@ -351,6 +358,8 @@ struct server_task_result_rerank : server_task_result { static server_task_result_rerank from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_rerank, result_ptr); } + + virtual ~server_task_result_rerank() = default; }; struct server_task_result_error : server_task_result { @@ -362,6 +371,8 @@ struct server_task_result_error : server_task_result { static server_task_result_error from_ptr(std::unique_ptr & result_ptr) { return 
copy_cast_ptr(server_task_result_error, result_ptr); } + + virtual ~server_task_result_error() = default; }; struct server_task_result_metrics : server_task_result { @@ -422,6 +433,8 @@ struct server_task_result_metrics : server_task_result { static server_task_result_metrics from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_metrics, result_ptr); } + + virtual ~server_task_result_metrics() = default; }; struct server_task_result_slot_save_load : server_task_result { @@ -429,12 +442,8 @@ struct server_task_result_slot_save_load : server_task_result { std::string filename; bool is_save; // true = save, false = load - size_t n_saved; - size_t n_written; - - size_t n_restored; - size_t n_read; - + size_t n_tokens; + size_t n_bytes; double t_ms; json to_json() { @@ -442,8 +451,8 @@ struct server_task_result_slot_save_load : server_task_result { return json { { "id_slot", id_slot }, { "filename", filename }, - { "n_saved", n_saved }, - { "n_written", n_written }, + { "n_saved", n_tokens }, + { "n_written", n_bytes }, { "timings", { { "save_ms", t_ms } }}, @@ -452,8 +461,8 @@ struct server_task_result_slot_save_load : server_task_result { return json { { "id_slot", id_slot }, { "filename", filename }, - { "n_restored", n_restored }, - { "n_read", n_read }, + { "n_restored", n_tokens }, + { "n_read", n_bytes }, { "timings", { { "restore_ms", t_ms } }}, @@ -464,6 +473,8 @@ struct server_task_result_slot_save_load : server_task_result { static server_task_result_slot_save_load from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_slot_save_load, result_ptr); } + + virtual ~server_task_result_slot_save_load() = default; }; struct server_task_result_slot_erase : server_task_result { @@ -480,6 +491,8 @@ struct server_task_result_slot_erase : server_task_result { static server_task_result_slot_erase from_ptr(std::unique_ptr & result_ptr) { return copy_cast_ptr(server_task_result_slot_erase, result_ptr); } + + virtual ~server_task_result_slot_erase() = default; }; struct server_task_result_apply_lora : server_task_result { From 3b41ad53a3223fa7ace9d1323191f5654fd498e4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 19:26:36 +0100 Subject: [PATCH 06/19] fix index --- examples/server/server.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d299d7274e91e..60947a17f6a77 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1169,6 +1169,7 @@ struct server_context { void send_partial_response(server_slot & slot, completion_token_output tkn) { server_task_result_cmpl_partial res; res.id = slot.id_task; + res.index = slot.index; res.n_decoded = slot.n_decoded; res.n_prompt_tokens = slot.n_prompt_tokens; res.content = tkn.text_to_send; @@ -1205,6 +1206,7 @@ struct server_context { server_task_result_cmpl_final res; res.id = slot.id_task; res.id_slot = slot.id; + res.index = slot.index; res.content = slot.generated_text; res.n_decoded = slot.n_decoded; @@ -1411,7 +1413,7 @@ struct server_context { || result_raw->type == RESULT_TYPE_EMBD || result_raw->type == RESULT_TYPE_RERANK ) { - auto result = T::from_ptr(result_raw); + T result = T::from_ptr(result_raw); const size_t idx = result.index; GGML_ASSERT(idx < results.size() && "index out of range"); results[idx] = result; From 12610861639c30201bf6071fc951fd5954bb2b2e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 19:36:37 +0100 Subject: [PATCH 07/19] minor 
style fix --- examples/server/server.cpp | 23 +++++++++++++++-------- examples/server/server.hpp | 9 +++++---- 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 60947a17f6a77..469663b2e2573 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1170,12 +1170,15 @@ struct server_context { server_task_result_cmpl_partial res; res.id = slot.id_task; res.index = slot.index; + res.content = tkn.text_to_send; + + res.truncated = slot.truncated; res.n_decoded = slot.n_decoded; res.n_prompt_tokens = slot.n_prompt_tokens; - res.content = tkn.text_to_send; + res.stop = slot.stop; - res.truncated = slot.truncated; + // populate res.probs_output if (slot.params.sampling.n_probs > 0) { const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); @@ -1206,20 +1209,22 @@ struct server_context { server_task_result_cmpl_final res; res.id = slot.id_task; res.id_slot = slot.id; + res.index = slot.index; res.content = slot.generated_text; + res.timings = slot.get_timings(); + res.model_alias = slot.oaicompat_model; + res.prompt = common_detokenize(ctx, slot.prompt_tokens, true); + res.truncated = slot.truncated; res.n_decoded = slot.n_decoded; res.n_prompt_tokens = slot.n_prompt_tokens; - res.has_new_line = slot.has_new_line; res.n_tokens_cached = slot.n_past; - res.content = slot.generated_text; + res.has_new_line = slot.has_new_line; + res.stopping_word = slot.stopping_word; res.stop = slot.stop; - res.truncated = slot.truncated; - res.timings = slot.get_timings(); - - res.generation_params = slot.params; // copy the parameters + // populate res.probs_output if (slot.params.sampling.n_probs > 0) { if (!slot.params.stream && slot.stop == STOP_TYPE_WORD) { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); @@ -1235,6 +1240,8 @@ struct server_context { } } + res.generation_params = slot.params; // copy the parameters + queue_results.send(res); } diff --git a/examples/server/server.hpp b/examples/server/server.hpp index e9c94fa56484c..1e65614f62ac7 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -237,7 +237,6 @@ struct server_task_result_cmpl_final : server_task_result { int index = 0; std::string content; bool stream; - bool timings_per_token; result_timings timings; std::string model_alias; std::string prompt; @@ -245,10 +244,11 @@ struct server_task_result_cmpl_final : server_task_result { bool truncated; int32_t n_decoded; int32_t n_prompt_tokens; - int32_t has_new_line; - int32_t stopping_word; int32_t n_tokens_cached; + int32_t has_new_line; + std::string stopping_word; stop_type stop = STOP_TYPE_NONE; + std::vector probs_output; slot_params generation_params; @@ -291,6 +291,7 @@ struct server_task_result_cmpl_partial : server_task_result { int32_t n_prompt_tokens; stop_type stop = STOP_TYPE_NONE; + std::vector probs_output; result_timings timings; @@ -346,7 +347,7 @@ struct server_task_result_embd : server_task_result { struct server_task_result_rerank : server_task_result { server_task_result_rerank() : server_task_result(RESULT_TYPE_RERANK) {} int index = 0; - float score; + float score = -1e6; json to_json() { return json { From eaa12887da2c73d4ca50170eee112d4c847ba4c7 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 19:52:28 +0100 Subject: [PATCH 08/19] add std::move --- examples/server/server.cpp | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 469663b2e2573..9057c0a4c5d0d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1423,7 +1423,7 @@ struct server_context { T result = T::from_ptr(result_raw); const size_t idx = result.index; GGML_ASSERT(idx < results.size() && "index out of range"); - results[idx] = result; + results[idx] = std::move(result); } else { GGML_ASSERT(false && "unexpected result type"); } From cb666718b1d4fe94de819f4888035d53b73b4133 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Wed, 4 Dec 2024 23:53:25 +0100 Subject: [PATCH 09/19] refactor handle_completions_generic --- examples/server/server.cpp | 102 ++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 9057c0a4c5d0d..0ab09db22934c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2716,7 +2716,16 @@ int main(int argc, char ** argv) { res_ok(res, {{ "success", true }}); }; - const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_inf_type inf_type, json & data, httplib::Response & res) { + // handle completion-like requests (completion, chat, infill) + // we can optionally provide a custom format for partial results and final results + const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok]( + server_task_inf_type inf_type, + json & data, + httplib::Response & res, + const std::function(server_task_result_cmpl_partial&)> & format_partial = nullptr, + const std::function&)> & format_final = nullptr, + // wether to send [DONE] event after completion (required for OAI-compat) + bool send_done_event = false) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2731,7 +2740,9 @@ int main(int argc, char ** argv) { if (!stream) { ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - if (results.size() == 1) { + if (format_final) { + res_ok(res, format_final(results)); + } else if (results.size() == 1) { // single result res_ok(res, results[0].to_json()); } else { @@ -2748,12 +2759,25 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server](size_t, httplib::DataSink & sink) { + const auto chunked_content_provider = [task_ids, &ctx_server, format_partial = std::move(format_partial), send_done_event](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { - return server_sent_event(sink, "data", result.to_json()); + if (format_partial) { + for (const auto & res : format_partial(result)) { + if (!server_sent_event(sink, "data", res)) { + return false; + } + } + return true; + } else { + return server_sent_event(sink, "data", result.to_json()); + } }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); + if (send_done_event) { + static const std::string ev_done = "data: [DONE]\n\n"; + sink.write(ev_done.data(), ev_done.size()); + } sink.done(); return false; }; @@ -2768,7 +2792,13 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res); + return handle_completions_generic( + SERVER_TASK_INF_TYPE_COMPLETION, + data, + res, + // TODO: support OAI-compat response via format_partial and format_final + /* format_partial */ nullptr, + /* format_final */ nullptr); }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -2821,8 +2851,7 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); }; - // TODO: maybe merge this function with "handle_completions_generic" - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &res_ok, verbose](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic, verbose](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2830,53 +2859,20 @@ int main(int argc, char ** argv) { json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - std::vector tasks = ctx_server.create_tasks_inference(data, SERVER_TASK_INF_TYPE_COMPLETION); - ctx_server.queue_results.add_waiting_tasks(tasks); - ctx_server.queue_tasks.post(tasks); - - bool stream = json_value(data, "stream", false); - const auto task_ids = server_task::get_list_id(tasks); const auto completion_id = gen_chatcmplid(); - - if (!stream) { - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - // multitask is never support in chat completion, there is only one result - json result_oai = format_final_response_oaicompat(data, results[0], completion_id, /*.streaming =*/ false, verbose); - res_ok(res, result_oai); - }, [&](const json & error_data) { - res_error(res, error_data); - }); - - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - } else { - std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - const auto chunked_content_provider = [task_ids, &ctx_server, completion_id, model_name](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { - std::vector result_array = format_partial_response_oaicompat(model_name, result, completion_id); - for (auto & event_data : result_array) { - if (event_data.empty()) { - continue; // skip the stop token - } - if (!server_sent_event(sink, "data", event_data)) { - return false; // connection is closed - } - } - return true; // ok - }, [&](const json & error_data) { - server_sent_event(sink, "error", error_data); - }); - static const std::string ev_done = "data: [DONE]\n\n"; - sink.write(ev_done.data(), ev_done.size()); - sink.done(); - return true; - }; - - auto on_complete = [task_ids, &ctx_server] (bool) { - ctx_server.queue_results.remove_waiting_task_ids(task_ids); - }; - - res.set_chunked_content_provider("text/event-stream", chunked_content_provider, on_complete); - } + std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + return handle_completions_generic( + SERVER_TASK_INF_TYPE_COMPLETION, + data, + res, + /* format_partial */ [data, model_name, completion_id](server_task_result_cmpl_partial & result) { + return format_partial_response_oaicompat(model_name, result, completion_id); + }, + /* format_final */ [data, verbose, model_name](std::vector & results) { + return format_final_response_oaicompat(data, results[0], model_name, false, verbose); + }, + /* send_done_event */ true); }; const auto handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { From 8ab173c865e76a4078c9ea461589e15e6e74c631 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 14:44:06 +0100 Subject: [PATCH 10/19] add virtual functions --- examples/server/server.cpp | 437 +++++++++++++++++-------------------- examples/server/server.hpp | 323 ++++++++++++++++++++------- examples/server/utils.hpp | 189 ---------------- 3 files changed, 440 insertions(+), 509 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0ab09db22934c..c8cb48b15c6ba 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -84,9 +84,6 @@ struct server_slot { bool truncated = false; stop_type stop; - bool oaicompat = false; - - std::string oaicompat_model; 
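// ---------------------------------------------------------------------------
// [Editor's note] Illustrative sketch only — not part of this patch. It shows,
// in a heavily simplified and self-contained form, the pattern this series is
// moving toward: results become heap-allocated objects derived from a common
// base with virtual to_json()/is_error(), travel as std::unique_ptr, and the
// consumer inspects the concrete type with dynamic_cast instead of a manual
// type enum. All demo_* names below are invented for this sketch and do not
// exist in the server code; only the nlohmann::ordered_json usage and the
// unique_ptr/dynamic_cast idiom mirror what the patch does.
#include <iostream>
#include <memory>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

struct demo_result {
    int id = -1;
    virtual json to_json() = 0;
    virtual bool is_error() { return false; }
    virtual ~demo_result() = default;
};

struct demo_result_final : demo_result {
    std::string content;
    json to_json() override { return json {{"id", id}, {"content", content}}; }
};

struct demo_result_error : demo_result {
    std::string message;
    bool is_error() override { return true; }
    json to_json() override { return json {{"id", id}, {"error", message}}; }
};

int main() {
    std::vector<std::unique_ptr<demo_result>> queue;

    auto ok = std::make_unique<demo_result_final>();
    ok->id      = 1;
    ok->content = "hello";
    queue.push_back(std::move(ok)); // ownership moves into the queue

    auto err = std::make_unique<demo_result_error>();
    err->id      = 2;
    err->message = "something went wrong";
    queue.push_back(std::move(err));

    for (auto & res : queue) {
        if (res->is_error()) {
            std::cout << "error: " << res->to_json().dump() << "\n";
        } else if (auto * fin = dynamic_cast<demo_result_final *>(res.get())) {
            std::cout << "final: " << fin->to_json().dump() << "\n";
        }
    }
}
// Design choice illustrated: because each result type serializes itself via a
// virtual to_json(), the HTTP layer no longer needs to know the JSON shape of
// every task, and the result_type enum plus the copy_cast_ptr casts can later
// be dropped in favor of plain dynamic_cast checks.
// ---------------------------------------------------------------------------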
std::string stopping_word; // sampling @@ -494,17 +491,15 @@ struct server_response { } // Send a new result to a waiting id_task - template - void send(T & result) { - static_assert(std::is_base_of::value, "T must be derived from server_task_result"); - SRV_DBG("sending result for task id = %d\n", result.id); + void send(task_result_ptr && result) { + SRV_DBG("sending result for task id = %d\n", result->id); std::unique_lock lock(mutex_results); for (const auto & id_task : waiting_task_ids) { - if (result.id == id_task) { - SRV_DBG("task id = %d pushed to result queue\n", result.id); + if (result->id == id_task) { + SRV_DBG("task id = %d pushed to result queue\n", result->id); - queue_results.push_back(std::make_unique(std::move(result))); + queue_results.emplace_back(std::move(result)); condition_results.notify_all(); return; } @@ -791,13 +786,16 @@ struct server_context { const auto & data = task.data; if (data.count("__oaicompat") != 0) { - slot.oaicompat = true; - slot.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot.params.oaicompat = true; + slot.params.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); } else { - slot.oaicompat = false; - slot.oaicompat_model = ""; + slot.params.oaicompat = false; } + + // enabling this will output extra debug information in the HTTP responses from the server + slot.params.verbose = params_base.verbosity > 9; slot.params.timings_per_token = json_value(data, "timings_per_token", false); slot.params.stream = json_value(data, "stream", false); @@ -1158,25 +1156,29 @@ struct server_context { void send_error(const int id_task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { SRV_ERR("task id = %d, error: %s\n", id_task, error.c_str()); - server_task_result_error res; - res.id = id_task; - res.err_type = type; - res.err_msg = error; + auto res = std::make_unique(); + res->id = id_task; + res->err_type = type; + res->err_msg = error; - queue_results.send(res); + queue_results.send(std::move(res)); } void send_partial_response(server_slot & slot, completion_token_output tkn) { - server_task_result_cmpl_partial res; - res.id = slot.id_task; - res.index = slot.index; - res.content = tkn.text_to_send; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; + res->content = tkn.text_to_send; + + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; - res.truncated = slot.truncated; - res.n_decoded = slot.n_decoded; - res.n_prompt_tokens = slot.n_prompt_tokens; + res->stop = slot.stop; - res.stop = slot.stop; + res->oaicompat_model = slot.params.oaicompat_model; + res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + res->verbose = slot.params.verbose; // populate res.probs_output if (slot.params.sampling.n_probs > 0) { @@ -1186,7 +1188,7 @@ struct server_context { std::vector probs_output; if (probs_pos < probs_stop_pos) { - res.probs_output = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); } @@ -1194,10 +1196,10 @@ struct server_context { // populate timings if this is final response or timings_per_token is enabled if (slot.stop != STOP_TYPE_NONE || slot.params.timings_per_token) { - res.timings = slot.get_timings(); + res->timings = slot.get_timings(); } - queue_results.send(res); + 
queue_results.send(std::move(res)); } void send_final_response(server_slot & slot) { @@ -1206,23 +1208,26 @@ struct server_context { return send_partial_response(slot, {0, "", {}}); } - server_task_result_cmpl_final res; - res.id = slot.id_task; - res.id_slot = slot.id; + auto res = std::make_unique(); + res->id = slot.id_task; + res->id_slot = slot.id; - res.index = slot.index; - res.content = slot.generated_text; - res.timings = slot.get_timings(); - res.model_alias = slot.oaicompat_model; - res.prompt = common_detokenize(ctx, slot.prompt_tokens, true); + res->index = slot.index; + res->content = slot.generated_text; + res->timings = slot.get_timings(); + res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); - res.truncated = slot.truncated; - res.n_decoded = slot.n_decoded; - res.n_prompt_tokens = slot.n_prompt_tokens; - res.n_tokens_cached = slot.n_past; - res.has_new_line = slot.has_new_line; - res.stopping_word = slot.stopping_word; - res.stop = slot.stop; + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + res->n_tokens_cached = slot.n_past; + res->has_new_line = slot.has_new_line; + res->stopping_word = slot.stopping_word; + res->stop = slot.stop; + + res->oaicompat_model = slot.params.oaicompat_model; + res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; + res->verbose = slot.params.verbose; // populate res.probs_output if (slot.params.sampling.n_probs > 0) { @@ -1230,25 +1235,25 @@ struct server_context { const llama_tokens stop_word_toks = common_tokenize(ctx, slot.stopping_word, false); size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size()); - res.probs_output = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end() - safe_offset); } else { - res.probs_output = std::vector( + res->probs_output = std::vector( slot.generated_token_probs.begin(), slot.generated_token_probs.end()); } } - res.generation_params = slot.params; // copy the parameters + res->generation_params = slot.params; // copy the parameters - queue_results.send(res); + queue_results.send(std::move(res)); } void send_embedding(const server_slot & slot, const llama_batch & batch) { - server_task_result_embd res; - res.id = slot.id_task; - res.index = slot.index; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; const int n_embd = llama_n_embd(model); @@ -1267,23 +1272,23 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.embedding = std::vector(n_embd, 0.0f); + res->embedding = std::vector(n_embd, 0.0f); continue; } common_embd_normalize(embd, embd_res.data(), n_embd); - res.embedding = embd_res; + res->embedding = embd_res; } SLT_DBG(slot, "%s", "sending embeddings\n"); - queue_results.send(res); + queue_results.send(std::move(res)); } void send_rerank(const server_slot & slot, const llama_batch & batch) { - server_task_result_rerank res; - res.id = slot.id_task; - res.index = slot.index; + auto res = std::make_unique(); + res->id = slot.id_task; + res->index = slot.index; for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { @@ -1298,16 +1303,16 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res.score = -1e6; + res->score = -1e6; continue; } - 
res.score = embd[0]; + res->score = embd[0]; } - SLT_DBG(slot, "sending rerank result, res.score = %f\n", res.score); + SLT_DBG(slot, "sending rerank result, res.score = %f\n", res->score); - queue_results.send(res); + queue_results.send(std::move(res)); } // @@ -1398,35 +1403,28 @@ struct server_context { } // receive the results from task(s) created by create_tasks_inference - template void receive_multi_results( const std::unordered_set & id_tasks, - const std::function&)> & result_handler, + const std::function&)> & result_handler, const std::function & error_handler) { - static_assert(std::is_base_of::value, "T must be derived from server_task_result"); - std::vector results(id_tasks.size()); + std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { - task_result_ptr result_raw = queue_results.recv(id_tasks); + task_result_ptr result = queue_results.recv(id_tasks); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - error_handler(format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + error_handler(result->to_json()); cancel_tasks(id_tasks); return; } - if ( - result_raw->type == RESULT_TYPE_CMPL_FINAL - || result_raw->type == RESULT_TYPE_EMBD - || result_raw->type == RESULT_TYPE_RERANK - ) { - T result = T::from_ptr(result_raw); - const size_t idx = result.index; - GGML_ASSERT(idx < results.size() && "index out of range"); - results[idx] = std::move(result); - } else { - GGML_ASSERT(false && "unexpected result type"); - } + GGML_ASSERT( + dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + || dynamic_cast(result.get()) != nullptr + ); + const size_t idx = result->get_index(); + GGML_ASSERT(idx < results.size() && "index out of range"); + results[idx] = std::move(result); } result_handler(results); } @@ -1434,29 +1432,25 @@ struct server_context { // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( const std::unordered_set & id_tasks, const - std::function & result_handler, const + std::function & result_handler, const std::function & error_handler) { size_t n_finished = 0; while (true) { - task_result_ptr result_raw = queue_results.recv(id_tasks); + task_result_ptr result = queue_results.recv(id_tasks); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - error_handler(format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + error_handler(result->to_json()); cancel_tasks(id_tasks); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_CMPL_PARTIAL); - auto result = server_task_result_cmpl_partial::from_ptr(result_raw); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); if (!result_handler(result)) { cancel_tasks(id_tasks); break; } - SRV_ERR("received partial result, %s\n", result.to_json().dump().c_str()); - - if (result.stop != STOP_TYPE_NONE) { + if (result->is_stop()) { if (++n_finished == id_tasks.size()) { break; } @@ -1546,33 +1540,33 @@ struct server_context { } SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots); - server_task_result_metrics res; - res.id = task.id; - res.n_idle_slots = n_idle_slots; - res.n_processing_slots = n_processing_slots; - res.n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); - res.t_start = metrics.t_start; + auto res = std::make_unique(); + res->id = task.id; + res->n_idle_slots = 
n_idle_slots; + res->n_processing_slots = n_processing_slots; + res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size(); + res->t_start = metrics.t_start; - res.kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); - res.kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); + res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx); + res->kv_cache_used_cells = llama_get_kv_cache_used_cells(ctx); - res.n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; - res.t_prompt_processing_total = metrics.t_prompt_processing_total; - res.n_tokens_predicted_total = metrics.n_tokens_predicted_total; - res.t_tokens_generation_total = metrics.t_tokens_generation_total; + res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total; + res->t_prompt_processing_total = metrics.t_prompt_processing_total; + res->n_tokens_predicted_total = metrics.n_tokens_predicted_total; + res->t_tokens_generation_total = metrics.t_tokens_generation_total; - res.n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; - res.t_prompt_processing = metrics.t_prompt_processing; - res.n_tokens_predicted = metrics.n_tokens_predicted; - res.t_tokens_generation = metrics.t_tokens_generation; + res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed; + res->t_prompt_processing = metrics.t_prompt_processing; + res->n_tokens_predicted = metrics.n_tokens_predicted; + res->t_tokens_generation = metrics.t_tokens_generation; - res.n_decode_total = metrics.n_decode_total; - res.n_busy_slots_total = metrics.n_busy_slots_total; + res->n_decode_total = metrics.n_decode_total; + res->n_busy_slots_total = metrics.n_busy_slots_total; if (json_value(task.data, "reset_bucket", false)) { metrics.reset_bucket(); } - queue_results.send(res); + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_SAVE: { @@ -1600,15 +1594,15 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_save_ms = (t_end - t_start) / 1000.0; - server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = true; - result.n_tokens = token_count; - result.n_bytes = nwrite; - result.t_ms = t_save_ms; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = true; + res->n_tokens = token_count; + res->n_bytes = nwrite; + res->t_ms = t_save_ms; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_RESTORE: { @@ -1643,15 +1637,15 @@ struct server_context { const int64_t t_end = ggml_time_us(); const double t_restore_ms = (t_end - t_start) / 1000.0; - server_task_result_slot_save_load result; - result.id = task.id; - result.id_slot = id_slot; - result.filename = filename; - result.is_save = false; - result.n_tokens = token_count; - result.n_bytes = nread; - result.t_ms = t_restore_ms; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->filename = filename; + res->is_save = false; + res->n_tokens = token_count; + res->n_bytes = nread; + res->t_ms = t_restore_ms; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SLOT_ERASE: { @@ -1673,18 +1667,18 @@ struct server_context { llama_kv_cache_seq_rm(ctx, slot->id, -1, -1); slot->cache_tokens.clear(); - server_task_result_slot_erase result; - result.id = task.id; - result.id_slot = id_slot; - result.n_erased = n_erased; - 
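// ---------------------------------------------------------------------------
// [Editor's note] Another illustrative sketch, not part of the patch: a
// minimal, self-contained queue that transports results as std::unique_ptr,
// mimicking how the refactored handlers now build results with
// std::make_unique and hand them off via queue_results.send(std::move(res)).
// The names demo_payload and demo_result_queue are invented for this example;
// the real server_response additionally tracks waiting task ids and serves
// multiple consumers.
#include <condition_variable>
#include <deque>
#include <iostream>
#include <memory>
#include <mutex>
#include <string>

struct demo_payload {
    int id = -1;
    std::string text;
};

struct demo_result_queue {
    std::mutex mutex;
    std::condition_variable cond;
    std::deque<std::unique_ptr<demo_payload>> results;

    // takes ownership of the result; no copy of the payload is made
    void send(std::unique_ptr<demo_payload> && res) {
        std::unique_lock<std::mutex> lock(mutex);
        results.emplace_back(std::move(res));
        cond.notify_one();
    }

    // blocks until a result is available, then hands ownership to the caller
    std::unique_ptr<demo_payload> recv() {
        std::unique_lock<std::mutex> lock(mutex);
        cond.wait(lock, [&] { return !results.empty(); });
        auto res = std::move(results.front());
        results.pop_front();
        return res;
    }
};

int main() {
    demo_result_queue queue;

    auto res = std::make_unique<demo_payload>();
    res->id   = 42;
    res->text = "slot erased";
    queue.send(std::move(res)); // ownership is transferred into the queue

    auto received = queue.recv();
    std::cout << received->id << ": " << received->text << "\n";
}
// Storing heterogeneous results behind a single owning pointer type is what
// lets the consumer side either dispatch on the concrete type (dynamic_cast)
// or just call the virtual to_json(), as the surrounding hunks do.
// ---------------------------------------------------------------------------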
queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + res->id_slot = id_slot; + res->n_erased = n_erased; + queue_results.send(std::move(res)); } break; case SERVER_TASK_TYPE_SET_LORA: { common_lora_adapters_apply(ctx, loras); - server_task_result_apply_lora result; - result.id = task.id; - queue_results.send(result); + auto res = std::make_unique(); + res->id = task.id; + queue_results.send(std::move(res)); } break; } } @@ -2250,10 +2244,6 @@ int main(int argc, char ** argv) { common_init(); - // enabling this will output extra debug information in the HTTP responses from the server - // see format_final_response_oaicompat() - const bool verbose = params.verbosity > 9; - // struct that contains llama context and inference server_context ctx_server; @@ -2445,26 +2435,27 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - task_result_ptr result_raw = ctx_server.queue_results.recv(task.id); + task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); - if (result_raw->type != RESULT_TYPE_METRICS) { - SRV_ERR("Unexpected result type: %d\n", result_raw->type); - res_error(res, format_error_response("Unexpected result type", ERROR_TYPE_SERVER)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - auto result = server_task_result_metrics::from_ptr(result_raw); + // TODO: get rid of this dynamic_cast + auto res_metrics = dynamic_cast(result.get()); + GGML_ASSERT(res_metrics != nullptr); // optionally return "fail_on_no_slot" error if (req.has_param("fail_on_no_slot")) { - if (result.n_idle_slots == 0) { + if (res_metrics->n_idle_slots == 0) { res_error(res, format_error_response("no slot available", ERROR_TYPE_UNAVAILABLE)); return; } } - res_ok(res, result.slots_data); + res_ok(res, res_metrics->slots_data); }; const auto handle_metrics = [&](const httplib::Request &, httplib::Response & res) { @@ -2484,68 +2475,69 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - task_result_ptr result_raw = ctx_server.queue_results.recv(task.id); + task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_METRICS); - auto result = server_task_result_metrics::from_ptr(result_raw); + // TODO: get rid of this dynamic_cast + auto res_metrics = dynamic_cast(result.get()); + GGML_ASSERT(res_metrics != nullptr); // metrics definition: https://prometheus.io/docs/practices/naming/#metric-names json all_metrics_def = json { {"counter", {{ {"name", "prompt_tokens_total"}, {"help", "Number of prompt tokens processed."}, - {"value", (uint64_t) result.n_prompt_tokens_processed_total} + {"value", (uint64_t) res_metrics->n_prompt_tokens_processed_total} }, { {"name", "prompt_seconds_total"}, {"help", "Prompt process time"}, - {"value", (uint64_t) result.t_prompt_processing_total / 1.e3} + {"value", (uint64_t) res_metrics->t_prompt_processing_total / 1.e3} }, { {"name", "tokens_predicted_total"}, {"help", "Number of generation tokens processed."}, - {"value", (uint64_t) result.n_tokens_predicted_total} + 
{"value", (uint64_t) res_metrics->n_tokens_predicted_total} }, { {"name", "tokens_predicted_seconds_total"}, {"help", "Predict process time"}, - {"value", (uint64_t) result.t_tokens_generation_total / 1.e3} + {"value", (uint64_t) res_metrics->t_tokens_generation_total / 1.e3} }, { {"name", "n_decode_total"}, {"help", "Total number of llama_decode() calls"}, - {"value", result.n_decode_total} + {"value", res_metrics->n_decode_total} }, { {"name", "n_busy_slots_per_decode"}, {"help", "Average number of busy slots per llama_decode() call"}, - {"value", (float) result.n_busy_slots_total / (float) result.n_decode_total} + {"value", (float) res_metrics->n_busy_slots_total / (float) res_metrics->n_decode_total} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, {"help", "Average prompt throughput in tokens/s."}, - {"value", result.n_prompt_tokens_processed ? 1.e3 / result.t_prompt_processing * result.n_prompt_tokens_processed : 0.} + {"value", res_metrics->n_prompt_tokens_processed ? 1.e3 / res_metrics->t_prompt_processing * res_metrics->n_prompt_tokens_processed : 0.} },{ {"name", "predicted_tokens_seconds"}, {"help", "Average generation throughput in tokens/s."}, - {"value", result.n_tokens_predicted ? 1.e3 / result.t_tokens_generation * result.n_tokens_predicted : 0.} + {"value", res_metrics->n_tokens_predicted ? 1.e3 / res_metrics->t_tokens_generation * res_metrics->n_tokens_predicted : 0.} },{ {"name", "kv_cache_usage_ratio"}, {"help", "KV-cache usage. 1 means 100 percent usage."}, - {"value", 1. * result.kv_cache_used_cells / params.n_ctx} + {"value", 1. * res_metrics->kv_cache_used_cells / params.n_ctx} },{ {"name", "kv_cache_tokens"}, {"help", "KV-cache tokens."}, - {"value", (uint64_t) result.kv_cache_tokens_count} + {"value", (uint64_t) res_metrics->kv_cache_tokens_count} },{ {"name", "requests_processing"}, {"help", "Number of request processing."}, - {"value", (uint64_t) result.n_processing_slots} + {"value", (uint64_t) res_metrics->n_processing_slots} },{ {"name", "requests_deferred"}, {"help", "Number of request deferred."}, - {"value", (uint64_t) result.n_tasks_deferred} + {"value", (uint64_t) res_metrics->n_tasks_deferred} }}} }; @@ -2566,7 +2558,7 @@ int main(int argc, char ** argv) { } } - res.set_header("Process-Start-Time-Unix", std::to_string(result.t_start)); + res.set_header("Process-Start-Time-Unix", std::to_string(res_metrics->t_start)); res.set_content(prometheus.str(), "text/plain; version=0.0.4"); res.status = 200; // HTTP OK @@ -2592,18 +2584,15 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); + task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_SAVE_LOAD); - auto result = server_task_result_slot_save_load::from_ptr(result_raw); - res_ok(res, result.to_json()); + res_ok(res, result->to_json()); }; const auto handle_slots_restore = [&ctx_server, &res_error, &res_ok, ¶ms](const httplib::Request & req, httplib::Response & res, int id_slot) { @@ -2626,18 +2615,16 @@ int main(int argc, char ** argv) { const int id_task = 
ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); + task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_SAVE_LOAD); - auto result = server_task_result_slot_save_load::from_ptr(result_raw); - res_ok(res, result.to_json()); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; const auto handle_slots_erase = [&ctx_server, &res_error, &res_ok](const httplib::Request & /* req */, httplib::Response & res, int id_slot) { @@ -2650,18 +2637,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); + task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_SLOT_ERASE); - auto result = server_task_result_slot_erase::from_ptr(result_raw); - res_ok(res, result.to_json()); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; const auto handle_slots_action = [¶ms, &res_error, &handle_slots_save, &handle_slots_restore, &handle_slots_erase](const httplib::Request & req, httplib::Response & res) { @@ -2722,15 +2707,13 @@ int main(int argc, char ** argv) { server_task_inf_type inf_type, json & data, httplib::Response & res, - const std::function(server_task_result_cmpl_partial&)> & format_partial = nullptr, - const std::function&)> & format_final = nullptr, - // wether to send [DONE] event after completion (required for OAI-compat) - bool send_done_event = false) { + bool oai_compat = false) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; } + data["completion_id"] = gen_chatcmplid(); std::vector tasks = ctx_server.create_tasks_inference(data, inf_type); ctx_server.queue_results.add_waiting_tasks(tasks); ctx_server.queue_tasks.post(tasks); @@ -2739,17 +2722,15 @@ int main(int argc, char ** argv) { const auto task_ids = server_task::get_list_id(tasks); if (!stream) { - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { - if (format_final) { - res_ok(res, format_final(results)); - } else if (results.size() == 1) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + if (results.size() == 1) { // single result - res_ok(res, results[0].to_json()); + res_ok(res, oai_compat ? results[0]->to_json_oai_compat() : results[0]->to_json()); } else { // multiple results (multitask) json arr = json::array(); for (auto & res : results) { - arr.push_back(res.to_json()); + arr.push_back(oai_compat ? 
res->to_json_oai_compat() : res->to_json()); } res_ok(res, arr); } @@ -2759,22 +2740,23 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { - const auto chunked_content_provider = [task_ids, &ctx_server, format_partial = std::move(format_partial), send_done_event](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_cmpl_partial & result) -> bool { - if (format_partial) { - for (const auto & res : format_partial(result)) { + const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { + ctx_server.receive_cmpl_results_stream(task_ids, [&](task_result_ptr & result) -> bool { + json res_json = oai_compat ? result->to_json_oai_compat() : result->to_json(); + if (res_json.is_array()) { + for (const auto & res : res_json) { if (!server_sent_event(sink, "data", res)) { return false; } } return true; } else { - return server_sent_event(sink, "data", result.to_json()); + return server_sent_event(sink, "data", res_json); } }, [&](const json & error_data) { server_sent_event(sink, "error", error_data); }); - if (send_done_event) { + if (oai_compat) { static const std::string ev_done = "data: [DONE]\n\n"; sink.write(ev_done.data(), ev_done.size()); } @@ -2792,13 +2774,7 @@ int main(int argc, char ** argv) { const auto handle_completions = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) { json data = json::parse(req.body); - return handle_completions_generic( - SERVER_TASK_INF_TYPE_COMPLETION, - data, - res, - // TODO: support OAI-compat response via format_partial and format_final - /* format_partial */ nullptr, - /* format_final */ nullptr); + return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res); }; const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { @@ -2851,7 +2827,7 @@ int main(int argc, char ** argv) { return handle_completions_generic(SERVER_TASK_INF_TYPE_INFILL, data, res); }; - const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic, verbose](const httplib::Request & req, httplib::Response & res) { + const auto handle_chat_completions = [&ctx_server, ¶ms, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) { if (ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support completions. 
Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED)); return; @@ -2859,20 +2835,9 @@ int main(int argc, char ** argv) { json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - const auto completion_id = gen_chatcmplid(); std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - return handle_completions_generic( - SERVER_TASK_INF_TYPE_COMPLETION, - data, - res, - /* format_partial */ [data, model_name, completion_id](server_task_result_cmpl_partial & result) { - return format_partial_response_oaicompat(model_name, result, completion_id); - }, - /* format_final */ [data, verbose, model_name](std::vector & results) { - return format_final_response_oaicompat(data, results[0], model_name, false, verbose); - }, - /* send_done_event */ true); + return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); }; const auto handle_models = [¶ms, &ctx_server](const httplib::Request &, httplib::Response & res) { @@ -2973,10 +2938,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { for (auto & res : results) { - GGML_ASSERT(res.type == RESULT_TYPE_EMBD); - responses.push_back(res.to_json()); + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3052,10 +3017,10 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { for (auto & res : results) { - GGML_ASSERT(res.type == RESULT_TYPE_RERANK); - responses.push_back(res.to_json()); + GGML_ASSERT(dynamic_cast(res.get()) != nullptr); + responses.push_back(res->to_json()); } }, [&](const json & error_data) { res_error(res, error_data); @@ -3111,18 +3076,16 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result_raw = ctx_server.queue_results.recv(id_task); + task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); - if (result_raw->type == RESULT_TYPE_ERROR) { - auto result = server_task_result_error::from_ptr(result_raw); - res_error(res, format_error_response(result.err_msg, result.err_type)); + if (result->is_error()) { + res_error(res, result->to_json()); return; } - GGML_ASSERT(result_raw->type == RESULT_TYPE_APPLY_LORA); - auto result = server_task_result_apply_lora::from_ptr(result_raw); - res_ok(res, result.to_json()); + GGML_ASSERT(dynamic_cast(result.get()) != nullptr); + res_ok(res, result->to_json()); }; // diff --git a/examples/server/server.hpp b/examples/server/server.hpp index 1e65614f62ac7..201f154560e53 100644 --- a/examples/server/server.hpp +++ b/examples/server/server.hpp @@ -15,9 +15,6 @@ using json = nlohmann::ordered_json; -// cast a shared_ptr to a specific type using copy constructor -#define copy_cast_ptr(TYPEOUT, ptr) *(static_cast(ptr.get())); - enum stop_type { STOP_TYPE_NONE, STOP_TYPE_EOS, @@ -68,19 +65,6 @@ enum error_type { ERROR_TYPE_NOT_SUPPORTED, // custom error }; -enum result_type { - 
RESULT_TYPE_CMPL_FINAL, - RESULT_TYPE_CMPL_PARTIAL, - RESULT_TYPE_EMBD, - RESULT_TYPE_RERANK, - RESULT_TYPE_METRICS, - RESULT_TYPE_SLOT_SAVE_LOAD, - RESULT_TYPE_SLOT_ERASE, - RESULT_TYPE_APPLY_LORA, - RESULT_TYPE_ERROR, - RESULT_TYPE_UNKNOWN, // will throw an error -}; - struct server_task { int id = -1; // to be filled by server_queue int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL @@ -126,6 +110,12 @@ struct slot_params { uint32_t seed_cur; bool can_speculative; + // OAI-compat fields + bool oaicompat = false; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + json to_json() { std::vector samplers; samplers.reserve(sampling.samplers.size()); @@ -205,11 +195,24 @@ struct result_timings { }; struct server_task_result { - result_type type = RESULT_TYPE_UNKNOWN; int id = -1; int id_slot = -1; - server_task_result() = default; - server_task_result(result_type type) : type(type) {} + virtual bool is_error() { + // only used by server_task_result_error + return false; + } + virtual bool is_stop() { + // only used by server_task_result_cmpl_partial + return false; + } + virtual int get_index() { + return -1; + } + virtual json to_json() = 0; + virtual json to_json_oai_compat() { + // used by server_task_result_cmpl_final and server_task_result_cmpl_partial + return json(); + } virtual ~server_task_result() = default; }; @@ -233,12 +236,10 @@ struct completion_token_output { }; struct server_task_result_cmpl_final : server_task_result { - server_task_result_cmpl_final() : server_task_result(RESULT_TYPE_CMPL_FINAL) {} int index = 0; std::string content; bool stream; result_timings timings; - std::string model_alias; std::string prompt; bool truncated; @@ -253,14 +254,23 @@ struct server_task_result_cmpl_final : server_task_result { slot_params generation_params; - json to_json() { + // OAI-compat fields + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { // non-OAI-compat JSON return json { {"index", index}, {"content", content}, {"id_slot", id_slot}, {"stop", true}, - {"model", model_alias}, + {"model", oaicompat_model}, {"tokens_predicted", n_decoded}, {"tokens_evaluated", n_prompt_tokens}, {"generation_settings", generation_params.to_json()}, @@ -274,15 +284,55 @@ struct server_task_result_cmpl_final : server_task_result { }; } - static server_task_result_cmpl_final from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_cmpl_final, result_ptr); - } + virtual json to_json_oai_compat() override { + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + + json choices = json::array({json{ + {"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{ + {"content", content}, + {"role", "assistant"} + } + }}}); + + std::time_t t = std::time(0); + + json res = json { + {"choices", choices}, + {"created", t}, + {"model", oaicompat_model}, + {"object", "chat.completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json(); + } + + // TODO: fix this + // if (result.contains("completion_probabilities")) { + // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); 
+ // } + + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } - virtual ~server_task_result_cmpl_final() = default; + return res; + } }; struct server_task_result_cmpl_partial : server_task_result { - server_task_result_cmpl_partial() : server_task_result(RESULT_TYPE_CMPL_PARTIAL) {} int index = 0; std::string content; @@ -295,7 +345,20 @@ struct server_task_result_cmpl_partial : server_task_result { std::vector probs_output; result_timings timings; - json to_json() { + // OAI-compat fields + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return stop != STOP_TYPE_NONE; + } + + virtual json to_json() override { bool is_stop = stop != STOP_TYPE_NONE; // non-OAI-compat JSON json res = json { @@ -317,67 +380,186 @@ struct server_task_result_cmpl_partial : server_task_result { return res; } - static server_task_result_cmpl_partial from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_cmpl_partial, result_ptr); - } + virtual json to_json_oai_compat() override { + bool first = n_decoded == 0; + + std::string finish_reason; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } else if (stop == STOP_TYPE_LIMIT) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; - virtual ~server_task_result_cmpl_partial() = default; + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. 
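// Illustrative only: for a streamed OAI-compatible request the handler ends up
// emitting an SSE sequence shaped roughly like this (field values invented,
// unrelated fields elided):
//
//     data: {"choices":[{"finish_reason":null,"index":0,"delta":{"role":"assistant"}}],"object":"chat.completion.chunk",...}
//     data: {"choices":[{"finish_reason":null,"index":0,"delta":{"content":"Hello"}}],"object":"chat.completion.chunk",...}
//     data: {"choices":[{"finish_reason":"stop","index":0,"delta":{}}],"usage":{...},"object":"chat.completion.chunk",...}
//     data: [DONE]
//
// The very first token may be split into two chunks (a role delta followed by a
// content delta), which is why this method can return an array with more than
// one chunk.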
+ if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"} + }; + + if (timings.prompt_n >= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + if (!finish_reason.empty()) { + ret.push_back({"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}); + } + + return std::vector({ret}); + } }; struct server_task_result_embd : server_task_result { - server_task_result_embd() : server_task_result(RESULT_TYPE_EMBD) {} - result_type type = RESULT_TYPE_EMBD; int index = 0; std::vector embedding; - json to_json() { + virtual int get_index() override { + return index; + } + + virtual json to_json() override { return json { {"index", index}, {"embedding", embedding}, }; } - - static server_task_result_embd from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_embd, result_ptr); - } - - virtual ~server_task_result_embd() = default; }; struct server_task_result_rerank : server_task_result { - server_task_result_rerank() : server_task_result(RESULT_TYPE_RERANK) {} int index = 0; float score = -1e6; - json to_json() { + virtual int get_index() override { + return index; + } + + virtual json to_json() override { return json { {"index", index}, {"score", score}, }; } +}; - static server_task_result_rerank from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_rerank, result_ptr); +// this function maybe used outside of server_task_result_error +static json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; } - - virtual ~server_task_result_rerank() = default; -}; + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; +} struct server_task_result_error : server_task_result { - server_task_result_error() : server_task_result(RESULT_TYPE_ERROR) {} int index = 0; error_type err_type = ERROR_TYPE_SERVER; std::string err_msg; - static server_task_result_error from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_error, result_ptr); + virtual bool is_error() override { + return true; } - virtual ~server_task_result_error() = default; + virtual json to_json() override { + return format_error_response(err_msg, err_type); + } }; struct server_task_result_metrics : server_task_result { - server_task_result_metrics() : server_task_result(RESULT_TYPE_METRICS) {} int n_idle_slots; int n_processing_slots; int n_tasks_deferred; @@ -404,7 +586,7 @@ struct server_task_result_metrics : server_task_result { // 
TODO: get rid of this json object and use to_json() instead json slots_data = json::array(); - json to_json() { + virtual json to_json() override { return json { { "idle", n_idle_slots }, { "processing", n_processing_slots }, @@ -430,16 +612,9 @@ struct server_task_result_metrics : server_task_result { { "slots", slots_data }, }; } - - static server_task_result_metrics from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_metrics, result_ptr); - } - - virtual ~server_task_result_metrics() = default; }; struct server_task_result_slot_save_load : server_task_result { - server_task_result_slot_save_load() : server_task_result(RESULT_TYPE_SLOT_SAVE_LOAD) {} std::string filename; bool is_save; // true = save, false = load @@ -447,7 +622,7 @@ struct server_task_result_slot_save_load : server_task_result { size_t n_bytes; double t_ms; - json to_json() { + virtual json to_json() override { if (is_save) { return json { { "id_slot", id_slot }, @@ -470,39 +645,21 @@ struct server_task_result_slot_save_load : server_task_result { }; } } - - static server_task_result_slot_save_load from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_slot_save_load, result_ptr); - } - - virtual ~server_task_result_slot_save_load() = default; }; struct server_task_result_slot_erase : server_task_result { - server_task_result_slot_erase() : server_task_result(RESULT_TYPE_SLOT_ERASE) {} size_t n_erased; - json to_json() { + virtual json to_json() override { return json { { "id_slot", id_slot }, { "n_erased", n_erased }, }; } - - static server_task_result_slot_erase from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_slot_erase, result_ptr); - } - - virtual ~server_task_result_slot_erase() = default; }; struct server_task_result_apply_lora : server_task_result { - server_task_result_apply_lora() : server_task_result(RESULT_TYPE_APPLY_LORA) {} - json to_json() { + virtual json to_json() override { return json {{ "success", true }}; } - - static server_task_result_apply_lora from_ptr(std::unique_ptr & result_ptr) { - return copy_cast_ptr(server_task_result_apply_lora, result_ptr); - } }; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 98a777192027c..8a8d9f8f7e894 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -583,155 +583,6 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_final_response_oaicompat( - const json & request, - server_task_result_cmpl_final & result, - const std::string & completion_id, - bool streaming = false, - bool verbose = false) { - std::string finish_reason = "length"; - if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", result.content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = json { - {"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", json { - {"completion_tokens", result.n_decoded}, - {"prompt_tokens", result.n_prompt_tokens}, - {"total_tokens", result.n_decoded + result.n_prompt_tokens} - }}, - {"id", completion_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = result.to_json(); - } - - // TODO: fix this - // if (result.contains("completion_probabilities")) { - // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - // } - - if (result.timings.prompt_n >= 0) { - res.push_back({"timings", result.timings.to_json()}); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate two responses -static std::vector format_partial_response_oaicompat( - std::string modelname, - server_task_result_cmpl_partial & result, - const std::string & completion_id) { - bool first = result.n_decoded == 0; - std::string content = result.content; - - std::string finish_reason; - if (result.stop == STOP_TYPE_WORD || result.stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } else if (result.stop == STOP_TYPE_LIMIT) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", completion_id}, - {"model", modelname}, - {"object", "chat.completion.chunk"} - }; - - if (result.timings.prompt_n >= 0) { - ret.push_back({"timings", result.timings.to_json()}); - } - - if (!finish_reason.empty()) { - ret.push_back({"usage", json { - {"completion_tokens", result.n_decoded}, - {"prompt_tokens", result.n_prompt_tokens}, - {"total_tokens", result.n_decoded + result.n_prompt_tokens} - }}); - } - - return std::vector({ret}); -} - static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { json data = json::array(); int i = 0; @@ -823,43 +674,3 @@ static json format_detokenized_response(const std::string & content) { {"content", content} }; } - -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = "permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} From 1cf769be673932e33791372ca1156503c51759b8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 16:04:36 +0100 Subject: [PATCH 11/19] remove server.hpp --- examples/server/server.cpp | 679 ++++++++++++++++++++++++++++++++++++- examples/server/server.hpp | 665 ------------------------------------ examples/server/utils.hpp | 26 -- 3 files changed, 675 insertions(+), 695 deletions(-) delete mode 100644 examples/server/server.hpp diff --git a/examples/server/server.cpp b/examples/server/server.cpp index c8cb48b15c6ba..44e6ead3ae897 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1,5 +1,4 @@ #include "utils.hpp" -#include "server.hpp" #include "arg.h" #include "common.h" @@ -33,9 +32,682 @@ using json = nlohmann::ordered_json; +enum stop_type { + STOP_TYPE_NONE, + STOP_TYPE_EOS, + STOP_TYPE_WORD, + STOP_TYPE_LIMIT, +}; + +// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 +enum slot_state { + SLOT_STATE_IDLE, + SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future + SLOT_STATE_PROCESSING_PROMPT, + SLOT_STATE_DONE_PROMPT, + SLOT_STATE_GENERATING, +}; + +enum server_state { + SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet + SERVER_STATE_READY, // Server is ready and model is loaded +}; + +enum server_task_type { + SERVER_TASK_TYPE_INFERENCE, + SERVER_TASK_TYPE_CANCEL, + SERVER_TASK_TYPE_NEXT_RESPONSE, + SERVER_TASK_TYPE_METRICS, + SERVER_TASK_TYPE_SLOT_SAVE, + SERVER_TASK_TYPE_SLOT_RESTORE, + SERVER_TASK_TYPE_SLOT_ERASE, + SERVER_TASK_TYPE_SET_LORA, +}; + +enum server_task_inf_type { 
+ SERVER_TASK_INF_TYPE_COMPLETION, + SERVER_TASK_INF_TYPE_EMBEDDING, + SERVER_TASK_INF_TYPE_RERANK, + SERVER_TASK_INF_TYPE_INFILL, +}; + +// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 +enum error_type { + ERROR_TYPE_INVALID_REQUEST, + ERROR_TYPE_AUTHENTICATION, + ERROR_TYPE_SERVER, + ERROR_TYPE_NOT_FOUND, + ERROR_TYPE_PERMISSION, + ERROR_TYPE_UNAVAILABLE, // custom error + ERROR_TYPE_NOT_SUPPORTED, // custom error +}; + +struct server_task { + int id = -1; // to be filled by server_queue + int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL + + llama_tokens prompt_tokens; + server_task_type type; + + // TODO @ngxson : we should get rid of json type here + json data; + + server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; + + // utility function + static std::unordered_set get_list_id(const std::vector & tasks) { + std::unordered_set ids(tasks.size()); + for (size_t i = 0; i < tasks.size(); i++) { + ids.insert(tasks[i].id); + } + return ids; + } +}; + +struct slot_params { + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half + int32_t n_predict = -1; // new tokens to predict + int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters + + int64_t t_max_prompt_ms = -1; // TODO: implement + int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit + + std::vector antiprompt; + bool timings_per_token = false; + + struct common_params_sampling sampling; + struct common_params_speculative speculative; + + // params only used in to_json() + int32_t n_ctx; + uint32_t seed_cur; + bool can_speculative; + + // OAI-compat fields + bool oaicompat = false; + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + json to_json() { + std::vector samplers; + samplers.reserve(sampling.samplers.size()); + for (const auto & sampler : sampling.samplers) { + samplers.emplace_back(common_sampler_type_to_str(sampler)); + } + + return json { + {"n_ctx", n_ctx}, + {"n_predict", n_predict}, // Server configured n_predict + {"temperature", sampling.temp}, + {"dynatemp_range", sampling.dynatemp_range}, + {"dynatemp_exponent", sampling.dynatemp_exponent}, + {"top_k", sampling.top_k}, + {"top_p", sampling.top_p}, + {"min_p", sampling.min_p}, + {"xtc_probability", sampling.xtc_probability}, + {"xtc_threshold", sampling.xtc_threshold}, + {"typical_p", sampling.typ_p}, + {"repeat_last_n", sampling.penalty_last_n}, + {"repeat_penalty", sampling.penalty_repeat}, + {"presence_penalty", sampling.penalty_present}, + {"frequency_penalty", sampling.penalty_freq}, + {"dry_multiplier", sampling.dry_multiplier}, + {"dry_base", sampling.dry_base}, + {"dry_allowed_length", sampling.dry_allowed_length}, + {"dry_penalty_last_n", sampling.dry_penalty_last_n}, + {"dry_sequence_breakers", sampling.dry_sequence_breakers}, + {"mirostat", sampling.mirostat}, + {"mirostat_tau", sampling.mirostat_tau}, + {"mirostat_eta", sampling.mirostat_eta}, + {"penalize_nl", sampling.penalize_nl}, + {"stop", antiprompt}, + {"max_tokens", n_predict}, // User configured n_predict + {"n_keep", n_keep}, + {"n_discard", n_discard}, + {"ignore_eos", sampling.ignore_eos}, + {"stream", stream}, + //{"logit_bias", sampling.logit_bias}, + {"n_probs", 
sampling.n_probs}, + {"min_keep", sampling.min_keep}, + {"grammar", sampling.grammar}, + {"samplers", samplers}, + {"speculative", can_speculative}, + {"speculative.n_max", speculative.n_max}, + {"speculative.n_min", speculative.n_min}, + {"speculative.p_min", speculative.p_min}, + {"timings_per_token", timings_per_token}, + }; + } +}; + +struct result_timings { + int32_t prompt_n = -1; + double prompt_ms; + double prompt_per_token_ms; + double prompt_per_second; + + int32_t predicted_n = -1; + double predicted_ms; + double predicted_per_token_ms; + double predicted_per_second; + + json to_json() { + return { + {"prompt_n", prompt_n}, + {"prompt_ms", prompt_ms}, + {"prompt_per_token_ms", prompt_per_token_ms}, + {"prompt_per_second", prompt_per_second}, + + {"predicted_n", predicted_n}, + {"predicted_ms", predicted_ms}, + {"predicted_per_token_ms", predicted_per_token_ms}, + {"predicted_per_second", predicted_per_second}, + }; + } +}; + +struct server_task_result { + int id = -1; + int id_slot = -1; + virtual bool is_error() { + // only used by server_task_result_error + return false; + } + virtual bool is_stop() { + // only used by server_task_result_cmpl_partial + return false; + } + virtual int get_index() { + return -1; + } + virtual json to_json() = 0; + virtual json to_json_oai_compat() { + // used by server_task_result_cmpl_final and server_task_result_cmpl_partial + return json(); + } + virtual ~server_task_result() = default; +}; + // using shared_ptr for polymorphism of server_task_result using task_result_ptr = std::unique_ptr; +inline std::string stop_type_to_str(stop_type type) { + switch (type) { + case STOP_TYPE_EOS: return "eos"; + case STOP_TYPE_WORD: return "word"; + case STOP_TYPE_LIMIT: return "limit"; + default: return "none"; + } +} + +struct completion_token_output { + llama_token tok; + std::string text_to_send; + struct token_prob { + llama_token tok; + float prob; + }; + std::vector probs; + + json to_json(const llama_context * ctx) const { + json probs_for_token = json::array(); + for (const auto & p : probs) { + const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json { + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + return probs_for_token; + } + + static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { + json out = json::array(); + for (const auto & prob : probs) { + const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json { + {"content", tok_str}, + {"probs", prob.to_json(ctx)}, + }); + } + return out; + } +}; + +struct server_task_result_cmpl_final : server_task_result { + int index = 0; + std::string content; + bool stream; + result_timings timings; + std::string prompt; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + int32_t n_tokens_cached; + int32_t has_new_line; + std::string stopping_word; + stop_type stop = STOP_TYPE_NONE; + + std::vector probs_output; + + slot_params generation_params; + + // OAI-compat fields + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + // non-OAI-compat JSON + return json { + {"index", index}, + {"content", content}, + {"id_slot", id_slot}, + {"stop", true}, + {"model", oaicompat_model}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + {"generation_settings", generation_params.to_json()}, + {"prompt", prompt}, + 
{"has_new_line", has_new_line}, + {"truncated", truncated}, + {"stop_type", stop_type_to_str(stop)}, + {"stopping_word", stopping_word}, + {"tokens_cached", n_tokens_cached}, + {"timings", timings.to_json()}, + }; + } + + virtual json to_json_oai_compat() override { + std::string finish_reason = "length"; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } + + json choices = json::array({json{ + {"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{ + {"content", content}, + {"role", "assistant"} + } + }}}); + + std::time_t t = std::time(0); + + json res = json { + {"choices", choices}, + {"created", t}, + {"model", oaicompat_model}, + {"object", "chat.completion"}, + {"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens} + }}, + {"id", oaicompat_cmpl_id} + }; + + // extra fields for debugging purposes + if (verbose) { + res["__verbose"] = to_json(); + } + + // TODO: fix this + // if (result.contains("completion_probabilities")) { + // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); + // } + + if (timings.prompt_n >= 0) { + res.push_back({"timings", timings.to_json()}); + } + + return res; + } +}; + +struct server_task_result_cmpl_partial : server_task_result { + int index = 0; + std::string content; + + bool truncated; + int32_t n_decoded; + int32_t n_prompt_tokens; + + stop_type stop = STOP_TYPE_NONE; + + std::vector probs_output; + result_timings timings; + + // OAI-compat fields + std::string oaicompat_model; + std::string oaicompat_cmpl_id; + bool verbose = false; + + virtual int get_index() override { + return index; + } + + virtual bool is_stop() override { + return stop != STOP_TYPE_NONE; + } + + virtual json to_json() override { + bool is_stop = stop != STOP_TYPE_NONE; + // non-OAI-compat JSON + json res = json { + {"index", index}, + {"content", content}, + {"stop_type", stop_type_to_str(stop)}, + {"stop", is_stop}, + {"id_slot", id_slot}, + {"tokens_predicted", n_decoded}, + {"tokens_evaluated", n_prompt_tokens}, + }; + // populate the timings object when needed (usually for the last response or with timings_per_token enabled) + if (timings.prompt_n > 0) { + res.push_back({"timings", timings.to_json()}); + } + if (is_stop) { + res.push_back({"truncated", truncated}); + } + return res; + } + + virtual json to_json_oai_compat() override { + bool first = n_decoded == 0; + + std::string finish_reason; + if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { + finish_reason = "stop"; + } else if (stop == STOP_TYPE_LIMIT) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = json{{"choices", json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"role", "assistant"} + }}}})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + json second_ret = json{ + {"choices", json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{ + {"content", 
content}}} + }})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. + if (content.empty()) { + return std::vector({json::object()}); + } + + choices = json::array({json{ + {"finish_reason", nullptr}, + {"index", 0}, + {"delta", + json{ + {"content", content}, + }}, + }}); + } + } + + json ret = json { + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"object", "chat.completion.chunk"} + }; + + if (timings.prompt_n >= 0) { + ret.push_back({"timings", timings.to_json()}); + } + + if (!finish_reason.empty()) { + ret.push_back({"usage", json { + {"completion_tokens", n_decoded}, + {"prompt_tokens", n_prompt_tokens}, + {"total_tokens", n_decoded + n_prompt_tokens}, + }}); + } + + return std::vector({ret}); + } +}; + +struct server_task_result_embd : server_task_result { + int index = 0; + std::vector embedding; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return json { + {"index", index}, + {"embedding", embedding}, + }; + } +}; + +struct server_task_result_rerank : server_task_result { + int index = 0; + float score = -1e6; + + virtual int get_index() override { + return index; + } + + virtual json to_json() override { + return json { + {"index", index}, + {"score", score}, + }; + } +}; + +// this function maybe used outside of server_task_result_error +static json format_error_response(const std::string & message, const enum error_type type) { + std::string type_str; + int code = 500; + switch (type) { + case ERROR_TYPE_INVALID_REQUEST: + type_str = "invalid_request_error"; + code = 400; + break; + case ERROR_TYPE_AUTHENTICATION: + type_str = "authentication_error"; + code = 401; + break; + case ERROR_TYPE_NOT_FOUND: + type_str = "not_found_error"; + code = 404; + break; + case ERROR_TYPE_SERVER: + type_str = "server_error"; + code = 500; + break; + case ERROR_TYPE_PERMISSION: + type_str = "permission_error"; + code = 403; + break; + case ERROR_TYPE_NOT_SUPPORTED: + type_str = "not_supported_error"; + code = 501; + break; + case ERROR_TYPE_UNAVAILABLE: + type_str = "unavailable_error"; + code = 503; + break; + } + return json { + {"code", code}, + {"message", message}, + {"type", type_str}, + }; +} + +struct server_task_result_error : server_task_result { + int index = 0; + error_type err_type = ERROR_TYPE_SERVER; + std::string err_msg; + + virtual bool is_error() override { + return true; + } + + virtual json to_json() override { + return format_error_response(err_msg, err_type); + } +}; + +struct server_task_result_metrics : server_task_result { + int n_idle_slots; + int n_processing_slots; + int n_tasks_deferred; + int64_t t_start; + + int32_t kv_cache_tokens_count; + int32_t kv_cache_used_cells; + + // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields + uint64_t n_prompt_tokens_processed_total = 0; + uint64_t t_prompt_processing_total = 0; + uint64_t n_tokens_predicted_total = 0; + uint64_t t_tokens_generation_total = 0; + + uint64_t n_prompt_tokens_processed = 0; + uint64_t t_prompt_processing = 0; + + uint64_t n_tokens_predicted = 0; + uint64_t t_tokens_generation = 0; + + uint64_t n_decode_total = 0; + uint64_t n_busy_slots_total = 0; + + // TODO: get rid of this json object and 
use to_json() instead + json slots_data = json::array(); + + virtual json to_json() override { + return json { + { "idle", n_idle_slots }, + { "processing", n_processing_slots }, + { "deferred", n_tasks_deferred }, + { "t_start", t_start }, + + { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, + { "t_tokens_generation_total", t_tokens_generation_total }, + { "n_tokens_predicted_total", n_tokens_predicted_total }, + { "t_prompt_processing_total", t_prompt_processing_total }, + + { "n_prompt_tokens_processed", n_prompt_tokens_processed }, + { "t_prompt_processing", t_prompt_processing }, + { "n_tokens_predicted", n_tokens_predicted }, + { "t_tokens_generation", t_tokens_generation }, + + { "n_decode_total", n_decode_total }, + { "n_busy_slots_total", n_busy_slots_total }, + + { "kv_cache_tokens_count", kv_cache_tokens_count }, + { "kv_cache_used_cells", kv_cache_used_cells }, + + { "slots", slots_data }, + }; + } +}; + +struct server_task_result_slot_save_load : server_task_result { + std::string filename; + bool is_save; // true = save, false = load + + size_t n_tokens; + size_t n_bytes; + double t_ms; + + virtual json to_json() override { + if (is_save) { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_saved", n_tokens }, + { "n_written", n_bytes }, + { "timings", { + { "save_ms", t_ms } + }}, + }; + } else { + return json { + { "id_slot", id_slot }, + { "filename", filename }, + { "n_restored", n_tokens }, + { "n_read", n_bytes }, + { "timings", { + { "restore_ms", t_ms } + }}, + }; + } + } +}; + +struct server_task_result_slot_erase : server_task_result { + size_t n_erased; + + virtual json to_json() override { + return json { + { "id_slot", id_slot }, + { "n_erased", n_erased }, + }; + } +}; + +struct server_task_result_apply_lora : server_task_result { + virtual json to_json() override { + return json {{ "success", true }}; + } +}; + struct server_slot { int id; int id_task = -1; @@ -786,8 +1458,9 @@ struct server_context { const auto & data = task.data; if (data.count("__oaicompat") != 0) { + std::string model_name = params_base.model_alias.empty() ? 
DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; slot.params.oaicompat = true; - slot.params.oaicompat_model = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + slot.params.oaicompat_model = json_value(data, "model", model_name); slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); } else { slot.params.oaicompat = false; @@ -2835,8 +3508,6 @@ int main(int argc, char ** argv) { json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - std::string model_name = json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); }; diff --git a/examples/server/server.hpp b/examples/server/server.hpp deleted file mode 100644 index 201f154560e53..0000000000000 --- a/examples/server/server.hpp +++ /dev/null @@ -1,665 +0,0 @@ -#pragma once - -#include "common.h" -#include "llama.h" -#include "sampling.h" -#include "speculative.h" - -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" - -#include -#include -#include - -using json = nlohmann::ordered_json; - -enum stop_type { - STOP_TYPE_NONE, - STOP_TYPE_EOS, - STOP_TYPE_WORD, - STOP_TYPE_LIMIT, -}; - -// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283 -enum slot_state { - SLOT_STATE_IDLE, - SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future - SLOT_STATE_PROCESSING_PROMPT, - SLOT_STATE_DONE_PROMPT, - SLOT_STATE_GENERATING, -}; - -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded -}; - -enum server_task_type { - SERVER_TASK_TYPE_INFERENCE, - SERVER_TASK_TYPE_CANCEL, - SERVER_TASK_TYPE_NEXT_RESPONSE, - SERVER_TASK_TYPE_METRICS, - SERVER_TASK_TYPE_SLOT_SAVE, - SERVER_TASK_TYPE_SLOT_RESTORE, - SERVER_TASK_TYPE_SLOT_ERASE, - SERVER_TASK_TYPE_SET_LORA, -}; - -enum server_task_inf_type { - SERVER_TASK_INF_TYPE_COMPLETION, - SERVER_TASK_INF_TYPE_EMBEDDING, - SERVER_TASK_INF_TYPE_RERANK, - SERVER_TASK_INF_TYPE_INFILL, -}; - -// https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 -enum error_type { - ERROR_TYPE_INVALID_REQUEST, - ERROR_TYPE_AUTHENTICATION, - ERROR_TYPE_SERVER, - ERROR_TYPE_NOT_FOUND, - ERROR_TYPE_PERMISSION, - ERROR_TYPE_UNAVAILABLE, // custom error - ERROR_TYPE_NOT_SUPPORTED, // custom error -}; - -struct server_task { - int id = -1; // to be filled by server_queue - int id_target = -1; // used by SERVER_TASK_TYPE_CANCEL - - llama_tokens prompt_tokens; - server_task_type type; - - // TODO @ngxson : we should get rid of json type here - json data; - - server_task_inf_type inf_type = SERVER_TASK_INF_TYPE_COMPLETION; - - // utility function - static std::unordered_set get_list_id(const std::vector & tasks) { - std::unordered_set ids(tasks.size()); - for (size_t i = 0; i < tasks.size(); i++) { - ids.insert(tasks[i].id); - } - return ids; - } -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt - - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half - int32_t n_predict = -1; // new tokens to predict - int32_t n_indent = 0; // mininum 
line indentation for the generated text in number of whitespace characters - - int64_t t_max_prompt_ms = -1; // TODO: implement - int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit - - std::vector antiprompt; - bool timings_per_token = false; - - struct common_params_sampling sampling; - struct common_params_speculative speculative; - - // params only used in to_json() - int32_t n_ctx; - uint32_t seed_cur; - bool can_speculative; - - // OAI-compat fields - bool oaicompat = false; - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - bool verbose = false; - - json to_json() { - std::vector samplers; - samplers.reserve(sampling.samplers.size()); - for (const auto & sampler : sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } - - return json { - {"n_ctx", n_ctx}, - {"n_predict", n_predict}, // Server configured n_predict - {"temperature", sampling.temp}, - {"dynatemp_range", sampling.dynatemp_range}, - {"dynatemp_exponent", sampling.dynatemp_exponent}, - {"top_k", sampling.top_k}, - {"top_p", sampling.top_p}, - {"min_p", sampling.min_p}, - {"xtc_probability", sampling.xtc_probability}, - {"xtc_threshold", sampling.xtc_threshold}, - {"typical_p", sampling.typ_p}, - {"repeat_last_n", sampling.penalty_last_n}, - {"repeat_penalty", sampling.penalty_repeat}, - {"presence_penalty", sampling.penalty_present}, - {"frequency_penalty", sampling.penalty_freq}, - {"dry_multiplier", sampling.dry_multiplier}, - {"dry_base", sampling.dry_base}, - {"dry_allowed_length", sampling.dry_allowed_length}, - {"dry_penalty_last_n", sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", sampling.dry_sequence_breakers}, - {"mirostat", sampling.mirostat}, - {"mirostat_tau", sampling.mirostat_tau}, - {"mirostat_eta", sampling.mirostat_eta}, - {"penalize_nl", sampling.penalize_nl}, - {"stop", antiprompt}, - {"max_tokens", n_predict}, // User configured n_predict - {"n_keep", n_keep}, - {"n_discard", n_discard}, - {"ignore_eos", sampling.ignore_eos}, - {"stream", stream}, - //{"logit_bias", sampling.logit_bias}, - {"n_probs", sampling.n_probs}, - {"min_keep", sampling.min_keep}, - {"grammar", sampling.grammar}, - {"samplers", samplers}, - {"speculative", can_speculative}, - {"speculative.n_max", speculative.n_max}, - {"speculative.n_min", speculative.n_min}, - {"speculative.p_min", speculative.p_min}, - {"timings_per_token", timings_per_token}, - }; - } -}; - -struct result_timings { - int32_t prompt_n = -1; - double prompt_ms; - double prompt_per_token_ms; - double prompt_per_second; - - int32_t predicted_n = -1; - double predicted_ms; - double predicted_per_token_ms; - double predicted_per_second; - - json to_json() { - return { - {"prompt_n", prompt_n}, - {"prompt_ms", prompt_ms}, - {"prompt_per_token_ms", prompt_per_token_ms}, - {"prompt_per_second", prompt_per_second}, - - {"predicted_n", predicted_n}, - {"predicted_ms", predicted_ms}, - {"predicted_per_token_ms", predicted_per_token_ms}, - {"predicted_per_second", predicted_per_second}, - }; - } -}; - -struct server_task_result { - int id = -1; - int id_slot = -1; - virtual bool is_error() { - // only used by server_task_result_error - return false; - } - virtual bool is_stop() { - // only used by server_task_result_cmpl_partial - return false; - } - virtual int get_index() { - return -1; - } - virtual json to_json() = 0; - virtual json to_json_oai_compat() { - // used by server_task_result_cmpl_final and server_task_result_cmpl_partial - return json(); - } - virtual 
~server_task_result() = default; -}; - -inline std::string stop_type_to_str(stop_type type) { - switch (type) { - case STOP_TYPE_EOS: return "eos"; - case STOP_TYPE_WORD: return "word"; - case STOP_TYPE_LIMIT: return "limit"; - default: return "none"; - } -} - -struct completion_token_output { - llama_token tok; - std::string text_to_send; - struct token_prob { - llama_token tok; - float prob; - }; - std::vector probs; -}; - -struct server_task_result_cmpl_final : server_task_result { - int index = 0; - std::string content; - bool stream; - result_timings timings; - std::string prompt; - - bool truncated; - int32_t n_decoded; - int32_t n_prompt_tokens; - int32_t n_tokens_cached; - int32_t has_new_line; - std::string stopping_word; - stop_type stop = STOP_TYPE_NONE; - - std::vector probs_output; - - slot_params generation_params; - - // OAI-compat fields - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - bool verbose = false; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - // non-OAI-compat JSON - return json { - {"index", index}, - {"content", content}, - {"id_slot", id_slot}, - {"stop", true}, - {"model", oaicompat_model}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - {"generation_settings", generation_params.to_json()}, - {"prompt", prompt}, - {"has_new_line", has_new_line}, - {"truncated", truncated}, - {"stop_type", stop_type_to_str(stop)}, - {"stopping_word", stopping_word}, - {"tokens_cached", n_tokens_cached}, - {"timings", timings.to_json()}, - }; - } - - virtual json to_json_oai_compat() override { - std::string finish_reason = "length"; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } - - json choices = json::array({json{ - {"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{ - {"content", content}, - {"role", "assistant"} - } - }}}); - - std::time_t t = std::time(0); - - json res = json { - {"choices", choices}, - {"created", t}, - {"model", oaicompat_model}, - {"object", "chat.completion"}, - {"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens} - }}, - {"id", oaicompat_cmpl_id} - }; - - // extra fields for debugging purposes - if (verbose) { - res["__verbose"] = to_json(); - } - - // TODO: fix this - // if (result.contains("completion_probabilities")) { - // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - // } - - if (timings.prompt_n >= 0) { - res.push_back({"timings", timings.to_json()}); - } - - return res; - } -}; - -struct server_task_result_cmpl_partial : server_task_result { - int index = 0; - std::string content; - - bool truncated; - int32_t n_decoded; - int32_t n_prompt_tokens; - - stop_type stop = STOP_TYPE_NONE; - - std::vector probs_output; - result_timings timings; - - // OAI-compat fields - std::string oaicompat_model; - std::string oaicompat_cmpl_id; - bool verbose = false; - - virtual int get_index() override { - return index; - } - - virtual bool is_stop() override { - return stop != STOP_TYPE_NONE; - } - - virtual json to_json() override { - bool is_stop = stop != STOP_TYPE_NONE; - // non-OAI-compat JSON - json res = json { - {"index", index}, - {"content", content}, - {"stop_type", stop_type_to_str(stop)}, - {"stop", is_stop}, - {"id_slot", id_slot}, - {"tokens_predicted", n_decoded}, - {"tokens_evaluated", n_prompt_tokens}, - }; - // populate the timings object when needed 
(usually for the last response or with timings_per_token enabled) - if (timings.prompt_n > 0) { - res.push_back({"timings", timings.to_json()}); - } - if (is_stop) { - res.push_back({"truncated", truncated}); - } - return res; - } - - virtual json to_json_oai_compat() override { - bool first = n_decoded == 0; - - std::string finish_reason; - if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { - finish_reason = "stop"; - } else if (stop == STOP_TYPE_LIMIT) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = json{{"choices", json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"role", "assistant"} - }}}})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - json second_ret = json{ - {"choices", json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{ - {"content", content}}} - }})}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. - if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"} - }; - - if (timings.prompt_n >= 0) { - ret.push_back({"timings", timings.to_json()}); - } - - if (!finish_reason.empty()) { - ret.push_back({"usage", json { - {"completion_tokens", n_decoded}, - {"prompt_tokens", n_prompt_tokens}, - {"total_tokens", n_decoded + n_prompt_tokens}, - }}); - } - - return std::vector({ret}); - } -}; - -struct server_task_result_embd : server_task_result { - int index = 0; - std::vector embedding; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return json { - {"index", index}, - {"embedding", embedding}, - }; - } -}; - -struct server_task_result_rerank : server_task_result { - int index = 0; - float score = -1e6; - - virtual int get_index() override { - return index; - } - - virtual json to_json() override { - return json { - {"index", index}, - {"score", score}, - }; - } -}; - -// this function maybe used outside of server_task_result_error -static json format_error_response(const std::string & message, const enum error_type type) { - std::string type_str; - int code = 500; - switch (type) { - case ERROR_TYPE_INVALID_REQUEST: - type_str = "invalid_request_error"; - code = 400; - break; - case ERROR_TYPE_AUTHENTICATION: - type_str = "authentication_error"; - code = 401; - break; - case ERROR_TYPE_NOT_FOUND: - type_str = "not_found_error"; - code = 404; - break; - case ERROR_TYPE_SERVER: - type_str = "server_error"; - code = 500; - break; - case ERROR_TYPE_PERMISSION: - type_str = 
"permission_error"; - code = 403; - break; - case ERROR_TYPE_NOT_SUPPORTED: - type_str = "not_supported_error"; - code = 501; - break; - case ERROR_TYPE_UNAVAILABLE: - type_str = "unavailable_error"; - code = 503; - break; - } - return json { - {"code", code}, - {"message", message}, - {"type", type_str}, - }; -} - -struct server_task_result_error : server_task_result { - int index = 0; - error_type err_type = ERROR_TYPE_SERVER; - std::string err_msg; - - virtual bool is_error() override { - return true; - } - - virtual json to_json() override { - return format_error_response(err_msg, err_type); - } -}; - -struct server_task_result_metrics : server_task_result { - int n_idle_slots; - int n_processing_slots; - int n_tasks_deferred; - int64_t t_start; - - int32_t kv_cache_tokens_count; - int32_t kv_cache_used_cells; - - // TODO: somehow reuse server_metrics in the future, instead of duplicating the fields - uint64_t n_prompt_tokens_processed_total = 0; - uint64_t t_prompt_processing_total = 0; - uint64_t n_tokens_predicted_total = 0; - uint64_t t_tokens_generation_total = 0; - - uint64_t n_prompt_tokens_processed = 0; - uint64_t t_prompt_processing = 0; - - uint64_t n_tokens_predicted = 0; - uint64_t t_tokens_generation = 0; - - uint64_t n_decode_total = 0; - uint64_t n_busy_slots_total = 0; - - // TODO: get rid of this json object and use to_json() instead - json slots_data = json::array(); - - virtual json to_json() override { - return json { - { "idle", n_idle_slots }, - { "processing", n_processing_slots }, - { "deferred", n_tasks_deferred }, - { "t_start", t_start }, - - { "n_prompt_tokens_processed_total", n_prompt_tokens_processed_total }, - { "t_tokens_generation_total", t_tokens_generation_total }, - { "n_tokens_predicted_total", n_tokens_predicted_total }, - { "t_prompt_processing_total", t_prompt_processing_total }, - - { "n_prompt_tokens_processed", n_prompt_tokens_processed }, - { "t_prompt_processing", t_prompt_processing }, - { "n_tokens_predicted", n_tokens_predicted }, - { "t_tokens_generation", t_tokens_generation }, - - { "n_decode_total", n_decode_total }, - { "n_busy_slots_total", n_busy_slots_total }, - - { "kv_cache_tokens_count", kv_cache_tokens_count }, - { "kv_cache_used_cells", kv_cache_used_cells }, - - { "slots", slots_data }, - }; - } -}; - -struct server_task_result_slot_save_load : server_task_result { - std::string filename; - bool is_save; // true = save, false = load - - size_t n_tokens; - size_t n_bytes; - double t_ms; - - virtual json to_json() override { - if (is_save) { - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_saved", n_tokens }, - { "n_written", n_bytes }, - { "timings", { - { "save_ms", t_ms } - }}, - }; - } else { - return json { - { "id_slot", id_slot }, - { "filename", filename }, - { "n_restored", n_tokens }, - { "n_read", n_bytes }, - { "timings", { - { "restore_ms", t_ms } - }}, - }; - } - } -}; - -struct server_task_result_slot_erase : server_task_result { - size_t n_erased; - - virtual json to_json() override { - return json { - { "id_slot", id_slot }, - { "n_erased", n_erased }, - }; - } -}; - -struct server_task_result_apply_lora : server_task_result { - virtual json to_json() override { - return json {{ "success", true }}; - } -}; diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 8a8d9f8f7e894..70bcaf17c272c 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -3,7 +3,6 @@ #include "common.h" #include "log.h" #include "llama.h" -#include "server.hpp" #ifndef 
NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -476,31 +475,6 @@ static std::string tokens_to_output_formatted_string(const llama_context * ctx, return out; } -// convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { - json out = json::array(); - - for (const auto & prob : probs) { - json probs_for_token = json::array(); - - for (const auto & p : prob.probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json { - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - - const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json { - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - - return out; -} - static bool server_sent_event(httplib::DataSink & sink, const char * event, const json & data) { const std::string str = std::string(event) + ": " + From 2e560f90ff06e9a9b5d12b9ddf3498ab9e1e9b44 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 16:13:52 +0100 Subject: [PATCH 12/19] clarify server_sent_event RFC specs --- examples/server/utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 70bcaf17c272c..a96116ac36caa 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -479,7 +479,7 @@ static bool server_sent_event(httplib::DataSink & sink, const char * event, cons const std::string str = std::string(event) + ": " + data.dump(-1, ' ', false, json::error_handler_t::replace) + - "\n\n"; // note: these newlines are important (not sure why though, if you know, add a comment to explain) + "\n\n"; // required by RFC 8895 - A message is terminated by a blank line (two line terminators in a row). 
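    // Illustrative sketch only, not part of the patch: the same framing as `str` above,
    // expressed as a local lambda so the wire format is explicit. It assumes this file's
    // `json` alias; the name `sse_frame` is hypothetical and exists only for this example.
    auto sse_frame = [](const std::string & ev, const json & d) {
        // dump(-1, ...) keeps the payload on a single line, so the trailing "\n\n" is the
        // only blank line the client sees, i.e. the SSE message terminator noted above
        return ev + ": " + d.dump(-1, ' ', false, json::error_handler_t::replace) + "\n\n";
    };
    // e.g. sse_frame("data", json{{"content", "Hi"}}) yields:
    //   data: {"content":"Hi"}
    //   <blank line>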
LOG_DBG("data stream, to_send: %s", str.c_str()); From a43e1dc66c911804483dfb67b675ff99034229d8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 22:35:07 +0100 Subject: [PATCH 13/19] apply review comments --- examples/server/server.cpp | 46 +++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 44e6ead3ae897..b58f1018628b2 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -234,7 +234,7 @@ struct server_task_result { }; // using shared_ptr for polymorphism of server_task_result -using task_result_ptr = std::unique_ptr; +using server_task_result_ptr = std::unique_ptr; inline std::string stop_type_to_str(stop_type type) { switch (type) { @@ -1097,7 +1097,7 @@ struct server_response { std::unordered_set waiting_task_ids; // the main result queue (using ptr for polymorphism) - std::vector queue_results; + std::vector queue_results; std::mutex mutex_results; std::condition_variable condition_results; @@ -1137,7 +1137,7 @@ struct server_response { } // This function blocks the thread until there is a response for one of the id_tasks - task_result_ptr recv(const std::unordered_set & id_tasks) { + server_task_result_ptr recv(const std::unordered_set & id_tasks) { while (true) { std::unique_lock lock(mutex_results); condition_results.wait(lock, [&]{ @@ -1146,7 +1146,7 @@ struct server_response { for (int i = 0; i < (int) queue_results.size(); i++) { if (id_tasks.find(queue_results[i]->id) != id_tasks.end()) { - task_result_ptr res = std::move(queue_results[i]); + server_task_result_ptr res = std::move(queue_results[i]); queue_results.erase(queue_results.begin() + i); return res; } @@ -1157,13 +1157,13 @@ struct server_response { } // single-task version of recv() - task_result_ptr recv(int id_task) { + server_task_result_ptr recv(int id_task) { std::unordered_set id_tasks = {id_task}; return recv(id_tasks); } // Send a new result to a waiting id_task - void send(task_result_ptr && result) { + void send(server_task_result_ptr && result) { SRV_DBG("sending result for task id = %d\n", result->id); std::unique_lock lock(mutex_results); @@ -2078,11 +2078,11 @@ struct server_context { // receive the results from task(s) created by create_tasks_inference void receive_multi_results( const std::unordered_set & id_tasks, - const std::function&)> & result_handler, + const std::function&)> & result_handler, const std::function & error_handler) { - std::vector results(id_tasks.size()); + std::vector results(id_tasks.size()); for (size_t i = 0; i < id_tasks.size(); i++) { - task_result_ptr result = queue_results.recv(id_tasks); + server_task_result_ptr result = queue_results.recv(id_tasks); if (result->is_error()) { error_handler(result->to_json()); @@ -2104,12 +2104,12 @@ struct server_context { // receive the results from task(s) created by create_tasks_inference, in stream mode void receive_cmpl_results_stream( - const std::unordered_set & id_tasks, const - std::function & result_handler, const - std::function & error_handler) { + const std::unordered_set & id_tasks, + const std::function & result_handler, + const std::function & error_handler) { size_t n_finished = 0; while (true) { - task_result_ptr result = queue_results.recv(id_tasks); + server_task_result_ptr result = queue_results.recv(id_tasks); if (result->is_error()) { error_handler(result->to_json()); @@ -3108,7 +3108,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); 
// high-priority task // get the result - task_result_ptr result = ctx_server.queue_results.recv(task.id); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { @@ -3148,7 +3148,7 @@ int main(int argc, char ** argv) { ctx_server.queue_tasks.post(task, true); // high-priority task // get the result - task_result_ptr result = ctx_server.queue_results.recv(task.id); + server_task_result_ptr result = ctx_server.queue_results.recv(task.id); ctx_server.queue_results.remove_waiting_task_id(task.id); if (result->is_error()) { @@ -3257,7 +3257,7 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (result->is_error()) { @@ -3288,7 +3288,7 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (result->is_error()) { @@ -3310,7 +3310,7 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (result->is_error()) { @@ -3395,7 +3395,7 @@ int main(int argc, char ** argv) { const auto task_ids = server_task::get_list_id(tasks); if (!stream) { - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { if (results.size() == 1) { // single result res_ok(res, oai_compat ? results[0]->to_json_oai_compat() : results[0]->to_json()); @@ -3414,7 +3414,7 @@ int main(int argc, char ** argv) { ctx_server.queue_results.remove_waiting_task_ids(task_ids); } else { const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { - ctx_server.receive_cmpl_results_stream(task_ids, [&](task_result_ptr & result) -> bool { + ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { json res_json = oai_compat ? 
result->to_json_oai_compat() : result->to_json(); if (res_json.is_array()) { for (const auto & res : res_json) { @@ -3609,7 +3609,7 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { for (auto & res : results) { GGML_ASSERT(dynamic_cast(res.get()) != nullptr); responses.push_back(res->to_json()); @@ -3688,7 +3688,7 @@ int main(int argc, char ** argv) { // get the result std::unordered_set task_ids = server_task::get_list_id(tasks); - ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { + ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { for (auto & res : results) { GGML_ASSERT(dynamic_cast(res.get()) != nullptr); responses.push_back(res->to_json()); @@ -3747,7 +3747,7 @@ int main(int argc, char ** argv) { const int id_task = ctx_server.queue_tasks.post(task); ctx_server.queue_results.add_waiting_task_id(id_task); - task_result_ptr result = ctx_server.queue_results.recv(id_task); + server_task_result_ptr result = ctx_server.queue_results.recv(id_task); ctx_server.queue_results.remove_waiting_task_id(id_task); if (result->is_error()) { From fb4b9be6025a4c934d19f47307adc919fbceee4e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:13:06 +0100 Subject: [PATCH 14/19] fix model_alias and completion_probabilities --- common/common.h | 2 +- examples/server/server.cpp | 37 +++++++++--------- examples/server/tests/README.md | 6 +++ .../server/tests/unit/test_chat_completion.py | 20 +++++----- examples/server/tests/unit/test_completion.py | 39 +++++++++++++++++++ 5 files changed, 73 insertions(+), 31 deletions(-) diff --git a/common/common.h b/common/common.h index 0373fd3ead49e..95d20401d2a9a 100644 --- a/common/common.h +++ b/common/common.h @@ -215,7 +215,7 @@ struct common_params { struct common_params_speculative speculative; std::string model = ""; // model path // NOLINT - std::string model_alias = "unknown"; // model alias // NOLINT + std::string model_alias = ""; // model alias // NOLINT std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT std::string hf_repo = ""; // HF repo // NOLINT diff --git a/examples/server/server.cpp b/examples/server/server.cpp index b58f1018628b2..95d4bfd37f5c0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -250,29 +250,29 @@ struct completion_token_output { std::string text_to_send; struct token_prob { llama_token tok; + std::string tok_str; float prob; }; std::vector probs; - json to_json(const llama_context * ctx) const { + json to_json() const { json probs_for_token = json::array(); for (const auto & p : probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); probs_for_token.push_back(json { - {"tok_str", tok_str}, + {"tok_str", p.tok_str}, {"prob", p.prob}, }); } return probs_for_token; } - static json probs_vector_to_json(const llama_context * ctx, const std::vector & probs) { + static json probs_vector_to_json(const std::vector & probs) { json out = json::array(); for (const auto & prob : probs) { - const std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + const std::string tok_str = prob.text_to_send; out.push_back(json { {"content", tok_str}, - {"probs", prob.to_json(ctx)}, + {"probs", prob.to_json()}, }); } return out; @@ -309,7 +309,7 @@ 
struct server_task_result_cmpl_final : server_task_result { virtual json to_json() override { // non-OAI-compat JSON - return json { + json res = json { {"index", index}, {"content", content}, {"id_slot", id_slot}, @@ -326,6 +326,10 @@ struct server_task_result_cmpl_final : server_task_result { {"tokens_cached", n_tokens_cached}, {"timings", timings.to_json()}, }; + if (!probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + } + return res; } virtual json to_json_oai_compat() override { @@ -362,12 +366,6 @@ struct server_task_result_cmpl_final : server_task_result { if (verbose) { res["__verbose"] = to_json(); } - - // TODO: fix this - // if (result.contains("completion_probabilities")) { - // res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array()); - // } - if (timings.prompt_n >= 0) { res.push_back({"timings", timings.to_json()}); } @@ -418,6 +416,9 @@ struct server_task_result_cmpl_partial : server_task_result { if (timings.prompt_n > 0) { res.push_back({"timings", timings.to_json()}); } + if (!probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + } if (is_stop) { res.push_back({"truncated", truncated}); } @@ -2786,9 +2787,11 @@ struct server_context { const auto * cur_p = common_sampler_get_candidates(slot.smpl); for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { + auto tok_id = cur_p->data[i].id; result.probs.push_back({ - cur_p->data[i].id, - i >= cur_p->size ? 0.0f : cur_p->data[i].p, + tok_id, + tokens_to_output_formatted_string(ctx, tok_id), + i >= cur_p->size ? 0.0f : cur_p->data[i].p, }); } @@ -2920,10 +2923,6 @@ int main(int argc, char ** argv) { // struct that contains llama context and inference server_context ctx_server; - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - llama_backend_init(); llama_numa_init(params.numa); diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 2930a2e0dea0f..fa3d0a2f5ff66 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -44,4 +44,10 @@ To run with stdout/stderr display in real time (verbose output, but useful for d DEBUG=1 ./tests.sh -s -v -x ``` +Hint: You can compile and run test in single command, useful for local developement: + +```shell +cmake --build build -j --target llama-server && ./examples/server/tests/tests.sh +``` + To see all available arguments, please refer to [pytest documentation](https://docs.pytest.org/en/stable/how-to/usage.html) diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 486c1f87a0856..11bf712b6fe72 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -14,7 +14,7 @@ def create_server(): @pytest.mark.parametrize( "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + (None, "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) @@ -30,6 +30,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte ], }) assert res.status_code == 200 + assert res.body["model"] == model if 
model is not None else server.model_alias assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["completion_tokens"] == n_predicted choice = res.body["choices"][0] @@ -39,17 +40,17 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,truncated", + "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, False), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, False), + ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, truncated): +def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server + server.model_alias = None server.start() res = server.make_stream_request("POST", "/chat/completions", data={ - "model": model, "max_tokens": max_tokens, "messages": [ {"role": "system", "content": system_prompt}, @@ -60,16 +61,13 @@ def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, r content = "" for data in res: choice = data["choices"][0] + assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future if choice["finish_reason"] in ["stop", "length"]: assert data["usage"]["prompt_tokens"] == n_prompt assert data["usage"]["completion_tokens"] == n_predicted assert "content" not in choice["delta"] assert match_regex(re_content, content) - # FIXME: not sure why this is incorrect in stream mode - # if truncated: - # assert choice["finish_reason"] == "length" - # else: - # assert choice["finish_reason"] == "stop" + assert choice["finish_reason"] == finish_reason else: assert choice["finish_reason"] is None content += choice["delta"]["content"] diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 2fa30dd033431..1c3aa77de5bba 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -51,6 +51,24 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp content += data["content"] +def test_completion_stream_vs_non_stream(): + global server + server.start() + res_stream = server.make_stream_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "I believe the meaning of life is", + "stream": True, + }) + res_non_stream = server.make_request("POST", "/completion", data={ + "n_predict": 8, + "prompt": "I believe the meaning of life is", + }) + content_stream = "" + for data in res_stream: + content_stream += data["content"] + assert content_stream == res_non_stream.body["content"] + + @pytest.mark.parametrize("n_slots", [1, 2]) def test_consistent_result_same_seed(n_slots: int): global server @@ -221,3 +239,24 @@ def check_slots_status(): assert len(res.body["content"]) > 10 # FIXME: the result is not deterministic when using other slot than slot 0 # assert match_regex(re_content, res.body["content"]) + + +def test_n_probs(): + global server + server.start() + res = 
server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "n_probs": 10, + "temperature": 0.0, + "n_predict": 5, + }) + assert res.status_code == 200 + assert "completion_probabilities" in res.body + assert len(res.body["completion_probabilities"]) == 5 + for tok in res.body["completion_probabilities"]: + assert "probs" in tok + assert len(tok["probs"]) == 10 + for prob in tok["probs"]: + assert "prob" in prob + assert "tok_str" in prob + assert 0.0 <= prob["prob"] <= 1.0 From 4c3d2580b28e566affe917740462a242b2b283c8 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:16:27 +0100 Subject: [PATCH 15/19] small clean up --- examples/server/tests/unit/test_chat_completion.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 11bf712b6fe72..f13c6c4ca4bd3 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -40,15 +40,15 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte @pytest.mark.parametrize( - "model,system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", + "system_prompt,user_prompt,max_tokens,re_content,n_prompt,n_predicted,finish_reason", [ - ("llama-2", "Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), - ("codellama70b", "You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), + ("Book", "What is the best book", 8, "(Suddenly)+", 77, 8, "length"), + ("You are a coding assistant.", "Write the fibonacci function in c++.", 128, "(Aside|she|felter|alonger)+", 104, 64, "length"), ] ) -def test_chat_completion_stream(model, system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): +def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_content, n_prompt, n_predicted, finish_reason): global server - server.model_alias = None + server.model_alias = None # try using DEFAULT_OAICOMPAT_MODEL server.start() res = server.make_stream_request("POST", "/chat/completions", data={ "max_tokens": max_tokens, From ffc4441b1d9c03a8c5b65ee53bdc961d4dfe0de0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:29:27 +0100 Subject: [PATCH 16/19] remove virtual for to_json_oai_compat() --- examples/server/server.cpp | 46 ++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 95d4bfd37f5c0..3685df0d99767 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -128,10 +128,11 @@ struct slot_params { bool can_speculative; // OAI-compat fields - bool oaicompat = false; + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; std::string oaicompat_model; std::string oaicompat_cmpl_id; - bool verbose = false; json to_json() { std::vector samplers; @@ -226,10 +227,6 @@ struct server_task_result { return -1; } virtual json to_json() = 0; - virtual json to_json_oai_compat() { - // used by server_task_result_cmpl_final and server_task_result_cmpl_partial - return json(); - } virtual ~server_task_result() = default; }; @@ -299,16 +296,21 @@ struct server_task_result_cmpl_final : server_task_result { slot_params generation_params; // OAI-compat fields + bool verbose = false; + bool oaicompat = 
false; + bool oaicompat_chat = true; // TODO: support oaicompat for non-chat std::string oaicompat_model; std::string oaicompat_cmpl_id; - bool verbose = false; virtual int get_index() override { return index; } virtual json to_json() override { - // non-OAI-compat JSON + if (oaicompat) { + return to_json_oai_compat(); + } + // otherwise, non-OAI-compat JSON json res = json { {"index", index}, {"content", content}, @@ -332,7 +334,7 @@ struct server_task_result_cmpl_final : server_task_result { return res; } - virtual json to_json_oai_compat() override { + json to_json_oai_compat() { std::string finish_reason = "length"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { finish_reason = "stop"; @@ -388,9 +390,11 @@ struct server_task_result_cmpl_partial : server_task_result { result_timings timings; // OAI-compat fields + bool verbose = false; + bool oaicompat = false; + bool oaicompat_chat = true; // TODO: support oaicompat for non-chat std::string oaicompat_model; std::string oaicompat_cmpl_id; - bool verbose = false; virtual int get_index() override { return index; @@ -401,6 +405,9 @@ struct server_task_result_cmpl_partial : server_task_result { } virtual json to_json() override { + if (oaicompat) { + return to_json_oai_compat(); + } bool is_stop = stop != STOP_TYPE_NONE; // non-OAI-compat JSON json res = json { @@ -425,7 +432,7 @@ struct server_task_result_cmpl_partial : server_task_result { return res; } - virtual json to_json_oai_compat() override { + json to_json_oai_compat() { bool first = n_decoded == 0; std::string finish_reason; @@ -1461,6 +1468,7 @@ struct server_context { if (data.count("__oaicompat") != 0) { std::string model_name = params_base.model_alias.empty() ? DEFAULT_OAICOMPAT_MODEL : params_base.model_alias; slot.params.oaicompat = true; + slot.params.oaicompat_chat = json_value(data, "__oaicompat_chat", false); slot.params.oaicompat_model = json_value(data, "model", model_name); slot.params.oaicompat_cmpl_id = json_value(data, "completion_id", std::string()); } else { @@ -1850,9 +1858,11 @@ struct server_context { res->stop = slot.stop; + res->verbose = slot.params.verbose; + res->oaicompat = slot.params.oaicompat; + res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; - res->verbose = slot.params.verbose; // populate res.probs_output if (slot.params.sampling.n_probs > 0) { @@ -1899,9 +1909,11 @@ struct server_context { res->stopping_word = slot.stopping_word; res->stop = slot.stop; + res->verbose = slot.params.verbose; + res->oaicompat = slot.params.oaicompat; + res->oaicompat_chat = slot.params.oaicompat_chat; res->oaicompat_model = slot.params.oaicompat_model; res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id; - res->verbose = slot.params.verbose; // populate res.probs_output if (slot.params.sampling.n_probs > 0) { @@ -3397,12 +3409,12 @@ int main(int argc, char ** argv) { ctx_server.receive_multi_results(task_ids, [&](std::vector & results) { if (results.size() == 1) { // single result - res_ok(res, oai_compat ? results[0]->to_json_oai_compat() : results[0]->to_json()); + res_ok(res, results[0]->to_json()); } else { // multiple results (multitask) json arr = json::array(); for (auto & res : results) { - arr.push_back(oai_compat ? 
res->to_json_oai_compat() : res->to_json()); + arr.push_back(res->to_json()); } res_ok(res, arr); } @@ -3414,7 +3426,7 @@ int main(int argc, char ** argv) { } else { const auto chunked_content_provider = [task_ids, &ctx_server, oai_compat](size_t, httplib::DataSink & sink) { ctx_server.receive_cmpl_results_stream(task_ids, [&](server_task_result_ptr & result) -> bool { - json res_json = oai_compat ? result->to_json_oai_compat() : result->to_json(); + json res_json = result->to_json(); if (res_json.is_array()) { for (const auto & res : res_json) { if (!server_sent_event(sink, "data", res)) { @@ -3506,7 +3518,7 @@ int main(int argc, char ** argv) { } json data = oaicompat_completion_params_parse(ctx_server.model, json::parse(req.body), params.chat_template); - + data["__oaicompat_chat"] = true; return handle_completions_generic(SERVER_TASK_INF_TYPE_COMPLETION, data, res, true); }; From db66153d921c8bf59227d0e2efa7f010bf65ec2e Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:34:24 +0100 Subject: [PATCH 17/19] naming oai_compat --> oaicompat --- examples/server/server.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3685df0d99767..50415c6a0ce7a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -308,7 +308,7 @@ struct server_task_result_cmpl_final : server_task_result { virtual json to_json() override { if (oaicompat) { - return to_json_oai_compat(); + return to_json_oaicompat(); } // otherwise, non-OAI-compat JSON json res = json { @@ -334,7 +334,7 @@ struct server_task_result_cmpl_final : server_task_result { return res; } - json to_json_oai_compat() { + json to_json_oaicompat() { std::string finish_reason = "length"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { finish_reason = "stop"; @@ -406,7 +406,7 @@ struct server_task_result_cmpl_partial : server_task_result { virtual json to_json() override { if (oaicompat) { - return to_json_oai_compat(); + return to_json_oaicompat(); } bool is_stop = stop != STOP_TYPE_NONE; // non-OAI-compat JSON @@ -432,7 +432,7 @@ struct server_task_result_cmpl_partial : server_task_result { return res; } - json to_json_oai_compat() { + json to_json_oaicompat() { bool first = n_decoded == 0; std::string finish_reason; From dfa59b908f1c1f47fd79679af5a414f13beed1c0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 5 Dec 2024 23:43:48 +0100 Subject: [PATCH 18/19] fix unwanted recursive call --- examples/server/server.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 50415c6a0ce7a..881a7b902ba91 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -307,10 +307,10 @@ struct server_task_result_cmpl_final : server_task_result { } virtual json to_json() override { - if (oaicompat) { - return to_json_oaicompat(); - } - // otherwise, non-OAI-compat JSON + return oaicompat ? 
to_json_oaicompat_chat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { json res = json { {"index", index}, {"content", content}, @@ -334,7 +334,7 @@ struct server_task_result_cmpl_final : server_task_result { return res; } - json to_json_oaicompat() { + json to_json_oaicompat_chat() { std::string finish_reason = "length"; if (stop == STOP_TYPE_WORD || stop == STOP_TYPE_EOS) { finish_reason = "stop"; @@ -366,7 +366,7 @@ struct server_task_result_cmpl_final : server_task_result { // extra fields for debugging purposes if (verbose) { - res["__verbose"] = to_json(); + res["__verbose"] = to_json_non_oaicompat(); } if (timings.prompt_n >= 0) { res.push_back({"timings", timings.to_json()}); @@ -3594,12 +3594,12 @@ int main(int argc, char ** argv) { const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { const json body = json::parse(req.body); - bool is_openai = false; + bool oaicompat = false; // an input prompt can be a string or a list of tokens (integer) json prompt; if (body.count("input") != 0) { - is_openai = true; + oaicompat = true; prompt = body.at("input"); } else if (body.count("content") != 0) { // with "content", we only support single prompt @@ -3638,7 +3638,7 @@ int main(int argc, char ** argv) { } // write JSON response - json root = is_openai + json root = oaicompat ? format_embeddings_response_oaicompat(body, responses) : responses[0]; res_ok(res, root); From 25be4ccc89459b10fb60f6e1b87cd12f4d59928d Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Fri, 6 Dec 2024 10:47:52 +0100 Subject: [PATCH 19/19] update docs --- examples/server/README.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/server/README.md b/examples/server/README.md index b2dd7b65a990c..8dbed2626a444 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -473,9 +473,11 @@ Notice that each `probs` is an array of length `n_probs`. - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). - `model`: The path to the model loaded with `-m` - `prompt`: The provided `prompt` -- `stopped_eos`: Indicating whether the completion has stopped because it encountered the EOS token -- `stopped_limit`: Indicating whether the completion stopped because `n_predict` tokens were generated before stop words or EOS was encountered -- `stopped_word`: Indicating whether the completion stopped due to encountering a stopping word from `stop` JSON array provided +- `stop_type`: Indicating whether the completion has stopped. Possible values are: + - `none`: Generating (not stopped) + - `eos`: Stopped because it encountered the EOS token + - `limit`: Stopped because `n_predict` tokens were generated before stop words or EOS was encountered + - `word`: Stopped due to encountering a stopping word from `stop` JSON array provided - `stopping_word`: The stopping word encountered which stopped the generation (or "" if not stopped due to a stopping word) - `timings`: Hash of timing information about the completion such as the number of tokens `predicted_per_second` - `tokens_cached`: Number of tokens from the prompt which could be re-used from previous completion (`n_past`)
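For illustration, here is a minimal client-side sketch of how these fields might be consumed from a non-streaming `/completion` response. It assumes nlohmann::json is available; the response values are made up for the example and are not actual server output.

```cpp
#include <iostream>
#include <string>

#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // hypothetical final response body using the fields documented above
    json res = {
        {"content",       " Blue."},
        {"stop_type",     "word"},  // one of: none, eos, limit, word
        {"stopping_word", "\n"},
        {"tokens_cached", 7},
        {"timings",       {{"predicted_per_second", 42.0}}},
    };

    const std::string stop_type = res.at("stop_type").get<std::string>();
    if (stop_type == "none") {
        std::cout << "still generating\n";
    } else if (stop_type == "limit") {
        std::cout << "stopped: n_predict limit reached\n";
    } else { // "eos" or "word"
        std::cout << "stopped: " << stop_type
                  << ", stopping_word='" << res.value("stopping_word", "") << "'\n";
    }
}
```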