4 changes: 4 additions & 0 deletions cpp/build.sh
@@ -299,6 +299,10 @@ function build() {
mv $DEPS_DIR/../src/examples/libmnist_handler.so $DEPS_DIR/../../test/resources/torchscript_model/mnist/mnist_handler/libmnist_handler.so
fi

if [ -f "$DEPS_DIR/../src/examples/libllamacpp_handler.so" ]; then
mv $DEPS_DIR/../src/examples/libllamacpp_handler.so $DEPS_DIR/../../test/resources/torchscript_model/llamacpp/llamacpp_handler/libllamacpp_handler.so
fi

cd $DEPS_DIR/../..
if [ -f "$DEPS_DIR/../test/torchserve_cpp_test" ]; then
$DEPS_DIR/../test/torchserve_cpp_test
23 changes: 23 additions & 0 deletions cpp/src/examples/CMakeLists.txt
Collaborator:
Would be good to create a CMakeLists.txt in the llamacpp directory and use add_subdirectory() in the main file to avoid the main one getting too crowded.
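A rough sketch of that split, reusing the target and source names already in this PR: the main examples CMakeLists.txt would only need

add_subdirectory(llamacpp)

while cpp/src/examples/llamacpp/CMakeLists.txt would hold the llamacpp-specific targets, e.g.

set(LLM_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/llamacpp")
add_library(llamacpp_handler SHARED ${LLM_SRC_DIR}/llamacpp_handler.cc)
target_include_directories(llamacpp_handler PUBLIC ${LLM_SRC_DIR} ${LLAMACPP_SRC_DIR})
target_link_libraries(llamacpp_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})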

@@ -5,3 +5,26 @@ list(APPEND MNIST_SOURCE_FILES ${MNIST_SRC_DIR}/mnist_handler.cc)
add_library(mnist_handler SHARED ${MNIST_SOURCE_FILES})
target_include_directories(mnist_handler PUBLIC ${MNIST_SRC_DIR})
target_link_libraries(mnist_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})

set(LLM_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/llamacpp")
set(LLAMACPP_SRC_DIR "/home/ubuntu/llama.cpp")
Collaborator:
Good to avoid absolute paths. Is the file included in the PR? What is the license of llama.cpp? Do we need to include the license file?
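One way to make this configurable instead of hard-coding a home directory (a sketch; the cache variable and error message are suggestions, not part of this PR):

set(LLAMACPP_SRC_DIR "" CACHE PATH "Path to a local llama.cpp checkout/build")
if(NOT LLAMACPP_SRC_DIR)
  message(FATAL_ERROR "Configure with -DLLAMACPP_SRC_DIR=/path/to/llama.cpp")
endif()

Alternatively, llama.cpp could be fetched and built as part of this project so no pre-existing local path is needed.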

set(LLM_SOURCE_FILES "")
list(APPEND LLM_SOURCE_FILES ${LLM_SRC_DIR}/llamacpp_handler.cc)
add_library(llamacpp_handler SHARED ${LLM_SOURCE_FILES})
target_include_directories(llamacpp_handler PUBLIC ${LLM_SRC_DIR})
target_include_directories(llamacpp_handler PUBLIC ${LLAMACPP_SRC_DIR})
target_link_libraries(llamacpp_handler PRIVATE ts_backends_torch_scripted ts_utils ${TORCH_LIBRARIES})


set(MY_OBJECT_FILES
Collaborator:
Where are the source files for these object files?
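If llama.cpp were built from source as part of this project rather than consumed as prebuilt objects, the list below could be replaced by linking its CMake targets; a sketch, assuming the llama.cpp checkout exposes llama and common library targets:

add_subdirectory(${LLAMACPP_SRC_DIR} ${CMAKE_CURRENT_BINARY_DIR}/llamacpp_build)
target_link_libraries(llamacpp_handler PRIVATE llama common)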

${LLAMACPP_SRC_DIR}/ggml.o
${LLAMACPP_SRC_DIR}/llama.o
${LLAMACPP_SRC_DIR}/common.o
${LLAMACPP_SRC_DIR}/k_quants.o
${LLAMACPP_SRC_DIR}/ggml-alloc.o
${LLAMACPP_SRC_DIR}/grammar-parser.o
${LLAMACPP_SRC_DIR}/console.o

)

target_sources(llamacpp_handler PRIVATE ${MY_OBJECT_FILES})
5 changes: 5 additions & 0 deletions cpp/src/examples/llamacpp/config.json
@@ -0,0 +1,5 @@
{
"checkpoint_path" : "/home/ubuntu/llama-2-7b-chat.Q4_0.gguf"
Collaborator:
Ditto (avoid absolute paths). Also: how big is this file?

}


296 changes: 296 additions & 0 deletions cpp/src/examples/llamacpp/llamacpp_handler.cc
@@ -0,0 +1,296 @@
#include "src/examples/llamacpp/llamacpp_handler.hh"

#include <torch/script.h>
#include <torch/torch.h>

#include <typeinfo>

namespace llm {

void LlamacppHandler::initialize_context() {
llama_ctx = llama_new_context_with_model(llamamodel, ctx_params);
Collaborator:
Where is this defined?


if (llama_ctx == nullptr) {
std::cerr << "Failed to initialize llama context" << std::endl;
} else {
std::cout << "Context initialized successfully" << std::endl;
}
}

std::pair<std::shared_ptr<torch::jit::script::Module>,
std::shared_ptr<torch::Device>>
LlamacppHandler::LoadModel(
std::shared_ptr<torchserve::LoadModelRequest>& load_model_request) {
try {
auto device = GetTorchDevice(load_model_request);
// Load dummy model
auto module = std::make_shared<torch::jit::script::Module>(
torch::jit::load(fmt::format("{}/{}", load_model_request->model_dir,
manifest_->GetModel().serialized_file),
*device));

const std::string configFilePath =
fmt::format("{}/{}", load_model_request->model_dir, "config.json");
std::string jsonContent;
if (!folly::readFile(configFilePath.c_str(), jsonContent)) {
std::cerr << "config.json not found at: " << configFilePath << std::endl;
throw;
}
folly::dynamic json;
json = folly::parseJson(jsonContent);

std::string checkpoint_path;
if (json.find("checkpoint_path") != json.items().end()) {
checkpoint_path = json["checkpoint_path"].asString();
} else {
std::cerr
<< "Required field 'checkpoint_path' not found in JSON."
<< std::endl;
throw;
}

params.model = checkpoint_path;
params.main_gpu = 0;
params.n_gpu_layers = 35;

llama_backend_init(params.numa);
Collaborator:
Was this parameter initialized?

ctx_params = llama_context_default_params();
llamamodel = llama_load_model_from_file(params.model.c_str(), ctx_params);

return std::make_pair(module, device);
} catch (const c10::Error& e) {
TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}",
load_model_request->model_name, load_model_request->gpu_id,
e.msg());
throw e;
} catch (const std::runtime_error& e) {
TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}",
load_model_request->model_name, load_model_request->gpu_id,
e.what());
throw e;
}
}

std::vector<torch::jit::IValue> LlamacppHandler::Preprocess(
std::shared_ptr<torch::Device>& device,
std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {

initialize_context();
Collaborator:
Would the constructor be a better place for this?
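One consideration: llama_ctx depends on llamamodel and ctx_params, which are only available at the end of LoadModel, so the constructor may be too early. A sketch of an alternative (not part of this PR) that creates the context once per loaded model rather than once per batch:

// at the end of LoadModel(), after the model file has been loaded
llamamodel = llama_load_model_from_file(params.model.c_str(), ctx_params);
initialize_context();  // create llama_ctx once here instead of in Preprocess()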


std::vector<torch::jit::IValue> batch_ivalue;
std::vector<torch::Tensor> batch_tensors;
uint8_t idx = 0;
for (auto& request : *request_batch) {
try {
(*response_batch)[request.request_id] =
std::make_shared<torchserve::InferenceResponse>(request.request_id);
idx_to_req_id.first += idx_to_req_id.first.empty()
? request.request_id
: "," + request.request_id;

auto data_it = request.parameters.find(
torchserve::PayloadType::kPARAMETER_NAME_DATA);
auto dtype_it =
request.headers.find(torchserve::PayloadType::kHEADER_NAME_DATA_TYPE);
if (data_it == request.parameters.end()) {
data_it = request.parameters.find(
torchserve::PayloadType::kPARAMETER_NAME_BODY);
dtype_it = request.headers.find(
torchserve::PayloadType::kHEADER_NAME_BODY_TYPE);
}

if (data_it == request.parameters.end() ||
dtype_it == request.headers.end()) {
TS_LOGF(ERROR, "Empty payload for request id: {}", request.request_id);
(*response_batch)[request.request_id]->SetResponse(
500, "data_type", torchserve::PayloadType::kCONTENT_TYPE_TEXT,
"Empty payload");
continue;
}

std::string msg = torchserve::Converter::VectorToStr(data_it->second);

// tokenization

std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(llama_ctx, msg, true);

// const int max_context_size = llama_n_ctx(ctx);
const int max_tokens_list_size = max_context_size - 4;
Collaborator:
Always good to give magic numbers a name to clarify their purpose.
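For example (the constant name is just a suggestion):

constexpr int kTokensReservedForGeneration = 4;  // room kept free below max_context_size, as in the original "- 4"
const int max_tokens_list_size = max_context_size - kTokensReservedForGeneration;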


if ((int)tokens_list.size() > max_tokens_list_size) {
std::cout << __func__ << ": error: prompt too long ("
<< tokens_list.size() << " tokens, max "
<< max_tokens_list_size << ")\n";
}

// Print the tokens from the prompt :
std::vector<torch::Tensor> tensor_vector;
for (auto id : tokens_list) {
torch::Tensor tensor = torch::tensor(id, torch::kInt64);
tensor_vector.push_back(tensor);
}

torch::Tensor stacked_tensor = torch::stack(tensor_vector);
batch_ivalue.push_back(stacked_tensor);
idx_to_req_id.second[idx++] = request.request_id;

} catch (const std::runtime_error& e) {
TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
request.request_id, e.what());
auto response = (*response_batch)[request.request_id];
response->SetResponse(500, "data_type",
torchserve::PayloadType::kDATA_TYPE_STRING,
"runtime_error, failed to load tensor");
} catch (const c10::Error& e) {
TS_LOGF(ERROR, "Failed to load tensor for request id: {}, c10 error: {}",
request.request_id, e.msg());
auto response = (*response_batch)[request.request_id];
response->SetResponse(500, "data_type",
torchserve::PayloadType::kDATA_TYPE_STRING,
"c10 error, failed to load tensor");
}
}

return batch_ivalue;
}

torch::Tensor LlamacppHandler::Inference(
std::shared_ptr<torch::jit::script::Module> model,
std::vector<torch::jit::IValue>& inputs,
std::shared_ptr<torch::Device>& device,
std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
auto tokens_list_tensor = inputs[0].toTensor();
Collaborator:
Can you implement processing of the whole batch? Serial processing in a for loop would be fine for now; batched processing would be even better if llama.cpp supports it.
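A serialized sketch of that suggestion; generate_one() is a hypothetical helper wrapping the single-sequence generation loop below, and how the variable-length results are combined (padding, concatenation) is left open:

std::vector<torch::Tensor> generated_per_request;
for (size_t i = 0; i < inputs.size(); ++i) {
  // generate_one() is hypothetical: it would run the eval/sample loop below
  // for one request, ideally with its own context or a reset KV cache.
  generated_per_request.push_back(generate_one(inputs[i].toTensor()));
}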


int64_t num_elements = tokens_list_tensor.numel();

// Convert the tensor to a vector of long values
std::vector<long> long_vector;
long_vector.reserve(num_elements);

auto data_ptr = tokens_list_tensor.data_ptr<int64_t>();
for (int64_t i = 0; i < num_elements; ++i) {
long_vector.push_back(data_ptr[i]);
}

std::vector<llama_token> tokens_list;

for (auto id : long_vector) {
Collaborator:
Why do we jump through so many loops here? Can't we write tokens_list directly from the tensor? Or can we create an array that uses data_ptr as the underlying storage without making a copy?
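For instance, the intermediate long_vector could be dropped entirely (a sketch, assuming a contiguous CPU int64 tensor whose ids fit into llama_token):

auto data_ptr = tokens_list_tensor.data_ptr<int64_t>();
std::vector<llama_token> tokens_list(data_ptr, data_ptr + tokens_list_tensor.numel());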

tokens_list.push_back(id);
}
const int n_gen = std::min(32, max_context_size);

while (llama_get_kv_cache_token_count(llama_ctx) < n_gen) {
Collaborator:
Do I read this correctly that the maximum number of tokens (including context) will be 32?
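If the intent is "32 new tokens on top of the prompt", a sketch that makes that explicit (names are suggestions):

const int max_new_tokens = 32;
const int n_gen = std::min((int)tokens_list.size() + max_new_tokens, max_context_size);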

// evaluate the transformer

if (llama_eval(llama_ctx, tokens_list.data(), int(tokens_list.size()),
llama_get_kv_cache_token_count(llama_ctx),
params.n_threads)) {
std::cout << "Failed to eval\n" << __func__ << std::endl;
break;
}

tokens_list.clear();

// sample the next token

llama_token new_token_id = 0;

auto logits = llama_get_logits(llama_ctx);
auto n_vocab = llama_n_vocab(llama_ctx);
Collaborator:
Good to avoid auto for primitive data types when readability does not suffer.
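For example (assuming the usual llama.cpp return types here):

float* logits = llama_get_logits(llama_ctx);
int n_vocab = llama_n_vocab(llama_ctx);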


std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);

for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(
llama_token_data{token_id, logits[token_id], 0.0f});
}

llama_token_data_array candidates_p = {candidates.data(), candidates.size(),
false};

new_token_id = llama_sample_token_greedy(llama_ctx, &candidates_p);

// is it an end of stream ?
if (new_token_id == llama_token_eos(llama_ctx)) {
std::cout << "Reached [end of text]\n";
break;
}

// push this new token for next evaluation
tokens_list.push_back(new_token_id);
}

std::vector<torch::Tensor> tensor_vector;
for (auto id : tokens_list) {
torch::Tensor tensor = torch::tensor(id, torch::kLong);
tensor_vector.push_back(tensor);
}

torch::Tensor stacked_tensor = torch::stack(tensor_vector);
llama_print_timings(llama_ctx);
llama_free(llama_ctx);
Collaborator:
Can the model be reused? If so, this should be moved into the destructor.
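A sketch of that idea, assuming the pointer members are initialized to nullptr and that llama.cpp's llama_free_model() is available for the model handle:

~LlamacppHandler() override {
  if (llama_ctx != nullptr) {
    llama_free(llama_ctx);  // free the context once, when the handler goes away
  }
  if (llamamodel != nullptr) {
    llama_free_model(llamamodel);
  }
}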

return stacked_tensor;
}

void LlamacppHandler::Postprocess(
const torch::Tensor& data,
std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch) {
for (const auto& kv : idx_to_req_id.second) {
try {
int64_t num_elements = data.numel();

// Convert the tensor to a vector of long values
std::stringstream generated_text_stream;

auto data_ptr = data.data_ptr<int64_t>();
for (int64_t i = 0; i < num_elements; ++i) {
generated_text_stream << llama_token_to_piece(llama_ctx, data_ptr[i]);
}

std::string generated_text_str = generated_text_stream.str();

auto response = (*response_batch)[kv.second];

response->SetResponse(200, "data_type",
torchserve::PayloadType::kDATA_TYPE_STRING,
generated_text_str);
} catch (const std::runtime_error& e) {
TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}",
kv.second, e.what());
auto response = (*response_batch)[kv.second];
response->SetResponse(500, "data_type",
torchserve::PayloadType::kDATA_TYPE_STRING,
"runtime_error, failed to postprocess tensor");
} catch (const c10::Error& e) {
TS_LOGF(ERROR,
"Failed to postprocess tensor for request id: {}, error: {}",
kv.second, e.msg());
auto response = (*response_batch)[kv.second];
response->SetResponse(500, "data_type",
torchserve::PayloadType::kDATA_TYPE_STRING,
"c10 error, failed to postprocess tensor");
}
}
}

} // namespace llm

#if defined(__linux__) || defined(__APPLE__)
extern "C" {
torchserve::torchscripted::BaseHandler* allocatorLlamacppHandler() {
return new llm::LlamacppHandler();
}

void deleterLlamacppHandler(torchserve::torchscripted::BaseHandler* p) {
if (p != nullptr) {
delete static_cast<llm::LlamacppHandler*>(p);
}
}
}
#endif
55 changes: 55 additions & 0 deletions cpp/src/examples/llamacpp/llamacpp_handler.hh
@@ -0,0 +1,55 @@
#ifndef LLAMACPP_HANDLER_HH_
#define LLAMACPP_HANDLER_HH_

#include <folly/FileUtil.h>
#include <folly/json.h>

#include "common/common.h"
#include "ggml.h"
#include "llama.h"
#include "src/backends/torch_scripted/handler/base_handler.hh"

namespace llm {
class LlamacppHandler : public torchserve::torchscripted::BaseHandler {
private:
gpt_params params;
llama_model* llamamodel;
llama_context_params ctx_params;
llama_context* llama_ctx;
const int max_context_size = 32;

public:
// NOLINTBEGIN(bugprone-exception-escape)
LlamacppHandler() = default;
// NOLINTEND(bugprone-exception-escape)
~LlamacppHandler() override = default;

void initialize_context();

virtual std::pair<std::shared_ptr<torch::jit::script::Module>,
std::shared_ptr<torch::Device>>
LoadModel(std::shared_ptr<torchserve::LoadModelRequest>& load_model_request);

std::vector<torch::jit::IValue> Preprocess(
std::shared_ptr<torch::Device>& device,
std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
std::shared_ptr<torchserve::InferenceRequestBatch>& request_batch,
std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
override;

torch::Tensor Inference(
std::shared_ptr<torch::jit::script::Module> model,
std::vector<torch::jit::IValue>& inputs,
std::shared_ptr<torch::Device>& device,
std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
override;

void Postprocess(
const torch::Tensor& data,
std::pair<std::string&, std::map<uint8_t, std::string>&>& idx_to_req_id,
std::shared_ptr<torchserve::InferenceResponseBatch>& response_batch)
override;
};
} // namespace llm
#endif // LLAMACPP_HANDLER_HH_