diff --git a/.gitmodules b/.gitmodules index e60c11fbfb..5ff9ad429d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third_party/google/rpc"] path = third_party/google/rpc url = https://github.com/googleapis/googleapis.git +[submodule "cpp/third-party/llama.cpp"] + path = cpp/third-party/llama.cpp + url = https://github.com/ggerganov/llama.cpp.git diff --git a/cpp/README.md b/cpp/README.md index 3a8454b501..4f7dd53318 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -49,7 +49,7 @@ By default, TorchServe cpp provides a handler for TorchScript [src/backends/hand ``` torch-model-archiver --model-name mnist_base --version 1.0 --serialized-file mnist_script.pt --handler TorchScriptHandler --runtime LSP ``` -Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/torchscript_model/mnist/base_handler) of unzipped model mar file. +Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/examples/mnist/base_handler) of an unzipped model mar file. ##### Using Custom Handler * build customized handler shared lib. For example [Mnist handler](https://github.com/pytorch/serve/blob/cpp_backend/cpp/src/examples/image_classifier/mnist). * set runtime as "LSP" in model archiver option [--runtime](https://github.com/pytorch/serve/tree/master/model-archiver#arguments) @@ -57,7 +57,7 @@ Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/ ``` torch-model-archiver --model-name mnist_handler --version 1.0 --serialized-file mnist_script.pt --handler libmnist_handler:MnistHandler --runtime LSP ``` -Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/torchscript_model/mnist/mnist_handler) of unzipped model mar file. +Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/examples/mnist/mnist_handler) of an unzipped model mar file. ##### BabyLLama Example The babyllama example can be found [here](https://github.com/pytorch/serve/blob/master/cpp/src/examples/babyllama/). To run the example we need to download the weights as well as tokenizer files: @@ -65,7 +65,7 @@ To run the example we need to download the weights as well as tokenizer files: wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin ``` -Subsequently, we need to adjust the paths according to our local file structure in [config.json](https://github.com/pytorch/serve/blob/master/serve/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.json). +Subsequently, we need to adjust the paths according to our local file structure in [config.json](https://github.com/pytorch/serve/blob/master/cpp/test/resources/examples/babyllama/babyllama_handler/config.json). 
```bash { "checkpoint_path" : "/home/ubuntu/serve/cpp/stories15M.bin", @@ -74,7 +74,7 @@ Subsequently, we need to adjust the paths according to our local file structure ``` Then we can create the mar file and deploy it with: ```bash -cd serve/cpp/test/resources/torchscript_model/babyllama/babyllama_handler +cd serve/cpp/test/resources/examples/babyllama/babyllama_handler torch-model-archiver --model-name llm --version 1.0 --handler libbabyllama_handler:BabyLlamaHandler --runtime LSP --extra-files config.json mkdir model_store && mv llm.mar model_store/ torchserve --ncs --start --model-store model_store @@ -85,7 +85,7 @@ The handler name `libbabyllama_handler:BabyLlamaHandler` consists of our shared To test the model we can run: ```bash -cd serve/cpp/test/resources/torchscript_model/babyllama/ +cd serve/cpp/test/resources/examples/babyllama/ curl http://localhost:8080/predictions/llm -T prompt.txt ``` ##### Mnist example diff --git a/cpp/build.sh b/cpp/build.sh index ca0eecf765..165cf17cbb 100755 --- a/cpp/build.sh +++ b/cpp/build.sh @@ -136,6 +136,14 @@ function install_yaml_cpp() { cd "$BWD" || exit } +function build_llama_cpp() { + BWD=$(pwd) + LLAMA_CPP_SRC_DIR=$BASE_DIR/third-party/llama.cpp + cd "${LLAMA_CPP_SRC_DIR}" + make + cd "$BWD" || exit +} + function build() { MAYBE_BUILD_QUIC="" if [ "$WITH_QUIC" == true ] ; then @@ -206,16 +214,6 @@ function build() { echo -e "${COLOR_GREEN}torchserve_cpp build is complete. To run unit test: \ ./_build/test/torchserve_cpp_test ${COLOR_OFF}" - if [ -f "$DEPS_DIR/../src/examples/libmnist_handler.dylib" ]; then - mv $DEPS_DIR/../src/examples/libmnist_handler.dylib $DEPS_DIR/../../test/resources/torchscript_model/mnist/mnist_handler/libmnist_handler.dylib - elif [ -f "$DEPS_DIR/../src/examples/libmnist_handler.so" ]; then - mv $DEPS_DIR/../src/examples/libmnist_handler.so $DEPS_DIR/../../test/resources/torchscript_model/mnist/mnist_handler/libmnist_handler.so - fi - - if [ -f "$DEPS_DIR/../src/examples/libbabyllama_handler.so" ]; then - mv $DEPS_DIR/../src/examples/libbabyllama_handler.so $DEPS_DIR/../../test/resources/torchscript_model/babyllama/babyllama_handler/libbabyllama_handler.so - fi - cd $DEPS_DIR/../.. 
if [ -f "$DEPS_DIR/../test/torchserve_cpp_test" ]; then $DEPS_DIR/../test/torchserve_cpp_test @@ -311,10 +309,13 @@ mkdir -p "$LIBS_DIR" # Must execute from the directory containing this script cd $BASE_DIR +git submodule update --init --recursive + install_folly install_kineto install_libtorch install_yaml_cpp +build_llama_cpp build symlink_torch_libs symlink_yaml_cpp_lib diff --git a/cpp/src/examples/CMakeLists.txt b/cpp/src/examples/CMakeLists.txt index d5402a5faa..a313616270 100644 --- a/cpp/src/examples/CMakeLists.txt +++ b/cpp/src/examples/CMakeLists.txt @@ -1,16 +1,6 @@ -set(MNIST_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/image_classifier/mnist") -set(MNIST_SOURCE_FILES "") -list(APPEND MNIST_SOURCE_FILES ${MNIST_SRC_DIR}/mnist_handler.cc) -add_library(mnist_handler SHARED ${MNIST_SOURCE_FILES}) -target_include_directories(mnist_handler PUBLIC ${MNIST_SRC_DIR}) -target_link_libraries(mnist_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) +add_subdirectory("../../../examples/cpp/babyllama/" "../../../test/resources/examples/babyllama/babyllama_handler/") +add_subdirectory("../../../examples/cpp/llamacpp/" "../../../test/resources/examples/llamacpp/llamacpp_handler/") -set(BABYLLAMA_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/babyllama") -set(BABYLLAMA_SOURCE_FILES "") -list(APPEND BABYLLAMA_SOURCE_FILES ${BABYLLAMA_SRC_DIR}/baby_llama_handler.cc) -add_library(babyllama_handler SHARED ${BABYLLAMA_SOURCE_FILES}) -target_include_directories(babyllama_handler PUBLIC ${BABYLLAMA_SRC_DIR}) -target_link_libraries(babyllama_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) -target_compile_options(babyllama_handler PRIVATE -Wall -Wextra -Ofast) +add_subdirectory("../../../examples/cpp/mnist/" "../../../test/resources/examples/mnist/mnist_handler/") diff --git a/cpp/test/backends/otf_protocol_and_handler_test.cc b/cpp/test/backends/otf_protocol_and_handler_test.cc index cc0d7960ec..89e70205a9 100644 --- a/cpp/test/backends/otf_protocol_and_handler_test.cc +++ b/cpp/test/backends/otf_protocol_and_handler_test.cc @@ -24,7 +24,7 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { // model_name length .WillOnce(::testing::Return(5)) // model_path length - .WillOnce(::testing::Return(51)) + .WillOnce(::testing::Return(42)) // batch_size .WillOnce(::testing::Return(1)) // handler length @@ -44,9 +44,8 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { strncpy(data, "mnist", length); })) .WillOnce(testing::Invoke([=](size_t length, char* data) { - ASSERT_EQ(length, 51); - strncpy(data, "test/resources/torchscript_model/mnist/base_handler", - length); + ASSERT_EQ(length, 42); + strncpy(data, "test/resources/examples/mnist/base_handler", length); })) .WillOnce(testing::Invoke([=](size_t length, char* data) { ASSERT_EQ(length, 11); @@ -60,7 +59,7 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { EXPECT_CALL(*client_socket, SendAll(testing::_, testing::_)).Times(1); auto load_model_request = OTFMessage::RetrieveLoadMsg(*client_socket); ASSERT_EQ(load_model_request->model_dir, - "test/resources/torchscript_model/mnist/base_handler"); + "test/resources/examples/mnist/base_handler"); ASSERT_EQ(load_model_request->model_name, "mnist"); ASSERT_EQ(load_model_request->envelope, ""); ASSERT_EQ(load_model_request->model_name, "mnist"); @@ -71,7 +70,7 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { auto backend = std::make_shared(); MetricsRegistry::Initialize("test/resources/metrics/default_config.yaml", MetricsContext::BACKEND); - 
backend->Initialize("test/resources/torchscript_model/mnist/base_handler"); + backend->Initialize("test/resources/examples/mnist/base_handler"); // load the model auto load_model_response = backend->LoadModel(load_model_request); @@ -126,7 +125,7 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { .WillOnce(testing::Invoke([=](size_t length, char* data) { ASSERT_EQ(length, 3883); // strncpy(data, "valu", length); - std::ifstream input("test/resources/torchscript_model/mnist/0_png.pt", + std::ifstream input("test/resources/examples/mnist/0_png.pt", std::ios::in | std::ios::binary); std::vector image((std::istreambuf_iterator(input)), (std::istreambuf_iterator())); diff --git a/cpp/test/examples/examples_test.cc b/cpp/test/examples/examples_test.cc index f3a1d4b231..22254288cc 100644 --- a/cpp/test/examples/examples_test.cc +++ b/cpp/test/examples/examples_test.cc @@ -1,10 +1,38 @@ +#include + #include "test/utils/common.hh" TEST_F(ModelPredictTest, TestLoadPredictBabyLlamaHandler) { + std::string base_dir = "test/resources/examples/babyllama/"; + std::string file1 = base_dir + "babyllama_handler/stories15M.bin"; + std::string file2 = base_dir + "babyllama_handler/tokenizer.bin"; + + std::ifstream f1(file1); + std::ifstream f2(file2); + + if (!f1.good() && !f2.good()) + GTEST_SKIP() + << "Skipping TestLoadPredictBabyLlamaHandler because of missing files: " + << file1 << " or " << file2; + + this->LoadPredict( + std::make_shared( + base_dir + "babyllama_handler", "llm", -1, "", "", 1, false), + base_dir + "babyllama_handler", base_dir + "prompt.txt", "llm_ts", 200); +} + +TEST_F(ModelPredictTest, TestLoadPredictLlmHandler) { + std::string base_dir = "test/resources/examples/llamacpp/"; + std::string file1 = base_dir + "llamacpp_handler/llama-2-7b-chat.Q5_0.gguf"; + std::ifstream f(file1); + + if (!f.good()) + GTEST_SKIP() + << "Skipping TestLoadPredictLlmHandler because of missing file: " + << file1; + this->LoadPredict( std::make_shared( - "test/resources/torchscript_model/babyllama/babyllama_handler", "llm", - -1, "", "", 1, false), - "test/resources/torchscript_model/babyllama/babyllama_handler", - "test/resources/torchscript_model/babyllama/prompt.txt", "llm_ts", 200); + base_dir + "llamacpp_handler", "llamacpp", -1, "", "", 1, false), + base_dir + "llamacpp_handler", base_dir + "prompt.txt", "llm_ts", 200); } diff --git a/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/babyllama/babyllama_handler/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/babyllama/babyllama_handler/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/babyllama/babyllama_handler/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/examples/babyllama/babyllama_handler/config.json b/cpp/test/resources/examples/babyllama/babyllama_handler/config.json new file mode 100644 index 0000000000..f75cd1fb53 --- /dev/null +++ b/cpp/test/resources/examples/babyllama/babyllama_handler/config.json @@ -0,0 +1,4 @@ +{ +"checkpoint_path" : "test/resources/examples/babyllama/babyllama_handler/stories15M.bin", +"tokenizer_path" : "test/resources/examples/babyllama/babyllama_handler/tokenizer.bin" +} diff --git a/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.properties b/cpp/test/resources/examples/babyllama/babyllama_handler/config.properties similarity index 100% rename from cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.properties rename to 
cpp/test/resources/examples/babyllama/babyllama_handler/config.properties diff --git a/cpp/test/resources/torchscript_model/babyllama/prompt.txt b/cpp/test/resources/examples/babyllama/prompt.txt similarity index 100% rename from cpp/test/resources/torchscript_model/babyllama/prompt.txt rename to cpp/test/resources/examples/babyllama/prompt.txt diff --git a/cpp/test/resources/examples/llamacpp/llamacpp_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/llamacpp/llamacpp_handler/MAR-INF/MANIFEST.json new file mode 100644 index 0000000000..d1e57eaddc --- /dev/null +++ b/cpp/test/resources/examples/llamacpp/llamacpp_handler/MAR-INF/MANIFEST.json @@ -0,0 +1,10 @@ +{ + "createdOn": "28/07/2020 06:32:08", + "runtime": "LSP", + "model": { + "modelName": "llamacpp", + "handler": "libllamacpp_handler:LlamaCppHandler", + "modelVersion": "2.0" + }, + "archiverVersion": "0.2.0" +} diff --git a/cpp/test/resources/torchscript_model/mnist/0.png b/cpp/test/resources/examples/mnist/0.png similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/0.png rename to cpp/test/resources/examples/mnist/0.png diff --git a/cpp/test/resources/torchscript_model/mnist/0_png.pt b/cpp/test/resources/examples/mnist/0_png.pt similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/0_png.pt rename to cpp/test/resources/examples/mnist/0_png.pt diff --git a/cpp/test/resources/torchscript_model/mnist/base_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/mnist/base_handler/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/base_handler/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/mnist/base_handler/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/torchscript_model/mnist/base_handler/mnist_script.pt b/cpp/test/resources/examples/mnist/base_handler/mnist_script.pt similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/base_handler/mnist_script.pt rename to cpp/test/resources/examples/mnist/base_handler/mnist_script.pt diff --git a/cpp/test/resources/torchscript_model/mnist/mnist_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/mnist/mnist_handler/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/mnist_handler/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/mnist/mnist_handler/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/torchscript_model/mnist/mnist_handler/mnist_script.pt b/cpp/test/resources/examples/mnist/mnist_handler/mnist_script.pt similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/mnist_handler/mnist_script.pt rename to cpp/test/resources/examples/mnist/mnist_handler/mnist_script.pt diff --git a/cpp/test/resources/torchscript_model/mnist/wrong_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/mnist/wrong_handler/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/wrong_handler/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/mnist/wrong_handler/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/torchscript_model/mnist/wrong_model/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/mnist/wrong_model/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/wrong_model/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/mnist/wrong_model/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.json 
b/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.json deleted file mode 100644 index 2030358b84..0000000000 --- a/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ -"checkpoint_path" : "/home/ubuntu/serve/cpp/stories15M.bin", -"tokenizer_path" : "/home/ubuntu/serve/cpp/src/examples/babyllama/tokenizer.bin" -} - diff --git a/cpp/test/torch_scripted/torch_scripted_test.cc b/cpp/test/torch_scripted/torch_scripted_test.cc index cc6806d5c7..ecb1d7f69f 100644 --- a/cpp/test/torch_scripted/torch_scripted_test.cc +++ b/cpp/test/torch_scripted/torch_scripted_test.cc @@ -9,47 +9,44 @@ TEST_F(ModelPredictTest, TestLoadPredictBaseHandler) { this->LoadPredict(std::make_shared( - "test/resources/torchscript_model/mnist/mnist_handler", + "test/resources/examples/mnist/mnist_handler", "mnist_scripted_v2", -1, "", "", 1, false), - "test/resources/torchscript_model/mnist/base_handler", - "test/resources/torchscript_model/mnist/0_png.pt", - "mnist_ts", 200); + "test/resources/examples/mnist/base_handler", + "test/resources/examples/mnist/0_png.pt", "mnist_ts", 200); } TEST_F(ModelPredictTest, TestLoadPredictMnistHandler) { this->LoadPredict(std::make_shared( - "test/resources/torchscript_model/mnist/mnist_handler", + "test/resources/examples/mnist/mnist_handler", "mnist_scripted_v2", -1, "", "", 1, false), - "test/resources/torchscript_model/mnist/mnist_handler", - "test/resources/torchscript_model/mnist/0_png.pt", - "mnist_ts", 200); + "test/resources/examples/mnist/mnist_handler", + "test/resources/examples/mnist/0_png.pt", "mnist_ts", 200); } TEST_F(ModelPredictTest, TestBackendInitWrongModelDir) { - auto result = backend_->Initialize("test/resources/torchscript_model/mnist"); + auto result = backend_->Initialize("test/resources/examples/mnist"); ASSERT_EQ(result, false); } TEST_F(ModelPredictTest, TestBackendInitWrongHandler) { - auto result = backend_->Initialize( - "test/resources/torchscript_model/mnist/wrong_handler"); + auto result = + backend_->Initialize("test/resources/examples/mnist/wrong_handler"); ASSERT_EQ(result, false); } TEST_F(ModelPredictTest, TestLoadModelFailure) { - backend_->Initialize("test/resources/torchscript_model/mnist/wrong_model"); + backend_->Initialize("test/resources/examples/mnist/wrong_model"); auto result = backend_->LoadModel(std::make_shared( - "test/resources/torchscript_model/mnist/wrong_model", - "mnist_scripted_v2", -1, "", "", 1, false)); + "test/resources/examples/mnist/wrong_model", "mnist_scripted_v2", -1, + "", "", 1, false)); ASSERT_EQ(result->code, 500); } TEST_F(ModelPredictTest, TestLoadPredictMnistHandlerFailure) { this->LoadPredict(std::make_shared( - "test/resources/torchscript_model/mnist/mnist_handler", + "test/resources/examples/mnist/mnist_handler", "mnist_scripted_v2", -1, "", "", 1, false), - "test/resources/torchscript_model/mnist/mnist_handler", - "test/resources/torchscript_model/mnist/0.png", "mnist_ts", - 500); + "test/resources/examples/mnist/mnist_handler", + "test/resources/examples/mnist/0.png", "mnist_ts", 500); } diff --git a/cpp/test/utils/model_archiver_test.cc b/cpp/test/utils/model_archiver_test.cc index ea3f5082a2..596048e266 100644 --- a/cpp/test/utils/model_archiver_test.cc +++ b/cpp/test/utils/model_archiver_test.cc @@ -6,7 +6,7 @@ namespace torchserve { TEST(ManifestTest, TestInitialize) { torchserve::Manifest manifest; manifest.Initialize( - "test/resources/torchscript_model/mnist/base_handler/MAR-INF/" + 
"test/resources/examples/mnist/base_handler/MAR-INF/" "MANIFEST.json"); ASSERT_EQ(manifest.GetCreatOn(), "28/07/2020 06:32:08"); ASSERT_EQ(manifest.GetArchiverVersion(), "0.2.0"); diff --git a/cpp/third-party/llama.cpp b/cpp/third-party/llama.cpp new file mode 160000 index 0000000000..cd4fddb29f --- /dev/null +++ b/cpp/third-party/llama.cpp @@ -0,0 +1 @@ +Subproject commit cd4fddb29f81d6a1f6d51a0c016bc6b486d68def diff --git a/examples/cpp/babyllama/CMakeLists.txt b/examples/cpp/babyllama/CMakeLists.txt new file mode 100644 index 0000000000..4da9bbf60d --- /dev/null +++ b/examples/cpp/babyllama/CMakeLists.txt @@ -0,0 +1,5 @@ + +add_library(babyllama_handler SHARED src/baby_llama_handler.cc) + +target_link_libraries(babyllama_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) +target_compile_options(babyllama_handler PRIVATE -Wall -Wextra -Ofast) diff --git a/examples/cpp/babyllama/config.json b/examples/cpp/babyllama/config.json new file mode 100644 index 0000000000..81ddd2c27d --- /dev/null +++ b/examples/cpp/babyllama/config.json @@ -0,0 +1,4 @@ +{ +"checkpoint_path" : "/home/ubuntu/serve/examples/cpp/babyllama/stories15M.bin", +"tokenizer_path" : "/home/ubuntu/serve/examples/cpp/babyllama/tokenizer.bin" +} diff --git a/cpp/src/examples/babyllama/baby_llama_handler.cc b/examples/cpp/babyllama/src/baby_llama_handler.cc similarity index 98% rename from cpp/src/examples/babyllama/baby_llama_handler.cc rename to examples/cpp/babyllama/src/baby_llama_handler.cc index 62980e5f78..0d3b2b5491 100644 --- a/cpp/src/examples/babyllama/baby_llama_handler.cc +++ b/examples/cpp/babyllama/src/baby_llama_handler.cc @@ -1,11 +1,11 @@ -#include "src/examples/babyllama/baby_llama_handler.hh" +#include "baby_llama_handler.hh" #include #include #include -#include "src/examples/babyllama/llama2.c/run.c" +#include "llama2.c/run.c" namespace llm { @@ -233,7 +233,6 @@ c10::IValue BabyLlamaHandler::Inference( } catch (...) { TS_LOG(ERROR, "Failed to run inference on this batch"); } - std::cout << "WOOT?" 
<< std::endl; return batch_output_vector; } diff --git a/cpp/src/examples/babyllama/baby_llama_handler.hh b/examples/cpp/babyllama/src/baby_llama_handler.hh similarity index 100% rename from cpp/src/examples/babyllama/baby_llama_handler.hh rename to examples/cpp/babyllama/src/baby_llama_handler.hh diff --git a/cpp/src/examples/babyllama/llama2.c/LICENSE b/examples/cpp/babyllama/src/llama2.c/LICENSE similarity index 100% rename from cpp/src/examples/babyllama/llama2.c/LICENSE rename to examples/cpp/babyllama/src/llama2.c/LICENSE diff --git a/cpp/src/examples/babyllama/llama2.c/run.c b/examples/cpp/babyllama/src/llama2.c/run.c similarity index 100% rename from cpp/src/examples/babyllama/llama2.c/run.c rename to examples/cpp/babyllama/src/llama2.c/run.c diff --git a/examples/cpp/llamacpp/CMakeLists.txt b/examples/cpp/llamacpp/CMakeLists.txt new file mode 100644 index 0000000000..e071167585 --- /dev/null +++ b/examples/cpp/llamacpp/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLAMACPP_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/third-party/llama.cpp") + +add_library(llamacpp_handler SHARED src/llamacpp_handler.cc) + +set(MY_OBJECT_FILES + ${LLAMACPP_SRC_DIR}/ggml.o + ${LLAMACPP_SRC_DIR}/llama.o + ${LLAMACPP_SRC_DIR}/common.o + ${LLAMACPP_SRC_DIR}/ggml-quants.o + ${LLAMACPP_SRC_DIR}/ggml-alloc.o + ${LLAMACPP_SRC_DIR}/grammar-parser.o + ${LLAMACPP_SRC_DIR}/console.o + ${LLAMACPP_SRC_DIR}/build-info.o + ${LLAMACPP_SRC_DIR}/ggml-backend.o + +) + +target_sources(llamacpp_handler PRIVATE ${MY_OBJECT_FILES}) +target_include_directories(llamacpp_handler PUBLIC ${LLAMACPP_SRC_DIR}) +target_link_libraries(llamacpp_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) diff --git a/examples/cpp/llamacpp/README.md b/examples/cpp/llamacpp/README.md new file mode 100644 index 0000000000..8221262858 --- /dev/null +++ b/examples/cpp/llamacpp/README.md @@ -0,0 +1,83 @@ +This example uses [llama.cpp](https://github.com/ggerganov/llama.cpp) to deploy a Llama-2-7B-Chat model using the TorchServe C++ backend. +The handler C++ source code for this example can be found [here](src/). + +### Setup +1. Follow the instructions in [README.md](../../../cpp/README.md) to build the TorchServe C++ backend. + +```bash +cd ~/serve/cpp +./build.sh +``` + +2. Download the model + +```bash +cd ~/serve/examples/cpp/llamacpp +curl -L https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_0.gguf?download=true -o llama-2-7b-chat.Q5_0.gguf +``` + +3. Create a [config.json](config.json) with the path to the downloaded model weights: + +```bash +echo '{ +"checkpoint_path" : "/home/ubuntu/serve/examples/cpp/llamacpp/llama-2-7b-chat.Q5_0.gguf" +}' > config.json +``` + +4. Copy the handler .so file + +While building the C++ backend, the `libllamacpp_handler.so` file is generated in the [llamacpp_handler](../../../cpp/test/resources/examples/llamacpp/llamacpp_handler) folder. 
+ +```bash +cp ../../../cpp/test/resources/examples/llamacpp/llamacpp_handler/libllamacpp_handler.so ./ +``` + +### Generate MAR file + +Now let's generate the mar file + +```bash +torch-model-archiver --model-name llm --version 1.0 --handler libllamacpp_handler:LlamaCppHandler --runtime LSP --extra-files config.json +``` + +Create a model store directory and move the mar file + +``` +mkdir model_store +mv llm.mar model_store/ +``` + +### Inference + +Start torchserve using the following command + +``` +torchserve --ncs --start --model-store model_store/ +``` + +Register the model using the following command + +``` +curl -v -X POST "http://localhost:8081/models?initial_workers=1&url=llm.mar&batch_size=2&max_batch_delay=5000" +``` + +Run inference using the following command + +``` +curl http://localhost:8080/predictions/llm -T prompt1.txt +``` + +This example supports batching. To run batch prediction, run the following command + +``` +curl http://localhost:8080/predictions/llm -T prompt1.txt & curl http://localhost:8080/predictions/llm -T prompt2.txt & +``` + +Sample Response + +``` +Hello my name is Daisy everybody loves me + I am a sweet and loving person + I have a big heart and I am always willing to help others + I am a good +``` diff --git a/examples/cpp/llamacpp/config.json b/examples/cpp/llamacpp/config.json new file mode 100644 index 0000000000..62ccfff1e9 --- /dev/null +++ b/examples/cpp/llamacpp/config.json @@ -0,0 +1,3 @@ +{ +"checkpoint_path" : "/home/ubuntu/examples/cpp/llamacpp/llama-2-7b-chat.Q4_0.gguf" +} diff --git a/examples/cpp/llamacpp/prompt1.txt b/examples/cpp/llamacpp/prompt1.txt new file mode 100644 index 0000000000..74b56be151 --- /dev/null +++ b/examples/cpp/llamacpp/prompt1.txt @@ -0,0 +1 @@ +Hello my name is diff --git a/examples/cpp/llamacpp/prompt2.txt b/examples/cpp/llamacpp/prompt2.txt new file mode 100644 index 0000000000..99568648e9 --- /dev/null +++ b/examples/cpp/llamacpp/prompt2.txt @@ -0,0 +1 @@ +Hello my name is Daisy diff --git a/examples/cpp/llamacpp/src/llamacpp_handler.cc b/examples/cpp/llamacpp/src/llamacpp_handler.cc new file mode 100644 index 0000000000..6917ee44cf --- /dev/null +++ b/examples/cpp/llamacpp/src/llamacpp_handler.cc @@ -0,0 +1,297 @@ +#include "llamacpp_handler.hh" + +#include +#include + +#include + +namespace llm { + +void LlamaCppHandler::initialize_context() { + llama_ctx = llama_new_context_with_model(llamamodel, ctx_params); + + if (llama_ctx == nullptr) { + TS_LOG(ERROR, "Failed to initialize llama context"); + } else { + TS_LOG(ERROR, "Context initialized successfully"); + } +} + +std::pair, std::shared_ptr> +LlamaCppHandler::LoadModel( + std::shared_ptr& load_model_request) { + try { + auto device = GetTorchDevice(load_model_request); + + const std::string configFilePath = + fmt::format("{}/{}", load_model_request->model_dir, "config.json"); + std::string jsonContent; + if (!folly::readFile(configFilePath.c_str(), jsonContent)) { + TS_LOGF(ERROR, "config.json not found at: {}", configFilePath); + throw; + } + folly::dynamic json; + json = folly::parseJson(jsonContent); + + std::string checkpoint_path; + if (json.find("checkpoint_path") != json.items().end()) { + checkpoint_path = json["checkpoint_path"].asString(); + } else { + TS_LOG(ERROR, "Required field 'checkpoint_path' not found in JSON."); + throw; + } + params.model = checkpoint_path; + params.main_gpu = 0; + params.n_gpu_layers = 35; + + llama_backend_init(params.numa); + ctx_params = llama_context_default_params(); + model_params = llama_model_default_params(); + 
llamamodel = llama_load_model_from_file(params.model.c_str(), model_params); + + return std::make_pair(nullptr, device); + } catch (const c10::Error& e) { + TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}", + load_model_request->model_name, load_model_request->gpu_id, + e.msg()); + throw e; + } catch (const std::runtime_error& e) { + TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}", + load_model_request->model_name, load_model_request->gpu_id, + e.what()); + throw e; + } +} + +c10::IValue LlamaCppHandler::Preprocess( + std::shared_ptr& device, + std::pair&>& idx_to_req_id, + std::shared_ptr& request_batch, + std::shared_ptr& response_batch) { + initialize_context(); + + auto batch_ivalue = c10::impl::GenericList(torch::TensorType::get()); + std::vector batch_tensors; + uint8_t idx = 0; + for (auto& request : *request_batch) { + try { + (*response_batch)[request.request_id] = + std::make_shared(request.request_id); + idx_to_req_id.first += idx_to_req_id.first.empty() + ? request.request_id + : "," + request.request_id; + + auto data_it = request.parameters.find( + torchserve::PayloadType::kPARAMETER_NAME_DATA); + auto dtype_it = + request.headers.find(torchserve::PayloadType::kHEADER_NAME_DATA_TYPE); + if (data_it == request.parameters.end()) { + data_it = request.parameters.find( + torchserve::PayloadType::kPARAMETER_NAME_BODY); + dtype_it = request.headers.find( + torchserve::PayloadType::kHEADER_NAME_BODY_TYPE); + } + + if (data_it == request.parameters.end() || + dtype_it == request.headers.end()) { + TS_LOGF(ERROR, "Empty payload for request id: {}", request.request_id); + (*response_batch)[request.request_id]->SetResponse( + 500, "data_type", torchserve::PayloadType::kCONTENT_TYPE_TEXT, + "Empty payload"); + continue; + } + + std::string msg = torchserve::Converter::VectorToStr(data_it->second); + + // tokenization + + std::vector tokens_list; + tokens_list = ::llama_tokenize(llama_ctx, msg, true); + + // const int max_context_size = llama_n_ctx(ctx); + const int max_tokens_list_size = max_context_size - 4; + + if ((int)tokens_list.size() > max_tokens_list_size) { + TS_LOGF(ERROR, "{}: error: prompt too long ({} tokens, max {})", __func__, tokens_list.size(), max_tokens_list_size); + } + + // Print the tokens from the prompt : + std::vector tensor_vector; + for (auto id : tokens_list) { + torch::Tensor tensor = torch::tensor(id, torch::kInt64); + tensor_vector.push_back(tensor); + } + + torch::Tensor stacked_tensor = torch::stack(tensor_vector); + batch_ivalue.push_back(stacked_tensor); + idx_to_req_id.second[idx++] = request.request_id; + + } catch (const std::runtime_error& e) { + TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}", + request.request_id, e.what()); + auto response = (*response_batch)[request.request_id]; + response->SetResponse(500, "data_type", + torchserve::PayloadType::kDATA_TYPE_STRING, + "runtime_error, failed to load tensor"); + } catch (const c10::Error& e) { + TS_LOGF(ERROR, "Failed to load tensor for request id: {}, c10 error: {}", + request.request_id, e.msg()); + auto response = (*response_batch)[request.request_id]; + response->SetResponse(500, "data_type", + torchserve::PayloadType::kDATA_TYPE_STRING, + "c10 error, failed to load tensor"); + } + } + + return batch_ivalue; +} + +c10::IValue LlamaCppHandler::Inference( + std::shared_ptr model, c10::IValue& inputs, + std::shared_ptr& device, + std::pair&>& idx_to_req_id, + std::shared_ptr& response_batch) { + torch::InferenceMode guard; + auto 
batch_output_vector = c10::impl::GenericList(torch::TensorType::get()); + try { + for (const auto input : inputs.toTensorList()) { + torch::Tensor tokens_list_tensor = input.get().toTensor(); + + int64_t num_elements = tokens_list_tensor.numel(); + + int64_t* data_ptr = tokens_list_tensor.data_ptr(); + std::vector tokens_list; + + for (int64_t i = 0; i < num_elements; ++i) { + tokens_list.push_back(data_ptr[i]); + } + const int n_gen = std::min(32, max_context_size); + + std::vector tensor_vector; + + long pos = 0; + while (pos < n_gen) { + // evaluate the transformer + + int n_past = pos == 0 ? 0 : llama_get_kv_cache_token_count(llama_ctx); + + if (llama_eval(llama_ctx, tokens_list.data(), int(tokens_list.size()), + n_past)) { + TS_LOGF(ERROR, "Failed to eval {}", __func__); + break; + } + + tokens_list.clear(); + + // sample the next token + + llama_token new_token_id = 0; + + auto logits = llama_get_logits(llama_ctx); + auto n_vocab = llama_n_vocab(llamamodel); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back( + llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = {candidates.data(), + candidates.size(), false}; + + new_token_id = llama_sample_token_greedy(llama_ctx, &candidates_p); + + // is it an end of stream ? + if (new_token_id == llama_token_eos(llamamodel)) { + TS_LOG(DEBUG, "Reached [end of text]"); + break; + } + + // print the new token : + TS_LOGF(DEBUG, "New Token: {}", llama_token_to_piece(llama_ctx, new_token_id)); + + // push this new token for next evaluation + tokens_list.push_back(new_token_id); + tensor_vector.push_back(torch::tensor(new_token_id, torch::kLong)); + pos += 1; + } + + batch_output_vector.push_back(torch::stack(tensor_vector)); + } + + llama_print_timings(llama_ctx); + } catch (std::runtime_error& e) { + TS_LOG(ERROR, e.what()); + } catch (const c10::Error& e) { + TS_LOGF(ERROR, "Failed to apply inference on input, c10 error:{}", e.msg()); + } + return batch_output_vector; +} + +void LlamaCppHandler::Postprocess( + c10::IValue& output, + std::pair&>& idx_to_req_id, + std::shared_ptr& response_batch) { + for (const auto& kv : idx_to_req_id.second) { + auto data = output.toTensorList(); + try { + int64_t num_elements = data[kv.first].get().toTensor().numel(); + + // Convert the tensor to a vector of long values + std::stringstream generated_text_stream; + + auto data_ptr = data[kv.first].get().toTensor().data_ptr(); + for (int64_t i = 0; i < num_elements; ++i) { + generated_text_stream << llama_token_to_piece(llama_ctx, data_ptr[i]); + } + + std::string generated_text_str = generated_text_stream.str(); + TS_LOGF(DEBUG, "Generated Text Str: {}", generated_text_str); + + auto response = (*response_batch)[kv.second]; + + response->SetResponse(200, "data_type", + torchserve::PayloadType::kDATA_TYPE_STRING, + generated_text_str); + } catch (const std::runtime_error& e) { + TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}", + kv.second, e.what()); + auto response = (*response_batch)[kv.second]; + response->SetResponse(500, "data_type", + torchserve::PayloadType::kDATA_TYPE_STRING, + "runtime_error, failed to postprocess tensor"); + } catch (const c10::Error& e) { + TS_LOGF(ERROR, + "Failed to postprocess tensor for request id: {}, error: {}", + kv.second, e.msg()); + auto response = (*response_batch)[kv.second]; + response->SetResponse(500, "data_type", + 
torchserve::PayloadType::kDATA_TYPE_STRING, + "c10 error, failed to postprocess tensor"); + } + } +} + +LlamaCppHandler::~LlamaCppHandler() noexcept { + llama_free(llama_ctx); + llama_free_model(llamamodel); + llama_backend_free(); +} + +} // namespace llm + +#if defined(__linux__) || defined(__APPLE__) +extern "C" { +torchserve::BaseHandler* allocatorLlamaCppHandler() { + return new llm::LlamaCppHandler(); +} + +void deleterLlamaCppHandler(torchserve::BaseHandler* p) { + if (p != nullptr) { + delete static_cast(p); + } +} +} +#endif diff --git a/examples/cpp/llamacpp/src/llamacpp_handler.hh b/examples/cpp/llamacpp/src/llamacpp_handler.hh new file mode 100644 index 0000000000..5164095eeb --- /dev/null +++ b/examples/cpp/llamacpp/src/llamacpp_handler.hh @@ -0,0 +1,52 @@ +#pragma once + +#include +#include + +#include "common/common.h" +#include "ggml.h" +#include "llama.h" +#include "src/backends/handler/base_handler.hh" + +namespace llm { +class LlamaCppHandler : public torchserve::BaseHandler { + private: + gpt_params params; + llama_model_params model_params; + llama_model* llamamodel; + llama_context_params ctx_params; + llama_context* llama_ctx; + const int max_context_size = 32; + + public: + // NOLINTBEGIN(bugprone-exception-escape) + LlamaCppHandler() = default; + // NOLINTEND(bugprone-exception-escape) + ~LlamaCppHandler() noexcept; + + void initialize_context(); + + virtual std::pair, std::shared_ptr> + LoadModel(std::shared_ptr& load_model_request); + + c10::IValue Preprocess( + std::shared_ptr& device, + std::pair&>& idx_to_req_id, + std::shared_ptr& request_batch, + std::shared_ptr& response_batch) + override; + + c10::IValue Inference( + std::shared_ptr model, c10::IValue& inputs, + std::shared_ptr& device, + std::pair&>& idx_to_req_id, + std::shared_ptr& response_batch) + override; + + void Postprocess( + c10::IValue& data, + std::pair&>& idx_to_req_id, + std::shared_ptr& response_batch) + override; +}; +} // namespace llm diff --git a/examples/cpp/mnist/CMakeLists.txt b/examples/cpp/mnist/CMakeLists.txt new file mode 100644 index 0000000000..45a25d2f41 --- /dev/null +++ b/examples/cpp/mnist/CMakeLists.txt @@ -0,0 +1,3 @@ +add_library(mnist_handler SHARED src/mnist_handler.cc) + +target_link_libraries(mnist_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) diff --git a/cpp/src/examples/image_classifier/mnist/mnist_handler.cc b/examples/cpp/mnist/src/mnist_handler.cc similarity index 96% rename from cpp/src/examples/image_classifier/mnist/mnist_handler.cc rename to examples/cpp/mnist/src/mnist_handler.cc index 3fae5748a4..f28ca664d2 100644 --- a/cpp/src/examples/image_classifier/mnist/mnist_handler.cc +++ b/examples/cpp/mnist/src/mnist_handler.cc @@ -1,4 +1,4 @@ -#include "src/examples/image_classifier/mnist/mnist_handler.hh" +#include "mnist_handler.hh" namespace mnist { void MnistHandler::Postprocess( diff --git a/cpp/src/examples/image_classifier/mnist/mnist_handler.hh b/examples/cpp/mnist/src/mnist_handler.hh similarity index 100% rename from cpp/src/examples/image_classifier/mnist/mnist_handler.hh rename to examples/cpp/mnist/src/mnist_handler.hh diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 865e6400fe..aebba8f7f3 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1170,3 +1170,4 @@ BabyLLama BabyLlamaHandler CMakeLists TorchScriptHandler +libllamacpp
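
After applying this diff, the relocated babyllama resources under `test/resources/examples/` can be smoke-tested end to end. The sketch below only strings together commands that already appear in cpp/README.md and examples/cpp/llamacpp/README.md above; the explicit registration call and the final `torchserve --stop` cleanup step are assumptions for illustration rather than steps copied verbatim from those files.

```bash
# Minimal smoke test of the babyllama example against the new
# test/resources/examples/ layout introduced by this diff.
# Assumes stories15M.bin and tokenizer.bin were downloaded and that
# config.json points at them, as described in cpp/README.md.
cd serve/cpp/test/resources/examples/babyllama/babyllama_handler
torch-model-archiver --model-name llm --version 1.0 \
  --handler libbabyllama_handler:BabyLlamaHandler --runtime LSP \
  --extra-files config.json
mkdir -p model_store && mv llm.mar model_store/
torchserve --ncs --start --model-store model_store
# Register the model explicitly (assumed step; same management API call
# as shown in the llamacpp README above).
curl -v -X POST "http://localhost:8081/models?initial_workers=1&url=llm.mar"
# The sample prompt ships one directory up from the handler folder.
curl http://localhost:8080/predictions/llm -T ../prompt.txt
torchserve --stop
```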