diff --git a/.gitmodules b/.gitmodules index e60c11fbfb..5ff9ad429d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "third_party/google/rpc"] path = third_party/google/rpc url = https://github.com/googleapis/googleapis.git +[submodule "cpp/third-party/llama.cpp"] + path = cpp/third-party/llama.cpp + url = https://github.com/ggerganov/llama.cpp.git diff --git a/cpp/README.md b/cpp/README.md index 3a8454b501..4f7dd53318 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -49,7 +49,7 @@ By default, TorchServe cpp provides a handler for TorchScript [src/backends/hand ``` torch-model-archiver --model-name mnist_base --version 1.0 --serialized-file mnist_script.pt --handler TorchScriptHandler --runtime LSP ``` -Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/torchscript_model/mnist/base_handler) of unzipped model mar file. +Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/examples/mnist/base_handler) of an unzipped model mar file. ##### Using Custom Handler * build customized handler shared lib. For example [Mnist handler](https://github.com/pytorch/serve/blob/cpp_backend/cpp/src/examples/image_classifier/mnist). * set runtime as "LSP" in model archiver option [--runtime](https://github.com/pytorch/serve/tree/master/model-archiver#arguments) @@ -57,7 +57,7 @@ Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/ ``` torch-model-archiver --model-name mnist_handler --version 1.0 --serialized-file mnist_script.pt --handler libmnist_handler:MnistHandler --runtime LSP ``` -Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/torchscript_model/mnist/mnist_handler) of unzipped model mar file. +Here is an [example](https://github.com/pytorch/serve/tree/cpp_backend/cpp/test/resources/examples/mnist/mnist_handler) of an unzipped model mar file. ##### BabyLLama Example The babyllama example can be found [here](https://github.com/pytorch/serve/blob/master/cpp/src/examples/babyllama/). To run the example we need to download the weights as well as tokenizer files: @@ -65,7 +65,7 @@ To run the example we need to download the weights as well as tokenizer files: wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.bin wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.bin ``` -Subsequently, we need to adjust the paths according to our local file structure in [config.json](https://github.com/pytorch/serve/blob/master/serve/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.json). +Subsequently, we need to adjust the paths according to our local file structure in [config.json](https://github.com/pytorch/serve/blob/master/cpp/test/resources/examples/babyllama/babyllama_handler/config.json). 
```bash { "checkpoint_path" : "/home/ubuntu/serve/cpp/stories15M.bin", @@ -74,7 +74,7 @@ Subsequently, we need to adjust the paths according to our local file structure ``` Then we can create the mar file and deploy it with: ```bash -cd serve/cpp/test/resources/torchscript_model/babyllama/babyllama_handler +cd serve/cpp/test/resources/examples/babyllama/babyllama_handler torch-model-archiver --model-name llm --version 1.0 --handler libbabyllama_handler:BabyLlamaHandler --runtime LSP --extra-files config.json mkdir model_store && mv llm.mar model_store/ torchserve --ncs --start --model-store model_store @@ -85,7 +85,7 @@ The handler name `libbabyllama_handler:BabyLlamaHandler` consists of our shared To test the model we can run: ```bash -cd serve/cpp/test/resources/torchscript_model/babyllama/ +cd serve/cpp/test/resources/examples/babyllama/ curl http://localhost:8080/predictions/llm -T prompt.txt ``` ##### Mnist example diff --git a/cpp/build.sh b/cpp/build.sh index ca0eecf765..165cf17cbb 100755 --- a/cpp/build.sh +++ b/cpp/build.sh @@ -136,6 +136,14 @@ function install_yaml_cpp() { cd "$BWD" || exit } +function build_llama_cpp() { + BWD=$(pwd) + LLAMA_CPP_SRC_DIR=$BASE_DIR/third-party/llama.cpp + cd "${LLAMA_CPP_SRC_DIR}" + make + cd "$BWD" || exit +} + function build() { MAYBE_BUILD_QUIC="" if [ "$WITH_QUIC" == true ] ; then @@ -206,16 +214,6 @@ function build() { echo -e "${COLOR_GREEN}torchserve_cpp build is complete. To run unit test: \ ./_build/test/torchserve_cpp_test ${COLOR_OFF}" - if [ -f "$DEPS_DIR/../src/examples/libmnist_handler.dylib" ]; then - mv $DEPS_DIR/../src/examples/libmnist_handler.dylib $DEPS_DIR/../../test/resources/torchscript_model/mnist/mnist_handler/libmnist_handler.dylib - elif [ -f "$DEPS_DIR/../src/examples/libmnist_handler.so" ]; then - mv $DEPS_DIR/../src/examples/libmnist_handler.so $DEPS_DIR/../../test/resources/torchscript_model/mnist/mnist_handler/libmnist_handler.so - fi - - if [ -f "$DEPS_DIR/../src/examples/libbabyllama_handler.so" ]; then - mv $DEPS_DIR/../src/examples/libbabyllama_handler.so $DEPS_DIR/../../test/resources/torchscript_model/babyllama/babyllama_handler/libbabyllama_handler.so - fi - cd $DEPS_DIR/../.. 
if [ -f "$DEPS_DIR/../test/torchserve_cpp_test" ]; then $DEPS_DIR/../test/torchserve_cpp_test @@ -311,10 +309,13 @@ mkdir -p "$LIBS_DIR" # Must execute from the directory containing this script cd $BASE_DIR +git submodule update --init --recursive + install_folly install_kineto install_libtorch install_yaml_cpp +build_llama_cpp build symlink_torch_libs symlink_yaml_cpp_lib diff --git a/cpp/src/examples/CMakeLists.txt b/cpp/src/examples/CMakeLists.txt index d5402a5faa..a313616270 100644 --- a/cpp/src/examples/CMakeLists.txt +++ b/cpp/src/examples/CMakeLists.txt @@ -1,16 +1,6 @@ -set(MNIST_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/image_classifier/mnist") -set(MNIST_SOURCE_FILES "") -list(APPEND MNIST_SOURCE_FILES ${MNIST_SRC_DIR}/mnist_handler.cc) -add_library(mnist_handler SHARED ${MNIST_SOURCE_FILES}) -target_include_directories(mnist_handler PUBLIC ${MNIST_SRC_DIR}) -target_link_libraries(mnist_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) +add_subdirectory("../../../examples/cpp/babyllama/" "../../../test/resources/examples/babyllama/babyllama_handler/") +add_subdirectory("../../../examples/cpp/llamacpp/" "../../../test/resources/examples/llamacpp/llamacpp_handler/") -set(BABYLLAMA_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/src/examples/babyllama") -set(BABYLLAMA_SOURCE_FILES "") -list(APPEND BABYLLAMA_SOURCE_FILES ${BABYLLAMA_SRC_DIR}/baby_llama_handler.cc) -add_library(babyllama_handler SHARED ${BABYLLAMA_SOURCE_FILES}) -target_include_directories(babyllama_handler PUBLIC ${BABYLLAMA_SRC_DIR}) -target_link_libraries(babyllama_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) -target_compile_options(babyllama_handler PRIVATE -Wall -Wextra -Ofast) +add_subdirectory("../../../examples/cpp/mnist/" "../../../test/resources/examples/mnist/mnist_handler/") diff --git a/cpp/test/backends/otf_protocol_and_handler_test.cc b/cpp/test/backends/otf_protocol_and_handler_test.cc index cc0d7960ec..89e70205a9 100644 --- a/cpp/test/backends/otf_protocol_and_handler_test.cc +++ b/cpp/test/backends/otf_protocol_and_handler_test.cc @@ -24,7 +24,7 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { // model_name length .WillOnce(::testing::Return(5)) // model_path length - .WillOnce(::testing::Return(51)) + .WillOnce(::testing::Return(42)) // batch_size .WillOnce(::testing::Return(1)) // handler length @@ -44,9 +44,8 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { strncpy(data, "mnist", length); })) .WillOnce(testing::Invoke([=](size_t length, char* data) { - ASSERT_EQ(length, 51); - strncpy(data, "test/resources/torchscript_model/mnist/base_handler", - length); + ASSERT_EQ(length, 42); + strncpy(data, "test/resources/examples/mnist/base_handler", length); })) .WillOnce(testing::Invoke([=](size_t length, char* data) { ASSERT_EQ(length, 11); @@ -60,7 +59,7 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { EXPECT_CALL(*client_socket, SendAll(testing::_, testing::_)).Times(1); auto load_model_request = OTFMessage::RetrieveLoadMsg(*client_socket); ASSERT_EQ(load_model_request->model_dir, - "test/resources/torchscript_model/mnist/base_handler"); + "test/resources/examples/mnist/base_handler"); ASSERT_EQ(load_model_request->model_name, "mnist"); ASSERT_EQ(load_model_request->envelope, ""); ASSERT_EQ(load_model_request->model_name, "mnist"); @@ -71,7 +70,7 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { auto backend = std::make_shared(); MetricsRegistry::Initialize("test/resources/metrics/default_config.yaml", MetricsContext::BACKEND); - 
backend->Initialize("test/resources/torchscript_model/mnist/base_handler"); + backend->Initialize("test/resources/examples/mnist/base_handler"); // load the model auto load_model_response = backend->LoadModel(load_model_request); @@ -126,7 +125,7 @@ TEST(BackendIntegTest, TestOTFProtocolAndHandler) { .WillOnce(testing::Invoke([=](size_t length, char* data) { ASSERT_EQ(length, 3883); // strncpy(data, "valu", length); - std::ifstream input("test/resources/torchscript_model/mnist/0_png.pt", + std::ifstream input("test/resources/examples/mnist/0_png.pt", std::ios::in | std::ios::binary); std::vector image((std::istreambuf_iterator(input)), (std::istreambuf_iterator())); diff --git a/cpp/test/examples/examples_test.cc b/cpp/test/examples/examples_test.cc index f3a1d4b231..22254288cc 100644 --- a/cpp/test/examples/examples_test.cc +++ b/cpp/test/examples/examples_test.cc @@ -1,10 +1,38 @@ +#include + #include "test/utils/common.hh" TEST_F(ModelPredictTest, TestLoadPredictBabyLlamaHandler) { + std::string base_dir = "test/resources/examples/babyllama/"; + std::string file1 = base_dir + "babyllama_handler/stories15M.bin"; + std::string file2 = base_dir + "babyllama_handler/tokenizer.bin"; + + std::ifstream f1(file1); + std::ifstream f2(file2); + + if (!f1.good() && !f2.good()) + GTEST_SKIP() + << "Skipping TestLoadPredictBabyLlamaHandler because of missing files: " + << file1 << " or " << file2; + + this->LoadPredict( + std::make_shared( + base_dir + "babyllama_handler", "llm", -1, "", "", 1, false), + base_dir + "babyllama_handler", base_dir + "prompt.txt", "llm_ts", 200); +} + +TEST_F(ModelPredictTest, TestLoadPredictLlmHandler) { + std::string base_dir = "test/resources/examples/llamacpp/"; + std::string file1 = base_dir + "llamacpp_handler/llama-2-7b-chat.Q5_0.gguf"; + std::ifstream f(file1); + + if (!f.good()) + GTEST_SKIP() + << "Skipping TestLoadPredictLlmHandler because of missing file: " + << file1; + this->LoadPredict( std::make_shared( - "test/resources/torchscript_model/babyllama/babyllama_handler", "llm", - -1, "", "", 1, false), - "test/resources/torchscript_model/babyllama/babyllama_handler", - "test/resources/torchscript_model/babyllama/prompt.txt", "llm_ts", 200); + base_dir + "llamacpp_handler", "llamacpp", -1, "", "", 1, false), + base_dir + "llamacpp_handler", base_dir + "prompt.txt", "llm_ts", 200); } diff --git a/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/babyllama/babyllama_handler/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/babyllama/babyllama_handler/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/babyllama/babyllama_handler/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/examples/babyllama/babyllama_handler/config.json b/cpp/test/resources/examples/babyllama/babyllama_handler/config.json new file mode 100644 index 0000000000..f75cd1fb53 --- /dev/null +++ b/cpp/test/resources/examples/babyllama/babyllama_handler/config.json @@ -0,0 +1,4 @@ +{ +"checkpoint_path" : "test/resources/examples/babyllama/babyllama_handler/stories15M.bin", +"tokenizer_path" : "test/resources/examples/babyllama/babyllama_handler/tokenizer.bin" +} diff --git a/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.properties b/cpp/test/resources/examples/babyllama/babyllama_handler/config.properties similarity index 100% rename from cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.properties rename to 
cpp/test/resources/examples/babyllama/babyllama_handler/config.properties diff --git a/cpp/test/resources/torchscript_model/babyllama/prompt.txt b/cpp/test/resources/examples/babyllama/prompt.txt similarity index 100% rename from cpp/test/resources/torchscript_model/babyllama/prompt.txt rename to cpp/test/resources/examples/babyllama/prompt.txt diff --git a/cpp/test/resources/examples/llamacpp/llamacpp_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/llamacpp/llamacpp_handler/MAR-INF/MANIFEST.json new file mode 100644 index 0000000000..d1e57eaddc --- /dev/null +++ b/cpp/test/resources/examples/llamacpp/llamacpp_handler/MAR-INF/MANIFEST.json @@ -0,0 +1,10 @@ +{ + "createdOn": "28/07/2020 06:32:08", + "runtime": "LSP", + "model": { + "modelName": "llamacpp", + "handler": "libllamacpp_handler:LlamaCppHandler", + "modelVersion": "2.0" + }, + "archiverVersion": "0.2.0" +} diff --git a/cpp/test/resources/torchscript_model/mnist/0.png b/cpp/test/resources/examples/mnist/0.png similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/0.png rename to cpp/test/resources/examples/mnist/0.png diff --git a/cpp/test/resources/torchscript_model/mnist/0_png.pt b/cpp/test/resources/examples/mnist/0_png.pt similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/0_png.pt rename to cpp/test/resources/examples/mnist/0_png.pt diff --git a/cpp/test/resources/torchscript_model/mnist/base_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/mnist/base_handler/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/base_handler/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/mnist/base_handler/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/torchscript_model/mnist/base_handler/mnist_script.pt b/cpp/test/resources/examples/mnist/base_handler/mnist_script.pt similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/base_handler/mnist_script.pt rename to cpp/test/resources/examples/mnist/base_handler/mnist_script.pt diff --git a/cpp/test/resources/torchscript_model/mnist/mnist_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/mnist/mnist_handler/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/mnist_handler/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/mnist/mnist_handler/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/torchscript_model/mnist/mnist_handler/mnist_script.pt b/cpp/test/resources/examples/mnist/mnist_handler/mnist_script.pt similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/mnist_handler/mnist_script.pt rename to cpp/test/resources/examples/mnist/mnist_handler/mnist_script.pt diff --git a/cpp/test/resources/torchscript_model/mnist/wrong_handler/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/mnist/wrong_handler/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/wrong_handler/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/mnist/wrong_handler/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/torchscript_model/mnist/wrong_model/MAR-INF/MANIFEST.json b/cpp/test/resources/examples/mnist/wrong_model/MAR-INF/MANIFEST.json similarity index 100% rename from cpp/test/resources/torchscript_model/mnist/wrong_model/MAR-INF/MANIFEST.json rename to cpp/test/resources/examples/mnist/wrong_model/MAR-INF/MANIFEST.json diff --git a/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.json 
b/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.json deleted file mode 100644 index 2030358b84..0000000000 --- a/cpp/test/resources/torchscript_model/babyllama/babyllama_handler/config.json +++ /dev/null @@ -1,5 +0,0 @@ -{ -"checkpoint_path" : "/home/ubuntu/serve/cpp/stories15M.bin", -"tokenizer_path" : "/home/ubuntu/serve/cpp/src/examples/babyllama/tokenizer.bin" -} - diff --git a/cpp/test/torch_scripted/torch_scripted_test.cc b/cpp/test/torch_scripted/torch_scripted_test.cc index cc6806d5c7..ecb1d7f69f 100644 --- a/cpp/test/torch_scripted/torch_scripted_test.cc +++ b/cpp/test/torch_scripted/torch_scripted_test.cc @@ -9,47 +9,44 @@ TEST_F(ModelPredictTest, TestLoadPredictBaseHandler) { this->LoadPredict(std::make_shared( - "test/resources/torchscript_model/mnist/mnist_handler", + "test/resources/examples/mnist/mnist_handler", "mnist_scripted_v2", -1, "", "", 1, false), - "test/resources/torchscript_model/mnist/base_handler", - "test/resources/torchscript_model/mnist/0_png.pt", - "mnist_ts", 200); + "test/resources/examples/mnist/base_handler", + "test/resources/examples/mnist/0_png.pt", "mnist_ts", 200); } TEST_F(ModelPredictTest, TestLoadPredictMnistHandler) { this->LoadPredict(std::make_shared( - "test/resources/torchscript_model/mnist/mnist_handler", + "test/resources/examples/mnist/mnist_handler", "mnist_scripted_v2", -1, "", "", 1, false), - "test/resources/torchscript_model/mnist/mnist_handler", - "test/resources/torchscript_model/mnist/0_png.pt", - "mnist_ts", 200); + "test/resources/examples/mnist/mnist_handler", + "test/resources/examples/mnist/0_png.pt", "mnist_ts", 200); } TEST_F(ModelPredictTest, TestBackendInitWrongModelDir) { - auto result = backend_->Initialize("test/resources/torchscript_model/mnist"); + auto result = backend_->Initialize("test/resources/examples/mnist"); ASSERT_EQ(result, false); } TEST_F(ModelPredictTest, TestBackendInitWrongHandler) { - auto result = backend_->Initialize( - "test/resources/torchscript_model/mnist/wrong_handler"); + auto result = + backend_->Initialize("test/resources/examples/mnist/wrong_handler"); ASSERT_EQ(result, false); } TEST_F(ModelPredictTest, TestLoadModelFailure) { - backend_->Initialize("test/resources/torchscript_model/mnist/wrong_model"); + backend_->Initialize("test/resources/examples/mnist/wrong_model"); auto result = backend_->LoadModel(std::make_shared( - "test/resources/torchscript_model/mnist/wrong_model", - "mnist_scripted_v2", -1, "", "", 1, false)); + "test/resources/examples/mnist/wrong_model", "mnist_scripted_v2", -1, + "", "", 1, false)); ASSERT_EQ(result->code, 500); } TEST_F(ModelPredictTest, TestLoadPredictMnistHandlerFailure) { this->LoadPredict(std::make_shared( - "test/resources/torchscript_model/mnist/mnist_handler", + "test/resources/examples/mnist/mnist_handler", "mnist_scripted_v2", -1, "", "", 1, false), - "test/resources/torchscript_model/mnist/mnist_handler", - "test/resources/torchscript_model/mnist/0.png", "mnist_ts", - 500); + "test/resources/examples/mnist/mnist_handler", + "test/resources/examples/mnist/0.png", "mnist_ts", 500); } diff --git a/cpp/test/utils/model_archiver_test.cc b/cpp/test/utils/model_archiver_test.cc index ea3f5082a2..596048e266 100644 --- a/cpp/test/utils/model_archiver_test.cc +++ b/cpp/test/utils/model_archiver_test.cc @@ -6,7 +6,7 @@ namespace torchserve { TEST(ManifestTest, TestInitialize) { torchserve::Manifest manifest; manifest.Initialize( - "test/resources/torchscript_model/mnist/base_handler/MAR-INF/" + 
"test/resources/examples/mnist/base_handler/MAR-INF/" "MANIFEST.json"); ASSERT_EQ(manifest.GetCreatOn(), "28/07/2020 06:32:08"); ASSERT_EQ(manifest.GetArchiverVersion(), "0.2.0"); diff --git a/cpp/third-party/llama.cpp b/cpp/third-party/llama.cpp new file mode 160000 index 0000000000..cd4fddb29f --- /dev/null +++ b/cpp/third-party/llama.cpp @@ -0,0 +1 @@ +Subproject commit cd4fddb29f81d6a1f6d51a0c016bc6b486d68def diff --git a/examples/cpp/babyllama/CMakeLists.txt b/examples/cpp/babyllama/CMakeLists.txt new file mode 100644 index 0000000000..4da9bbf60d --- /dev/null +++ b/examples/cpp/babyllama/CMakeLists.txt @@ -0,0 +1,5 @@ + +add_library(babyllama_handler SHARED src/baby_llama_handler.cc) + +target_link_libraries(babyllama_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) +target_compile_options(babyllama_handler PRIVATE -Wall -Wextra -Ofast) diff --git a/examples/cpp/babyllama/config.json b/examples/cpp/babyllama/config.json new file mode 100644 index 0000000000..81ddd2c27d --- /dev/null +++ b/examples/cpp/babyllama/config.json @@ -0,0 +1,4 @@ +{ +"checkpoint_path" : "/home/ubuntu/serve/examples/cpp/babyllama/stories15M.bin", +"tokenizer_path" : "/home/ubuntu/serve/examples/cpp/babyllama/tokenizer.bin" +} diff --git a/cpp/src/examples/babyllama/baby_llama_handler.cc b/examples/cpp/babyllama/src/baby_llama_handler.cc similarity index 98% rename from cpp/src/examples/babyllama/baby_llama_handler.cc rename to examples/cpp/babyllama/src/baby_llama_handler.cc index 62980e5f78..0d3b2b5491 100644 --- a/cpp/src/examples/babyllama/baby_llama_handler.cc +++ b/examples/cpp/babyllama/src/baby_llama_handler.cc @@ -1,11 +1,11 @@ -#include "src/examples/babyllama/baby_llama_handler.hh" +#include "baby_llama_handler.hh" #include #include #include -#include "src/examples/babyllama/llama2.c/run.c" +#include "llama2.c/run.c" namespace llm { @@ -233,7 +233,6 @@ c10::IValue BabyLlamaHandler::Inference( } catch (...) { TS_LOG(ERROR, "Failed to run inference on this batch"); } - std::cout << "WOOT?" 
<< std::endl; return batch_output_vector; } diff --git a/cpp/src/examples/babyllama/baby_llama_handler.hh b/examples/cpp/babyllama/src/baby_llama_handler.hh similarity index 100% rename from cpp/src/examples/babyllama/baby_llama_handler.hh rename to examples/cpp/babyllama/src/baby_llama_handler.hh diff --git a/cpp/src/examples/babyllama/llama2.c/LICENSE b/examples/cpp/babyllama/src/llama2.c/LICENSE similarity index 100% rename from cpp/src/examples/babyllama/llama2.c/LICENSE rename to examples/cpp/babyllama/src/llama2.c/LICENSE diff --git a/cpp/src/examples/babyllama/llama2.c/run.c b/examples/cpp/babyllama/src/llama2.c/run.c similarity index 100% rename from cpp/src/examples/babyllama/llama2.c/run.c rename to examples/cpp/babyllama/src/llama2.c/run.c diff --git a/examples/cpp/llamacpp/CMakeLists.txt b/examples/cpp/llamacpp/CMakeLists.txt new file mode 100644 index 0000000000..e071167585 --- /dev/null +++ b/examples/cpp/llamacpp/CMakeLists.txt @@ -0,0 +1,20 @@ +set(LLAMACPP_SRC_DIR "${torchserve_cpp_SOURCE_DIR}/third-party/llama.cpp") + +add_library(llamacpp_handler SHARED src/llamacpp_handler.cc) + +set(MY_OBJECT_FILES + ${LLAMACPP_SRC_DIR}/ggml.o + ${LLAMACPP_SRC_DIR}/llama.o + ${LLAMACPP_SRC_DIR}/common.o + ${LLAMACPP_SRC_DIR}/ggml-quants.o + ${LLAMACPP_SRC_DIR}/ggml-alloc.o + ${LLAMACPP_SRC_DIR}/grammar-parser.o + ${LLAMACPP_SRC_DIR}/console.o + ${LLAMACPP_SRC_DIR}/build-info.o + ${LLAMACPP_SRC_DIR}/ggml-backend.o + +) + +target_sources(llamacpp_handler PRIVATE ${MY_OBJECT_FILES}) +target_include_directories(llamacpp_handler PUBLIC ${LLAMACPP_SRC_DIR}) +target_link_libraries(llamacpp_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) diff --git a/examples/cpp/llamacpp/README.md b/examples/cpp/llamacpp/README.md new file mode 100644 index 0000000000..8221262858 --- /dev/null +++ b/examples/cpp/llamacpp/README.md @@ -0,0 +1,83 @@ +This example uses [llama.cpp](https://github.com/ggerganov/llama.cpp) to deploy a Llama-2-7B-Chat model using the TorchServe C++ backend. +The handler C++ source code for this example can be found [here](src/). + +### Setup +1. Follow the instructions in [README.md](../../../cpp/README.md) to build the TorchServe C++ backend. + +```bash +cd ~/serve/cpp +./build.sh +``` + +2. Download the model + +```bash +cd ~/serve/examples/cpp/llamacpp +curl -L https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q5_0.gguf?download=true -o llama-2-7b-chat.Q5_0.gguf +``` + +3. Create a [config.json](config.json) with the path to the downloaded model weights: + +```bash +echo '{ +"checkpoint_path" : "/home/ubuntu/serve/examples/cpp/llamacpp/llama-2-7b-chat.Q5_0.gguf" +}' > config.json +``` + +4. Copy the handler .so file + +While building the C++ backend, the `libllamacpp_handler.so` file is generated in the [llamacpp_handler](../../../cpp/test/resources/examples/llamacpp/llamacpp_handler) folder. 
+ +```bash +cp ../../../cpp/test/resources/examples/llamacpp/llamacpp_handler/libllamacpp_handler.so ./ +``` + +### Generate MAR file + +Now let's generate the mar file + +```bash +torch-model-archiver --model-name llm --version 1.0 --handler libllamacpp_handler:LlamaCppHandler --runtime LSP --extra-files config.json +``` + +Create a model store directory and move the mar file + +``` +mkdir model_store +mv llm.mar model_store/ +``` + +### Inference + +Start torchserve using the following command + +``` +torchserve --ncs --start --model-store model_store/ +``` + +Register the model using the following command + +``` +curl -v -X POST "http://localhost:8081/models?initial_workers=1&url=llm.mar&batch_size=2&max_batch_delay=5000" +``` + +Run inference using the following command + +``` +curl http://localhost:8080/predictions/llm -T prompt1.txt +``` + +This example supports batching. To run batch prediction, run the following command + +``` +curl http://localhost:8080/predictions/llm -T prompt1.txt & curl http://localhost:8080/predictions/llm -T prompt2.txt & +``` + +Sample Response + +``` +Hello my name is Daisy everybody loves me + I am a sweet and loving person + I have a big heart and I am always willing to help others + I am a good +``` diff --git a/examples/cpp/llamacpp/config.json b/examples/cpp/llamacpp/config.json new file mode 100644 index 0000000000..62ccfff1e9 --- /dev/null +++ b/examples/cpp/llamacpp/config.json @@ -0,0 +1,3 @@ +{ +"checkpoint_path" : "/home/ubuntu/examples/cpp/llamacpp/llama-2-7b-chat.Q4_0.gguf" +} diff --git a/examples/cpp/llamacpp/prompt1.txt b/examples/cpp/llamacpp/prompt1.txt new file mode 100644 index 0000000000..74b56be151 --- /dev/null +++ b/examples/cpp/llamacpp/prompt1.txt @@ -0,0 +1 @@ +Hello my name is diff --git a/examples/cpp/llamacpp/prompt2.txt b/examples/cpp/llamacpp/prompt2.txt new file mode 100644 index 0000000000..99568648e9 --- /dev/null +++ b/examples/cpp/llamacpp/prompt2.txt @@ -0,0 +1 @@ +Hello my name is Daisy diff --git a/examples/cpp/llamacpp/src/llamacpp_handler.cc b/examples/cpp/llamacpp/src/llamacpp_handler.cc new file mode 100644 index 0000000000..6917ee44cf --- /dev/null +++ b/examples/cpp/llamacpp/src/llamacpp_handler.cc @@ -0,0 +1,297 @@ +#include "llamacpp_handler.hh" + +#include +#include + +#include + +namespace llm { + +void LlamaCppHandler::initialize_context() { + llama_ctx = llama_new_context_with_model(llamamodel, ctx_params); + + if (llama_ctx == nullptr) { + TS_LOG(ERROR, "Failed to initialize llama context"); + } else { + TS_LOG(ERROR, "Context initialized successfully"); + } +} + +std::pair, std::shared_ptr> +LlamaCppHandler::LoadModel( + std::shared_ptr& load_model_request) { + try { + auto device = GetTorchDevice(load_model_request); + + const std::string configFilePath = + fmt::format("{}/{}", load_model_request->model_dir, "config.json"); + std::string jsonContent; + if (!folly::readFile(configFilePath.c_str(), jsonContent)) { + TS_LOGF(ERROR, "config.json not found at: {}", configFilePath); + throw; + } + folly::dynamic json; + json = folly::parseJson(jsonContent); + + std::string checkpoint_path; + if (json.find("checkpoint_path") != json.items().end()) { + checkpoint_path = json["checkpoint_path"].asString(); + } else { + TS_LOG(ERROR, "Required field 'checkpoint_path' not found in JSON."); + throw; + } + params.model = checkpoint_path; + params.main_gpu = 0; + params.n_gpu_layers = 35; + + llama_backend_init(params.numa); + ctx_params = llama_context_default_params(); + model_params = llama_model_default_params(); + 
llamamodel = llama_load_model_from_file(params.model.c_str(), model_params); + + return std::make_pair(nullptr, device); + } catch (const c10::Error& e) { + TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}", + load_model_request->model_name, load_model_request->gpu_id, + e.msg()); + throw e; + } catch (const std::runtime_error& e) { + TS_LOGF(ERROR, "loading the model: {}, device id: {}, error: {}", + load_model_request->model_name, load_model_request->gpu_id, + e.what()); + throw e; + } +} + +c10::IValue LlamaCppHandler::Preprocess( + std::shared_ptr& device, + std::pair&>& idx_to_req_id, + std::shared_ptr& request_batch, + std::shared_ptr& response_batch) { + initialize_context(); + + auto batch_ivalue = c10::impl::GenericList(torch::TensorType::get()); + std::vector batch_tensors; + uint8_t idx = 0; + for (auto& request : *request_batch) { + try { + (*response_batch)[request.request_id] = + std::make_shared(request.request_id); + idx_to_req_id.first += idx_to_req_id.first.empty() + ? request.request_id + : "," + request.request_id; + + auto data_it = request.parameters.find( + torchserve::PayloadType::kPARAMETER_NAME_DATA); + auto dtype_it = + request.headers.find(torchserve::PayloadType::kHEADER_NAME_DATA_TYPE); + if (data_it == request.parameters.end()) { + data_it = request.parameters.find( + torchserve::PayloadType::kPARAMETER_NAME_BODY); + dtype_it = request.headers.find( + torchserve::PayloadType::kHEADER_NAME_BODY_TYPE); + } + + if (data_it == request.parameters.end() || + dtype_it == request.headers.end()) { + TS_LOGF(ERROR, "Empty payload for request id: {}", request.request_id); + (*response_batch)[request.request_id]->SetResponse( + 500, "data_type", torchserve::PayloadType::kCONTENT_TYPE_TEXT, + "Empty payload"); + continue; + } + + std::string msg = torchserve::Converter::VectorToStr(data_it->second); + + // tokenization + + std::vector tokens_list; + tokens_list = ::llama_tokenize(llama_ctx, msg, true); + + // const int max_context_size = llama_n_ctx(ctx); + const int max_tokens_list_size = max_context_size - 4; + + if ((int)tokens_list.size() > max_tokens_list_size) { + TS_LOGF(ERROR, "{}: error: prompt too long ({} tokens, max {})", __func__, tokens_list.size(), max_tokens_list_size); + } + + // Print the tokens from the prompt : + std::vector tensor_vector; + for (auto id : tokens_list) { + torch::Tensor tensor = torch::tensor(id, torch::kInt64); + tensor_vector.push_back(tensor); + } + + torch::Tensor stacked_tensor = torch::stack(tensor_vector); + batch_ivalue.push_back(stacked_tensor); + idx_to_req_id.second[idx++] = request.request_id; + + } catch (const std::runtime_error& e) { + TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}", + request.request_id, e.what()); + auto response = (*response_batch)[request.request_id]; + response->SetResponse(500, "data_type", + torchserve::PayloadType::kDATA_TYPE_STRING, + "runtime_error, failed to load tensor"); + } catch (const c10::Error& e) { + TS_LOGF(ERROR, "Failed to load tensor for request id: {}, c10 error: {}", + request.request_id, e.msg()); + auto response = (*response_batch)[request.request_id]; + response->SetResponse(500, "data_type", + torchserve::PayloadType::kDATA_TYPE_STRING, + "c10 error, failed to load tensor"); + } + } + + return batch_ivalue; +} + +c10::IValue LlamaCppHandler::Inference( + std::shared_ptr model, c10::IValue& inputs, + std::shared_ptr& device, + std::pair&>& idx_to_req_id, + std::shared_ptr& response_batch) { + torch::InferenceMode guard; + auto 
batch_output_vector = c10::impl::GenericList(torch::TensorType::get()); + try { + for (const auto input : inputs.toTensorList()) { + torch::Tensor tokens_list_tensor = input.get().toTensor(); + + int64_t num_elements = tokens_list_tensor.numel(); + + int64_t* data_ptr = tokens_list_tensor.data_ptr(); + std::vector tokens_list; + + for (int64_t i = 0; i < num_elements; ++i) { + tokens_list.push_back(data_ptr[i]); + } + const int n_gen = std::min(32, max_context_size); + + std::vector tensor_vector; + + long pos = 0; + while (pos < n_gen) { + // evaluate the transformer + + int n_past = pos == 0 ? 0 : llama_get_kv_cache_token_count(llama_ctx); + + if (llama_eval(llama_ctx, tokens_list.data(), int(tokens_list.size()), + n_past)) { + TS_LOGF(ERROR, "Failed to eval {}", __func__); + break; + } + + tokens_list.clear(); + + // sample the next token + + llama_token new_token_id = 0; + + auto logits = llama_get_logits(llama_ctx); + auto n_vocab = llama_n_vocab(llamamodel); + + std::vector candidates; + candidates.reserve(n_vocab); + + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back( + llama_token_data{token_id, logits[token_id], 0.0f}); + } + + llama_token_data_array candidates_p = {candidates.data(), + candidates.size(), false}; + + new_token_id = llama_sample_token_greedy(llama_ctx, &candidates_p); + + // is it an end of stream ? + if (new_token_id == llama_token_eos(llamamodel)) { + TS_LOG(DEBUG, "Reached [end of text]"); + break; + } + + // print the new token : + TS_LOGF(DEBUG, "New Token: {}", llama_token_to_piece(llama_ctx, new_token_id)); + + // push this new token for next evaluation + tokens_list.push_back(new_token_id); + tensor_vector.push_back(torch::tensor(new_token_id, torch::kLong)); + pos += 1; + } + + batch_output_vector.push_back(torch::stack(tensor_vector)); + } + + llama_print_timings(llama_ctx); + } catch (std::runtime_error& e) { + TS_LOG(ERROR, e.what()); + } catch (const c10::Error& e) { + TS_LOGF(ERROR, "Failed to apply inference on input, c10 error:{}", e.msg()); + } + return batch_output_vector; +} + +void LlamaCppHandler::Postprocess( + c10::IValue& output, + std::pair&>& idx_to_req_id, + std::shared_ptr& response_batch) { + for (const auto& kv : idx_to_req_id.second) { + auto data = output.toTensorList(); + try { + int64_t num_elements = data[kv.first].get().toTensor().numel(); + + // Convert the tensor to a vector of long values + std::stringstream generated_text_stream; + + auto data_ptr = data[kv.first].get().toTensor().data_ptr(); + for (int64_t i = 0; i < num_elements; ++i) { + generated_text_stream << llama_token_to_piece(llama_ctx, data_ptr[i]); + } + + std::string generated_text_str = generated_text_stream.str(); + TS_LOGF(DEBUG, "Generated Text Str: {}", generated_text_str); + + auto response = (*response_batch)[kv.second]; + + response->SetResponse(200, "data_type", + torchserve::PayloadType::kDATA_TYPE_STRING, + generated_text_str); + } catch (const std::runtime_error& e) { + TS_LOGF(ERROR, "Failed to load tensor for request id: {}, error: {}", + kv.second, e.what()); + auto response = (*response_batch)[kv.second]; + response->SetResponse(500, "data_type", + torchserve::PayloadType::kDATA_TYPE_STRING, + "runtime_error, failed to postprocess tensor"); + } catch (const c10::Error& e) { + TS_LOGF(ERROR, + "Failed to postprocess tensor for request id: {}, error: {}", + kv.second, e.msg()); + auto response = (*response_batch)[kv.second]; + response->SetResponse(500, "data_type", + 
torchserve::PayloadType::kDATA_TYPE_STRING, + "c10 error, failed to postprocess tensor"); + } + } +} + +LlamaCppHandler::~LlamaCppHandler() noexcept { + llama_free(llama_ctx); + llama_free_model(llamamodel); + llama_backend_free(); +} + +} // namespace llm + +#if defined(__linux__) || defined(__APPLE__) +extern "C" { +torchserve::BaseHandler* allocatorLlamaCppHandler() { + return new llm::LlamaCppHandler(); +} + +void deleterLlamaCppHandler(torchserve::BaseHandler* p) { + if (p != nullptr) { + delete static_cast(p); + } +} +} +#endif diff --git a/examples/cpp/llamacpp/src/llamacpp_handler.hh b/examples/cpp/llamacpp/src/llamacpp_handler.hh new file mode 100644 index 0000000000..5164095eeb --- /dev/null +++ b/examples/cpp/llamacpp/src/llamacpp_handler.hh @@ -0,0 +1,52 @@ +#pragma once + +#include +#include + +#include "common/common.h" +#include "ggml.h" +#include "llama.h" +#include "src/backends/handler/base_handler.hh" + +namespace llm { +class LlamaCppHandler : public torchserve::BaseHandler { + private: + gpt_params params; + llama_model_params model_params; + llama_model* llamamodel; + llama_context_params ctx_params; + llama_context* llama_ctx; + const int max_context_size = 32; + + public: + // NOLINTBEGIN(bugprone-exception-escape) + LlamaCppHandler() = default; + // NOLINTEND(bugprone-exception-escape) + ~LlamaCppHandler() noexcept; + + void initialize_context(); + + virtual std::pair, std::shared_ptr> + LoadModel(std::shared_ptr& load_model_request); + + c10::IValue Preprocess( + std::shared_ptr& device, + std::pair&>& idx_to_req_id, + std::shared_ptr& request_batch, + std::shared_ptr& response_batch) + override; + + c10::IValue Inference( + std::shared_ptr model, c10::IValue& inputs, + std::shared_ptr& device, + std::pair&>& idx_to_req_id, + std::shared_ptr& response_batch) + override; + + void Postprocess( + c10::IValue& data, + std::pair&>& idx_to_req_id, + std::shared_ptr& response_batch) + override; +}; +} // namespace llm diff --git a/examples/cpp/mnist/CMakeLists.txt b/examples/cpp/mnist/CMakeLists.txt new file mode 100644 index 0000000000..45a25d2f41 --- /dev/null +++ b/examples/cpp/mnist/CMakeLists.txt @@ -0,0 +1,3 @@ +add_library(mnist_handler SHARED src/mnist_handler.cc) + +target_link_libraries(mnist_handler PRIVATE ts_backends_core ts_utils ${TORCH_LIBRARIES}) diff --git a/cpp/src/examples/image_classifier/mnist/mnist_handler.cc b/examples/cpp/mnist/src/mnist_handler.cc similarity index 96% rename from cpp/src/examples/image_classifier/mnist/mnist_handler.cc rename to examples/cpp/mnist/src/mnist_handler.cc index 3fae5748a4..f28ca664d2 100644 --- a/cpp/src/examples/image_classifier/mnist/mnist_handler.cc +++ b/examples/cpp/mnist/src/mnist_handler.cc @@ -1,4 +1,4 @@ -#include "src/examples/image_classifier/mnist/mnist_handler.hh" +#include "mnist_handler.hh" namespace mnist { void MnistHandler::Postprocess( diff --git a/cpp/src/examples/image_classifier/mnist/mnist_handler.hh b/examples/cpp/mnist/src/mnist_handler.hh similarity index 100% rename from cpp/src/examples/image_classifier/mnist/mnist_handler.hh rename to examples/cpp/mnist/src/mnist_handler.hh diff --git a/ts_scripts/spellcheck_conf/wordlist.txt b/ts_scripts/spellcheck_conf/wordlist.txt index 865e6400fe..aebba8f7f3 100644 --- a/ts_scripts/spellcheck_conf/wordlist.txt +++ b/ts_scripts/spellcheck_conf/wordlist.txt @@ -1170,3 +1170,4 @@ BabyLLama BabyLlamaHandler CMakeLists TorchScriptHandler +libllamacpp
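
After applying this diff, the relocated babyllama resources under `test/resources/examples/` can be smoke-tested end to end. The sketch below only strings together commands that already appear in cpp/README.md and examples/cpp/llamacpp/README.md above; the explicit registration call and the final `torchserve --stop` cleanup step are assumptions for illustration rather than steps copied verbatim from those files.

```bash
# Minimal smoke test of the babyllama example against the new
# test/resources/examples/ layout introduced by this diff.
# Assumes stories15M.bin and tokenizer.bin were downloaded and that
# config.json points at them, as described in cpp/README.md.
cd serve/cpp/test/resources/examples/babyllama/babyllama_handler
torch-model-archiver --model-name llm --version 1.0 \
  --handler libbabyllama_handler:BabyLlamaHandler --runtime LSP \
  --extra-files config.json
mkdir -p model_store && mv llm.mar model_store/
torchserve --ncs --start --model-store model_store
# Register the model explicitly (assumed step; same management API call
# as shown in the llamacpp README above).
curl -v -X POST "http://localhost:8081/models?initial_workers=1&url=llm.mar"
# The sample prompt ships one directory up from the handler folder.
curl http://localhost:8080/predictions/llm -T ../prompt.txt
torchserve --stop
```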