WIP: new server backend for GPT4All #2806

Draft · wants to merge 17 commits into main
2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,5 +1,5 @@
[submodule "llama.cpp-mainline"]
path = gpt4all-backend/llama.cpp-mainline
path = gpt4all-backend/llama.cpp
url = https://github.com/nomic-ai/llama.cpp.git
branch = master
[submodule "gpt4all-chat/usearch"]
15 changes: 8 additions & 7 deletions gpt4all-backend/CMakeLists.txt
@@ -47,7 +47,7 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()

set(DIRECTORY llama.cpp-mainline)
set(DIRECTORY llama.cpp)
include(llama.cpp.cmake)

set(BUILD_VARIANTS)
@@ -108,7 +108,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
endif()

# Include GGML
include_ggml(-mainline-${BUILD_VARIANT})
include_ggml(-${BUILD_VARIANT})

# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@@ -127,19 +127,20 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
endfunction()

# Add each individual implementations
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
llamamodel.cpp llmodel_shared.cpp)
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
add_library(llamacpp-${BUILD_VARIANT} SHARED llamacpp_backend_impl.cpp)
target_compile_definitions(llamacpp-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(llamamodel-mainline llama-mainline)
prepare_target(llamacpp llama)

if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
endif()
endforeach()

add_library(llmodel
llmodel.h llmodel.cpp llmodel_shared.cpp
model_backend.h
llamacpp_backend.h llamacpp_backend.cpp
llamacpp_backend_manager.h llamacpp_backend_manager.cpp
llmodel_c.h llmodel_c.cpp
dlhandle.cpp
)
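
The new source list above splits the old monolithic llmodel target into a generic backend interface (model_backend.h) plus llama.cpp-specific classes. model_backend.h itself is not part of this excerpt, so the sketch below of how the pieces presumably fit together is inferred from the target names and from llamacpp_backend.h later in this PR, not quoted from it:

// Rough, hypothetical sketch of the layering implied by the new source list.
// Only LlamaCppBackend and LlamaCppBackendManager appear in this diff; the
// base-class declarations are assumptions.
class ModelBackend {                        // model_backend.h: generic interface
public:
    virtual ~ModelBackend() = default;
    virtual bool isModelLoaded() const = 0;
    virtual bool supportsCompletion() const = 0;
    // prompt(), PromptContext, ... presumably declared here
};

class EmbCapableBackend : public ModelBackend {
    // embedding entry points (the embed() overloads dropped from the .cpp below)
};

class LlamaCppBackend : public EmbCapableBackend {
    // llamacpp_backend.{h,cpp}: tokenize/decode/generate loop, GPU device handling
};

class LlamaCppBackendManager {
    // llamacpp_backend_manager.{h,cpp}: loads the per-variant llamacpp-${BUILD_VARIANT}
    // libraries and exposes modelType(); appears to take over the role of the old
    // LLModel::Implementation (see the implementation() -> manager() changes below)
};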
@@ -1,4 +1,6 @@
#include "llmodel.h"
#include "llamacpp_backend.h"

#include "llamacpp_backend_manager.h"

#include <algorithm>
#include <cassert>
@@ -15,6 +17,7 @@

namespace ranges = std::ranges;


static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
{
static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
@@ -38,24 +41,25 @@ static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
return true;
}

void LLModel::prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
bool special,
std::string *fakeReply)
{
void LlamaCppBackend::prompt(
const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
bool special,
std::string *fakeReply
) {
if (!isModelLoaded()) {
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
std::cerr << manager().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
return;
}

if (!supportsCompletion()) {
std::string errorMessage = "ERROR: this model does not support text completion or chat!";
responseCallback(-1, errorMessage);
std::cerr << implementation().modelType() << " " << errorMessage << "\n";
std::cerr << manager().modelType() << " " << errorMessage << "\n";
return;
}

@@ -152,15 +156,22 @@ void LLModel::prompt(const std::string &prompt,
}
}

const LlamaCppBackendManager &LlamaCppBackend::manager() const
{
return *m_manager;
}

// returns false on error
bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
std::vector<Token> embd_inp) {
bool LlamaCppBackend::decodePrompt(
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
std::vector<Token> embd_inp
) {
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
std::cerr << manager().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
return false;
}
@@ -188,7 +199,7 @@ bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
}

if (!evalTokens(promptCtx, batch)) {
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
std::cerr << manager().modelType() << " ERROR: Failed to process prompt\n";
return false;
}

@@ -224,9 +235,11 @@ static std::string::size_type stringsOverlap(const std::string &s, const std::st
return std::string::npos;
}

void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx) {
void LlamaCppBackend::generateResponse(
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx
) {
static const char *stopSequences[] {
"### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context",
};
@@ -265,7 +278,7 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
Token tok = std::exchange(new_tok, std::nullopt).value();
if (!evalTokens(promptCtx, { tok })) {
// TODO(jared): raise an exception
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
std::cerr << manager().modelType() << " ERROR: Failed to predict next token\n";
return false;
}

@@ -370,32 +383,3 @@

promptCtx.n_past -= cachedTokens.size();
}

void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
(void)texts;
(void)embeddings;
(void)prefix;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
(void)cancelCb;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)isRetrieval;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
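
Since the prompt() entry point above keeps the old callback-driven flow under its new LlamaCppBackend name, a minimal caller sketch may help. How a concrete backend and its PromptContext are created is outside this diff, so both are assumed to exist already, and PromptContext is only referenced generically:

#include "llamacpp_backend.h"

#include <cstdint>
#include <iostream>
#include <string>

// PromptContext is declared in model_backend.h, which is not shown here, so it
// is taken as a template parameter rather than named directly.
template <typename PromptCtx>
void runPrompt(LlamaCppBackend &backend, PromptCtx &ctx)
{
    auto onPromptToken = [](int32_t) { return true; };   // false would abort prompt processing
    auto onResponseToken = [](int32_t tokenId, const std::string &piece) {
        if (tokenId == -1) {          // error path, e.g. the "prompt size exceeds the context window" message
            std::cerr << piece << '\n';
            return false;
        }
        std::cout << piece << std::flush;
        return true;                  // false would stop generation early
    };

    // %1 and %2 are the placeholders parsePromptTemplate() accepts; the
    // template text itself is illustrative, not taken from the PR.
    backend.prompt("Hello!", "### Human:\n%1\n### Assistant:\n%2",
                   onPromptToken, onResponseToken,
                   /*allowContextShift*/ true, ctx);
}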
145 changes: 145 additions & 0 deletions gpt4all-backend/llamacpp_backend.h
@@ -0,0 +1,145 @@
#pragma once

#include "model_backend.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using namespace std::string_literals;

class LlamaCppBackendManager;


class LlamaCppBackend : public EmbCapableBackend {
public:
struct GPUDevice {
const char *backend;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;

GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
vendor(std::move(vendor)) {}

std::string selectionName() const
{
assert(backend == "cuda"s || backend == "kompute"s);
return backendName() + ": " + name;
}

std::string backendName() const { return backendIdToName(backend); }

static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }

static std::string updateSelectionName(const std::string &name) {
if (name == "Auto" || name == "CPU" || name == "Metal")
return name;
auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
return name.starts_with(entry.second + ": ");
});
if (it != s_backendNames.end())
return name;
return "Vulkan: " + name; // previously, there were only Vulkan devices
}

private:
static inline const std::unordered_map<std::string, std::string> s_backendNames {
{"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
};
};

using ProgressCallback = std::function<bool(float progress)>;

virtual bool isModelBlacklisted(const std::string &modelPath) const = 0;
virtual bool isEmbeddingModel(const std::string &modelPath) const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;

void prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &ctx,
bool special = false,
std::string *fakeReply = nullptr) override;

virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
virtual int32_t threadCount() const { return 1; }

const LlamaCppBackendManager &manager() const;

virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const
{
(void)memoryRequired;
return {};
}

virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const
{
(void)memoryRequired;
(void)name;
return false;
}

virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const
{
(void)device;
if (unavail_reason) {
*unavail_reason = "model has no GPU support";
}
return false;
}

virtual bool usingGPUDevice() const { return false; }
virtual const char *backendName() const { return "cpu"; }
virtual const char *gpuDeviceName() const { return nullptr; }

void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }

protected:
virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual Token sampleToken(PromptContext &ctx) const = 0;
virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
virtual void shiftContext(PromptContext &promptCtx) = 0;
virtual int32_t contextLength() const = 0;
virtual const std::vector<Token> &endTokens() const = 0;
virtual bool shouldAddBOS() const = 0;

virtual int32_t maxContextLength(std::string const &modelPath) const = 0;
virtual int32_t layerCount(std::string const &modelPath) const = 0;

static bool staticProgressCallback(float progress, void* ctx)
{
LlamaCppBackend *model = static_cast<LlamaCppBackend *>(ctx);
if (model && model->m_progressCallback)
return model->m_progressCallback(progress);
return true;
}

bool decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
std::vector<Token> embd_inp);
void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx);

const LlamaCppBackendManager *m_manager = nullptr;
ProgressCallback m_progressCallback;
Token m_tokenize_last_token = -1;

friend class LlamaCppBackendManager;
};
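
A small illustration of the GPUDevice::updateSelectionName() migration logic defined above; the device names are invented for the example and not taken from the PR:

#include "llamacpp_backend.h"

#include <cassert>

int main()
{
    using GPUDevice = LlamaCppBackend::GPUDevice;

    // "Auto", "CPU" and "Metal" selections pass through unchanged.
    assert(GPUDevice::updateSelectionName("Auto") == "Auto");

    // A name that already carries a known backend prefix is kept as-is.
    assert(GPUDevice::updateSelectionName("CUDA: Example GPU") == "CUDA: Example GPU");

    // A bare device name saved by an older build is assumed to be a Vulkan device.
    assert(GPUDevice::updateSelectionName("Example GPU") == "Vulkan: Example GPU");
}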