WIP: new server backend for GPT4All #2806

Draft · wants to merge 17 commits into main
2 changes: 1 addition & 1 deletion .gitmodules
@@ -1,5 +1,5 @@
[submodule "llama.cpp-mainline"]
path = gpt4all-backend/llama.cpp-mainline
path = gpt4all-backend/llama.cpp
url = https://github.com/nomic-ai/llama.cpp.git
branch = master
[submodule "gpt4all-chat/usearch"]
15 changes: 8 additions & 7 deletions gpt4all-backend/CMakeLists.txt
@@ -47,7 +47,7 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()

set(DIRECTORY llama.cpp-mainline)
set(DIRECTORY llama.cpp)
include(llama.cpp.cmake)

set(BUILD_VARIANTS)
@@ -108,7 +108,7 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
endif()

# Include GGML
include_ggml(-mainline-${BUILD_VARIANT})
include_ggml(-${BUILD_VARIANT})

# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@@ -127,19 +127,20 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
endfunction()

# Add each individual implementations
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
llamamodel.cpp llmodel_shared.cpp)
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
add_library(llamacpp-${BUILD_VARIANT} SHARED llamacpp_backend_impl.cpp)
target_compile_definitions(llamacpp-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(llamamodel-mainline llama-mainline)
prepare_target(llamacpp llama)

if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
endif()
endforeach()

add_library(llmodel
llmodel.h llmodel.cpp llmodel_shared.cpp
model_backend.h
llamacpp_backend.h llamacpp_backend.cpp
llamacpp_backend_manager.h llamacpp_backend_manager.cpp
llmodel_c.h llmodel_c.cpp
dlhandle.cpp
)
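
The new source list above splits the old monolithic llmodel target into a generic backend interface (model_backend.h) plus llama.cpp-specific classes. model_backend.h itself is not part of this excerpt, so the sketch below of how the pieces presumably fit together is inferred from the target names and from llamacpp_backend.h later in this PR, not quoted from it:

// Rough, hypothetical sketch of the layering implied by the new source list.
// Only LlamaCppBackend and LlamaCppBackendManager appear in this diff; the
// base-class declarations are assumptions.
class ModelBackend {                        // model_backend.h: generic interface
public:
    virtual ~ModelBackend() = default;
    virtual bool isModelLoaded() const = 0;
    virtual bool supportsCompletion() const = 0;
    // prompt(), PromptContext, ... presumably declared here
};

class EmbCapableBackend : public ModelBackend {
    // embedding entry points (the embed() overloads dropped from the .cpp below)
};

class LlamaCppBackend : public EmbCapableBackend {
    // llamacpp_backend.{h,cpp}: tokenize/decode/generate loop, GPU device handling
};

class LlamaCppBackendManager {
    // llamacpp_backend_manager.{h,cpp}: loads the per-variant llamacpp-${BUILD_VARIANT}
    // libraries and exposes modelType(); appears to take over the role of the old
    // LLModel::Implementation (see the implementation() -> manager() changes below)
};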
@@ -1,4 +1,6 @@
#include "llmodel.h"
#include "llamacpp_backend.h"

#include "llamacpp_backend_manager.h"

#include <algorithm>
#include <cassert>
@@ -15,6 +17,7 @@

namespace ranges = std::ranges;


static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
{
static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
@@ -38,24 +41,25 @@ static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
return true;
}

void LLModel::prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
bool special,
std::string *fakeReply)
{
void LlamaCppBackend::prompt(
const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
bool special,
std::string *fakeReply
) {
if (!isModelLoaded()) {
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
std::cerr << manager().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
return;
}

if (!supportsCompletion()) {
std::string errorMessage = "ERROR: this model does not support text completion or chat!";
responseCallback(-1, errorMessage);
std::cerr << implementation().modelType() << " " << errorMessage << "\n";
std::cerr << manager().modelType() << " " << errorMessage << "\n";
return;
}

@@ -152,15 +156,22 @@ void LLModel::prompt(const std::string &prompt,
}
}

const LlamaCppBackendManager &LlamaCppBackend::manager() const
{
return *m_manager;
}

// returns false on error
bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
std::vector<Token> embd_inp) {
bool LlamaCppBackend::decodePrompt(
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
std::vector<Token> embd_inp
) {
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
std::cerr << manager().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
return false;
}
@@ -188,7 +199,7 @@ bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
}

if (!evalTokens(promptCtx, batch)) {
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
std::cerr << manager().modelType() << " ERROR: Failed to process prompt\n";
return false;
}

@@ -224,9 +235,11 @@ static std::string::size_type stringsOverlap(const std::string &s, const std::st
return std::string::npos;
}

void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx) {
void LlamaCppBackend::generateResponse(
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx
) {
static const char *stopSequences[] {
"### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context",
};
@@ -265,7 +278,7 @@ void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)>
Token tok = std::exchange(new_tok, std::nullopt).value();
if (!evalTokens(promptCtx, { tok })) {
// TODO(jared): raise an exception
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
std::cerr << manager().modelType() << " ERROR: Failed to predict next token\n";
return false;
}

@@ -370,32 +383,3 @@

promptCtx.n_past -= cachedTokens.size();
}

void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
(void)texts;
(void)embeddings;
(void)prefix;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
(void)cancelCb;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)isRetrieval;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
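
Since the prompt() entry point above keeps the old callback-driven flow under its new LlamaCppBackend name, a minimal caller sketch may help. How a concrete backend and its PromptContext are created is outside this diff, so both are assumed to exist already, and PromptContext is only referenced generically:

#include "llamacpp_backend.h"

#include <cstdint>
#include <iostream>
#include <string>

// PromptContext is declared in model_backend.h, which is not shown here, so it
// is taken as a template parameter rather than named directly.
template <typename PromptCtx>
void runPrompt(LlamaCppBackend &backend, PromptCtx &ctx)
{
    auto onPromptToken = [](int32_t) { return true; };   // false would abort prompt processing
    auto onResponseToken = [](int32_t tokenId, const std::string &piece) {
        if (tokenId == -1) {          // error path, e.g. the "prompt size exceeds the context window" message
            std::cerr << piece << '\n';
            return false;
        }
        std::cout << piece << std::flush;
        return true;                  // false would stop generation early
    };

    // %1 and %2 are the placeholders parsePromptTemplate() accepts; the
    // template text itself is illustrative, not taken from the PR.
    backend.prompt("Hello!", "### Human:\n%1\n### Assistant:\n%2",
                   onPromptToken, onResponseToken,
                   /*allowContextShift*/ true, ctx);
}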
145 changes: 145 additions & 0 deletions gpt4all-backend/llamacpp_backend.h
@@ -0,0 +1,145 @@
#pragma once

#include "model_backend.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using namespace std::string_literals;

class LlamaCppBackendManager;


class LlamaCppBackend : public EmbCapableBackend {
public:
struct GPUDevice {
const char *backend;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;

GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
vendor(std::move(vendor)) {}

std::string selectionName() const
{
assert(backend == "cuda"s || backend == "kompute"s);
return backendName() + ": " + name;
}

std::string backendName() const { return backendIdToName(backend); }

static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }

static std::string updateSelectionName(const std::string &name) {
if (name == "Auto" || name == "CPU" || name == "Metal")
return name;
auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
return name.starts_with(entry.second + ": ");
});
if (it != s_backendNames.end())
return name;
return "Vulkan: " + name; // previously, there were only Vulkan devices
}

private:
static inline const std::unordered_map<std::string, std::string> s_backendNames {
{"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
};
};

using ProgressCallback = std::function<bool(float progress)>;

virtual bool isModelBlacklisted(const std::string &modelPath) const = 0;
virtual bool isEmbeddingModel(const std::string &modelPath) const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;

void prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &ctx,
bool special = false,
std::string *fakeReply = nullptr) override;

virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
virtual int32_t threadCount() const { return 1; }

const LlamaCppBackendManager &manager() const;

virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const
{
(void)memoryRequired;
return {};
}

virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const
{
(void)memoryRequired;
(void)name;
return false;
}

virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const
{
(void)device;
if (unavail_reason) {
*unavail_reason = "model has no GPU support";
}
return false;
}

virtual bool usingGPUDevice() const { return false; }
virtual const char *backendName() const { return "cpu"; }
virtual const char *gpuDeviceName() const { return nullptr; }

void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }

protected:
virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual Token sampleToken(PromptContext &ctx) const = 0;
virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
virtual void shiftContext(PromptContext &promptCtx) = 0;
virtual int32_t contextLength() const = 0;
virtual const std::vector<Token> &endTokens() const = 0;
virtual bool shouldAddBOS() const = 0;

virtual int32_t maxContextLength(std::string const &modelPath) const = 0;
virtual int32_t layerCount(std::string const &modelPath) const = 0;

static bool staticProgressCallback(float progress, void* ctx)
{
LlamaCppBackend *model = static_cast<LlamaCppBackend *>(ctx);
if (model && model->m_progressCallback)
return model->m_progressCallback(progress);
return true;
}

bool decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx,
std::vector<Token> embd_inp);
void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
bool allowContextShift,
PromptContext &promptCtx);

const LlamaCppBackendManager *m_manager = nullptr;
ProgressCallback m_progressCallback;
Token m_tokenize_last_token = -1;

friend class LlamaCppBackendManager;
};
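
A small illustration of the GPUDevice::updateSelectionName() migration logic defined above; the device names are invented for the example and not taken from the PR:

#include "llamacpp_backend.h"

#include <cassert>

int main()
{
    using GPUDevice = LlamaCppBackend::GPUDevice;

    // "Auto", "CPU" and "Metal" selections pass through unchanged.
    assert(GPUDevice::updateSelectionName("Auto") == "Auto");

    // A name that already carries a known backend prefix is kept as-is.
    assert(GPUDevice::updateSelectionName("CUDA: Example GPU") == "CUDA: Example GPU");

    // A bare device name saved by an older build is assumed to be a Vulkan device.
    assert(GPUDevice::updateSelectionName("Example GPU") == "Vulkan: Example GPU");
}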