llama : add thread safety test #14035

Open · wants to merge 12 commits into master

1 change: 1 addition & 0 deletions .github/workflows/build.yml
@@ -777,6 +777,7 @@ jobs:
cmake -S . -B build ${{ matrix.defines }} `
-DCURL_LIBRARY="$env:CURL_PATH/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:CURL_PATH/include"
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
cp $env:CURL_PATH/bin/libcurl-*.dll build/bin/Release

- name: Add libopenblas.dll
id: add_libopenblas_dll
2 changes: 1 addition & 1 deletion ci/run.sh
@@ -39,7 +39,7 @@ sd=`dirname $0`
cd $sd/../
SRC=`pwd`

CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=OFF"
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON"

if [ ! -z ${GG_BUILD_METAL} ]; then
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
16 changes: 12 additions & 4 deletions common/common.cpp
@@ -767,6 +767,9 @@ bool fs_validate_filename(const std::string & filename) {
return true;
}

#include <iostream>


// returns true if successful, false otherwise
bool fs_create_directory_with_parents(const std::string & path) {
#ifdef _WIN32
@@ -784,11 +787,18 @@ bool fs_create_directory_with_parents(const std::string & path) {
// process path from front to back, procedurally creating directories
while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
const std::wstring subpath = wpath.substr(0, pos_slash);
const wchar_t * test = subpath.c_str();
pos_slash += 1;
// skip the drive letter, in some systems it can return an access denied error
if (subpath.length() == 2 && subpath[1] == ':') {
continue;
}

const bool success = CreateDirectoryW(subpath.c_str(), NULL);

const bool success = CreateDirectoryW(test, NULL);
std::wcout << "CreateDirectoryW " << subpath << " returned: " << (success ? "true" : "false") << std::endl;
if (!success) {
const DWORD error = GetLastError();
std::wcout << "GetLastError returned: " << error << std::endl;

// if the path already exists, ensure that it's a directory
if (error == ERROR_ALREADY_EXISTS) {
@@ -800,8 +810,6 @@ bool fs_create_directory_with_parents(const std::string & path) {
return false;
}
}

pos_slash += 1;
}

return true;
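
Note on the `fs_create_directory_with_parents` hunks above: the added check skips bare drive-letter prefixes such as `C:`, which `CreateDirectoryW` can reject with an access-denied error on some systems. Below is a minimal, portable sketch (not part of the PR, and deliberately not calling the Win32 API) of the same subpath walk; the sample path and the `std::cout` reporting are assumptions for illustration only.

```cpp
// Simplified illustration: walks a Windows-style path the same way the
// patched loop does and prints which prefixes would be created. The real
// function calls CreateDirectoryW and inspects GetLastError instead.
#include <iostream>
#include <string>

int main() {
    const std::string path = "C:\\Users\\me\\.cache\\llama.cpp\\"; // hypothetical input
    size_t pos_slash = 1;
    while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
        const std::string subpath = path.substr(0, pos_slash);
        pos_slash += 1;
        // a bare drive letter ("C:") can make CreateDirectoryW fail with
        // access denied on some systems, so the patched loop skips it
        if (subpath.length() == 2 && subpath[1] == ':') {
            std::cout << "skip:   " << subpath << '\n';
            continue;
        }
        std::cout << "create: " << subpath << '\n';
    }
    return 0;
}
```
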
8 changes: 1 addition & 7 deletions ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -53,7 +53,6 @@
#include "ggml-cpu-impl.h"
#include "ggml-quants.h"

#include <atomic>
#include <array>
#include <type_traits>

@@ -394,8 +393,6 @@ class tinyBLAS {

template <int RM, int RN, int BM>
NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) {
static std::atomic<int64_t> current_chunk;

GGML_ASSERT(m % (RM * BM) == 0);
const int64_t ytiles = m / (RM * BM);
const int64_t xtiles = (n + RN -1) / RN;
@@ -409,8 +406,6 @@

if (params->ith == 0) {
GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles);
// Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start.
std::atomic_store_explicit(&current_chunk, (int64_t)params->nth, std::memory_order_relaxed);
}

ggml_barrier(params->threadpool);
@@ -439,8 +434,7 @@
GGML_ASSERT(jj == jj2);
}

// next step.
job = std::atomic_fetch_add_explicit(&current_chunk, (int64_t)1, std::memory_order_relaxed);
job += params->nth;
}

ggml_barrier(params->threadpool);
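
Note on the `sgemm.cpp` hunks above: the PR drops the function-local `static std::atomic<int64_t>` work counter, a single counter shared by every concurrent call into this code path, which could interfere once independent contexts run matrix multiplications in parallel, and returns to a fixed-stride schedule where thread `ith` processes chunks `ith, ith + nth, ith + 2*nth, ...`. The following standalone sketch (not the tinyBLAS code; thread and chunk counts are assumed) shows that fixed-stride scheme.

```cpp
// Illustration only: the fixed-stride chunk schedule the diff restores.
// Each thread starts at its own index `ith` and advances by the thread
// count `nth`, so no shared counter is needed.
#include <cstdint>
#include <cstdio>

static void run_thread(int ith, int nth, int64_t n_chunks) {
    for (int64_t job = ith; job < n_chunks; job += nth) {
        std::printf("thread %d handles chunk %lld\n", ith, (long long) job);
    }
}

int main() {
    const int     nth      = 4;   // assumed thread count
    const int64_t n_chunks = 10;  // assumed number of tiles
    for (int ith = 0; ith < nth; ++ith) {
        run_thread(ith, nth, n_chunks);
    }
    return 0;
}
```
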
2 changes: 1 addition & 1 deletion src/llama.cpp
@@ -197,7 +197,7 @@ static struct llama_model * llama_model_load_from_file_impl(
}

// if using single GPU mode, remove all except the main GPU
if (params.split_mode == LLAMA_SPLIT_MODE_NONE) {
if (params.split_mode == LLAMA_SPLIT_MODE_NONE && !model->devices.empty() && params.main_gpu >= 0) {
if (params.main_gpu < 0 || params.main_gpu >= (int)model->devices.size()) {
LLAMA_LOG_ERROR("%s: invalid value for main_gpu: %d (available devices: %d)\n", __func__, params.main_gpu, (int)model->devices.size());
llama_model_free(model);
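
Note on the `llama.cpp` hunk above: the single-GPU filtering now runs only when the device list is non-empty and `main_gpu` is non-negative, so passing a negative `main_gpu` with `LLAMA_SPLIT_MODE_NONE` (as the new test does for its CPU model copy) is no longer rejected as an invalid value. A minimal caller sketch of that combination follows; the model path is a placeholder and error handling is kept to a minimum.

```cpp
// Sketch of the parameter combination the relaxed check is meant to accept.
// "model.gguf" is a placeholder path.
#include "llama.h"
#include <cstdio>

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
    mparams.main_gpu   = -1; // negative: skip the single-GPU device filtering

    llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        std::fprintf(stderr, "failed to load model\n");
        llama_backend_free();
        return 1;
    }

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```
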
2 changes: 2 additions & 0 deletions tests/CMakeLists.txt
@@ -151,6 +151,8 @@ llama_build_and_test(test-json-partial.cpp)
llama_build_and_test(test-log.cpp)
llama_build_and_test(test-regex-partial.cpp)

llama_build_and_test(test-thread-safety.cpp ARGS -hf ggml-org/models -hff tinyllamas/stories15M-q4_0.gguf -ngl 99 -p "The meaning of life is" -n 128 -c 256 -ub 32 -np 4)

# this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135)
if (NOT WIN32)
llama_build_and_test(test-arg-parser.cpp)
150 changes: 150 additions & 0 deletions tests/test-thread-safety.cpp
@@ -0,0 +1,150 @@
// thread safety test
// - Loads a copy of the same model on each GPU, plus a copy on the CPU
// - Creates n_parallel (--parallel) contexts per model
// - Runs inference in parallel on each context

#include <thread>
#include <vector>
#include <atomic>
#include "llama.h"
#include "arg.h"
#include "common.h"
#include "log.h"
#include "sampling.h"

int main(int argc, char ** argv) {
common_params params;

if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}

common_init();

llama_backend_init();
llama_numa_init(params.numa);

LOG_INF("%s\n", common_params_get_system_info(params).c_str());

//llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
// if (level == GGML_LOG_LEVEL_ERROR) {
// common_log_add(common_log_main(), level, "%s", text);
// }
//}, NULL);

auto mparams = common_model_params_to_llama(params);
auto cparams = common_context_params_to_llama(params);

int dev_count = ggml_backend_dev_count();
int gpu_dev_count = 0;
for (int i = 0; i < dev_count; ++i) {
auto * dev = ggml_backend_dev_get(i);
if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
gpu_dev_count++;
}
}
const int num_models = gpu_dev_count + 1; // GPUs + 1 CPU model
//const int num_models = std::max(1, gpu_dev_count);
const int num_contexts = std::max(1, params.n_parallel);

struct model_context {
llama_model_ptr model;
std::vector<llama_context_ptr> contexts;
std::vector<std::unique_ptr<common_sampler, decltype(&common_sampler_free)>> samplers;
};

std::vector<model_context> models;
std::vector<std::thread> threads;
std::atomic<bool> failed = false;

for (int m = 0; m < num_models; ++m) {
model_context this_model;

mparams.split_mode = LLAMA_SPLIT_MODE_NONE;
mparams.main_gpu = m < gpu_dev_count ? m : -1;

llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams);
if (model == NULL) {
LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str());
return 1;
}

this_model.model.reset(model);

for (int c = 0; c < num_contexts; ++c) {
LOG_INF("Creating context %d/%d for model %d/%d\n", c + 1, num_contexts, m + 1, num_models);
llama_context * ctx = llama_init_from_model(model, cparams);
if (ctx == NULL) {
LOG_ERR("%s: failed to create context\n", __func__);
return 1;
}
this_model.contexts.emplace_back(ctx);

common_sampler * sampler = common_sampler_init(model, params.sampling);
if (sampler == NULL) {
LOG_ERR("%s: failed to create sampler\n", __func__);
return 1;
}
this_model.samplers.emplace_back(sampler, common_sampler_free);

threads.emplace_back([model, ctx, sampler, &params, &failed, m, c, num_models, num_contexts]() {
llama_batch batch = {};
{
auto prompt = common_tokenize(ctx, params.prompt, true);
if (prompt.empty()) {
LOG_ERR("failed to tokenize prompt\n");
failed.store(true);
return;
}
batch = llama_batch_get_one(prompt.data(), prompt.size());
if (llama_decode(ctx, batch)) {
LOG_ERR("failed to decode prompt\n");
failed.store(true);
return;
}
}

const auto * vocab = llama_model_get_vocab(model);
std::string result = params.prompt;

for (int i = 0; i < params.n_predict; i++) {
llama_token token;
if (batch.n_tokens > 0) {
token = common_sampler_sample(sampler, ctx, batch.n_tokens - 1);
} else {
token = llama_vocab_bos(vocab);
}

if (llama_vocab_is_eog(vocab, token)) {
break;
}
result += common_token_to_piece(ctx, token);

batch = llama_batch_get_one(&token, 1);
if (llama_decode(ctx, batch)) {
LOG_ERR("failed to decode\n");
failed.store(true);
return;
}
}

LOG_INF("Model %d/%d, Context %d/%d: Result: '%s'\n", m + 1, num_models, c + 1, num_contexts, result.c_str());
});

}

models.emplace_back(std::move(this_model));
}

for (auto & thread : threads) {
thread.join();
}

if (failed) {
LOG_ERR("One or more threads failed.\n");
return 1;
}

LOG_INF("All threads completed successfully.\n");
return 0;
}