Commit afa8a9e

Authored by ggerganov and slaren
llama : add llama_vocab, functions -> methods, naming (ggml-org#11110)
* llama : functions -> methods (ggml-org#11110)
* llama : add struct llama_vocab to the API (ggml-org#11156)
* hparams : move vocab params to llama_vocab (ggml-org#11159)
* vocab : more pimpl (ggml-org#11165)
* vocab : minor tokenization optimizations (ggml-org#11160)
* lora : update API names (ggml-org#11167)
* llama : update API names to use correct prefix (ggml-org#11174)
* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (ggml-org#11174)
* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (ggml-org#11174)

Co-authored-by: Diego Devesa <[email protected]>
1 parent c05e8c9 commit afa8a9e

68 files changed, +6015 −5560 lines
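For call sites being migrated, here is a minimal sketch of the renamed entry points touched by this commit. The model path is a placeholder and llama_model_load_from_file is assumed from the related model-API renames (it is not part of this diff), so treat this as illustrative rather than a drop-in snippet:

// sketch: the vocab is now a first-class API object obtained from the model
#include "llama.h"
#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_model_load_from_file("model.gguf", mparams); // placeholder path
    if (model == NULL) {
        return 1;
    }

    const llama_vocab * vocab = llama_model_get_vocab(model);
    printf("n_tokens = %d, n_layer = %d\n", llama_vocab_n_tokens(vocab), llama_model_n_layer(model));

    // llama_new_context_with_model -> llama_init_from_model
    llama_context_params cparams = llama_context_default_params();
    llama_context * ctx = llama_init_from_model(model, cparams);

    llama_free(ctx);
    llama_model_free(model);
    return 0;
}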

common/common.cpp (+61 −44)
@@ -857,21 +857,23 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }
 
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;
 
-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }
 
-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }
 
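In isolation, the checks above reduce to vocab queries that return LLAMA_TOKEN_NULL when a special token is absent; a small sketch with a hypothetical helper name:

// sketch: hypothetical helper; mirrors the BOS/EOS/SEP checks above
#include "llama.h"

static bool vocab_supports_reranking(const llama_vocab * vocab) {
    return llama_vocab_bos(vocab) != LLAMA_TOKEN_NULL &&
           llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL &&
           llama_vocab_sep(vocab) != LLAMA_TOKEN_NULL;
}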
@@ -884,7 +886,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     auto cparams = common_context_params_to_llama(params);
 
-    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_model_free(model);
@@ -898,7 +900,7 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);
 
         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
@@ -908,12 +910,13 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }
 
-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_model_free(model);
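The renamed control-vector call, pulled out as a standalone sketch; the data layout is assumed to be the flat per-layer buffer that common_control_vector_load produces, as in the hunk above:

// sketch: llama_control_vector_apply -> llama_apply_adapter_cvec
#include "llama.h"
#include <vector>

static bool apply_control_vector(llama_context * ctx, const std::vector<float> & data,
                                 int32_t n_embd, int32_t il_start, int32_t il_end) {
    const int err = llama_apply_adapter_cvec(
            ctx,
            data.data(),      // flat buffer of per-layer direction vectors
            data.size(),
            n_embd,
            il_start,         // first layer to modify
            il_end);          // last layer to modify
    return err == 0;
}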
@@ -924,8 +927,8 @@ struct common_init_result common_init_from_params(common_params & params) {
 
     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
        if (lora == nullptr) {
            LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
            llama_free(lctx);
@@ -938,17 +941,17 @@ struct common_init_result common_init_from_params(common_params & params) {
     }
 
     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }
 
-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }
 
     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                params.sampling.logit_bias.push_back({i, -INFINITY});
            }
@@ -969,8 +972,9 @@ struct common_init_result common_init_from_params(common_params & params) {
        LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);
 
        std::vector<llama_token> tmp;
-       llama_token bos = llama_token_bos(model);
-       llama_token eos = llama_token_eos(model);
+       llama_token bos = llama_vocab_bos(vocab);
+       llama_token eos = llama_vocab_eos(vocab);
+
        // some models (e.g. T5) don't have a BOS token
        if (bos != LLAMA_TOKEN_NULL) {
            tmp.push_back(bos);
@@ -1005,11 +1009,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
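The same LoRA flow as a standalone sketch with the new llama_adapter_lora_* names; the adapter path and scale are supplied by the caller:

// sketch: llama_lora_adapter_init/set/clear -> llama_adapter_lora_init,
//         llama_set_adapter_lora, llama_clear_adapter_lora
#include "llama.h"

static bool load_and_apply_lora(llama_model * model, llama_context * ctx,
                                const char * path, float scale) {
    llama_adapter_lora * adapter = llama_adapter_lora_init(model, path);
    if (adapter == nullptr) {
        return false; // failed to load the adapter file
    }

    llama_clear_adapter_lora(ctx);               // drop previously applied adapters
    llama_set_adapter_lora(ctx, adapter, scale); // apply with the user-defined scale
    return true;
}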
@@ -1559,21 +1563,23 @@ std::vector<llama_token> common_tokenize(
     const std::string & text,
     bool add_special,
     bool parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }
 
 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
     const std::string & text,
     bool add_special,
     bool parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
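The negative-return convention used above also works directly against the vocab-based llama_tokenize; a minimal sketch of the two-pass pattern:

// sketch: a negative return value is the required token count,
// so the buffer is resized and the call retried once
#include "llama.h"
#include <string>
#include <vector>

static std::vector<llama_token> tokenize(const llama_vocab * vocab, const std::string & text,
                                         bool add_special, bool parse_special) {
    std::vector<llama_token> tokens(text.length() + 2 * add_special); // upper bound
    int n = llama_tokenize(vocab, text.data(), text.length(),
                           tokens.data(), tokens.size(), add_special, parse_special);
    if (n < 0) {
        tokens.resize(-n);
        n = llama_tokenize(vocab, text.data(), text.length(),
                           tokens.data(), tokens.size(), add_special, parse_special);
    }
    tokens.resize(n);
    return tokens;
}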
@@ -1582,12 +1588,18 @@ std::vector<llama_token> common_tokenize(
 }
 
 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1597,13 +1609,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }
 
-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }
 
@@ -1631,7 +1649,7 @@ std::string common_get_builtin_chat_template(const struct llama_model * model) {
 
 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }
 
@@ -1642,35 +1660,34 @@ std::string common_chat_apply_template(const struct llama_model * model,
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }
 
-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);
 
     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
 
     // error: chat template is not supported
     if (res < 0) {
         if (ptr_tmpl != nullptr) {
             // if the custom "tmpl" is not supported, we throw an error
             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
             throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
         }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
     }
 
     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
         res = llama_chat_apply_template(
-            fallback ? nullptr : model,
             fallback ? "chatml" : ptr_tmpl,
             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
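The template call no longer takes a model pointer; the built-in template string is obtained separately with llama_model_chat_template(model). A minimal sketch with placeholder messages:

// sketch: llama_chat_apply_template now receives only the template string
#include "llama.h"
#include <string>
#include <vector>

static std::string render_chat(const llama_model * model, bool add_ass) {
    const llama_chat_message chat[] = {
        { "system", "You are a helpful assistant." }, // placeholder messages
        { "user",   "Hello!"                       },
    };
    const char * tmpl = llama_model_chat_template(model); // built-in template, may be NULL

    std::vector<char> buf(1024);
    int32_t res = llama_chat_apply_template(tmpl, chat, 2, add_ass, buf.data(), buf.size());
    if (res > (int32_t) buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(tmpl, chat, 2, add_ass, buf.data(), buf.size());
    }
    return res < 0 ? std::string() : std::string(buf.data(), res);
}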

common/common.h (+18 −8)
@@ -24,11 +24,11 @@
 
 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
 
-struct common_lora_adapter_info {
+struct common_adapter_lora_info {
     std::string path;
     float scale;
 
-    struct llama_lora_adapter * ptr;
+    struct llama_adapter_lora * ptr;
 };
 
 using llama_tokens = std::vector<llama_token>;
@@ -246,8 +246,8 @@ struct common_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
 
-    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
+    std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
 
     std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale
 
@@ -481,7 +481,7 @@ struct common_init_result {
     llama_model_ptr model;
     llama_context_ptr context;
 
-    std::vector<llama_lora_adapter_ptr> lora;
+    std::vector<llama_adapter_lora_ptr> lora;
 };
 
 struct common_init_result common_init_from_params(common_params & params);
@@ -503,7 +503,7 @@ struct llama_model * common_load_model_from_hf(
     const struct llama_model_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
 //
 // Batch utils
@@ -541,7 +541,7 @@ std::vector<llama_token> common_tokenize(
     bool parse_special = false);
 
 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
     const std::string & text,
     bool add_special,
     bool parse_special = false);
@@ -553,11 +553,21 @@ std::string common_token_to_piece(
     llama_token token,
     bool special = true);
 
+std::string common_token_to_piece(
+    const struct llama_vocab * vocab,
+    llama_token token,
+    bool special = true);
+
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
 std::string common_detokenize(
-    llama_context * ctx,
+    const struct llama_context * ctx,
+    const std::vector<llama_token> & tokens,
+    bool special = true);
+
+std::string common_detokenize(
+    const struct llama_vocab * vocab,
     const std::vector<llama_token> & tokens,
     bool special = true);
 

common/sampling.cpp (+11 −6)
@@ -113,7 +113,10 @@ struct common_sampler {
     void set_logits(struct llama_context * ctx, int idx) {
         const auto * logits = llama_get_logits_ith(ctx, idx);
 
-        const int n_vocab = llama_n_vocab(llama_get_model(ctx));
+        const llama_model * model = llama_get_model(ctx);
+        const llama_vocab * vocab = llama_model_get_vocab(model);
+
+        const int n_vocab = llama_vocab_n_tokens(vocab);
 
         cur.resize(n_vocab);
 
@@ -142,13 +145,15 @@ std::string common_params_sampling::print() const {
 }
 
 struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
     lparams.no_perf = params.no_perf;
 
     auto * result = new common_sampler {
         /* .params = */ params,
-        /* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
+        /* .grmr = */ llama_sampler_init_grammar(vocab, params.grammar.c_str(), "root"),
         /* .chain = */ llama_sampler_chain_init(lparams),
         /* .prev = */ ring_buffer<llama_token>(std::max(32, params.n_prev)),
         /* .cur = */ {},
@@ -157,7 +162,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
 
     llama_sampler_chain_add(result->chain,
             llama_sampler_init_logit_bias(
-                llama_n_vocab(model),
+                llama_vocab_n_tokens(vocab),
                 params.logit_bias.size(),
                 params.logit_bias.data()));
 
@@ -172,7 +177,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                     c_breakers.push_back(str.c_str());
                 }
 
-                llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
+                llama_sampler_chain_add(result->chain, llama_sampler_init_dry (vocab, llama_model_n_ctx_train(model), params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
             }
             break;
         case COMMON_SAMPLER_TYPE_TOP_K:
@@ -194,7 +199,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
             break;
         case COMMON_SAMPLER_TYPE_INFILL:
-            llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model));
+            llama_sampler_chain_add(result->chain, llama_sampler_init_infill (vocab));
             break;
         case COMMON_SAMPLER_TYPE_PENALTIES:
             llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
@@ -206,7 +211,7 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
         llama_sampler_chain_add(result->chain, llama_sampler_init_dist(params.seed));
     } else if (params.mirostat == 1) {
         llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
-        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_n_vocab(model), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
+        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat(llama_vocab_n_tokens(vocab), params.seed, params.mirostat_tau, params.mirostat_eta, 100));
    } else if (params.mirostat == 2) {
        llama_sampler_chain_add(result->chain, llama_sampler_init_temp(params.temp));
        llama_sampler_chain_add(result->chain, llama_sampler_init_mirostat_v2(params.seed, params.mirostat_tau, params.mirostat_eta));
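Taken together, the sampler constructors that previously took a llama_model * now take the llama_vocab * (or need no model at all); a minimal chain as a sketch, with illustrative parameter values only:

// sketch: building a small sampler chain against the vocab-based constructors
#include "llama.h"

static llama_sampler * make_sampler(const llama_model * model) {
    const llama_vocab * vocab = llama_model_get_vocab(model);

    llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    llama_sampler * chain = llama_sampler_chain_init(sparams);

    // illustrative values; real callers take these from common_params_sampling
    llama_sampler_chain_add(chain, llama_sampler_init_grammar(vocab, "root ::= [a-z]+", "root"));
    llama_sampler_chain_add(chain, llama_sampler_init_temp(0.8f));
    llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));

    return chain; // release with llama_sampler_free()
}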
