@@ -857,21 +857,23 @@ struct common_init_result common_init_from_params(common_params & params) {
         return iparams;
     }

+    const llama_vocab * vocab = llama_model_get_vocab(model);
+
     if (params.reranking) {
         bool ok = true;

-        if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+        if (llama_vocab_bos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a BOS token, reranking will not work\n", __func__);
             ok = false;
         }

-        if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+        if (llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have an EOS token, reranking will not work\n", __func__);
             ok = false;
         }

-        if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
-            LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+        if (llama_vocab_sep(vocab) == LLAMA_TOKEN_NULL) {
+            LOG_WRN("%s: warning: vocab does not have a SEP token, reranking will not work\n", __func__);
             ok = false;
         }

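Note: a minimal sketch of the call pattern this hunk moves to, for reference: special-token queries now go through a llama_vocab handle obtained from the model instead of the model itself. It assumes a loaded llama_model * model and is illustrative only, not part of the patch.

    const llama_vocab * vocab = llama_model_get_vocab(model);

    const llama_token bos = llama_vocab_bos(vocab); // LLAMA_TOKEN_NULL if the vocab has no BOS token
    const llama_token eos = llama_vocab_eos(vocab);
    const llama_token sep = llama_vocab_sep(vocab);

    if (bos == LLAMA_TOKEN_NULL || eos == LLAMA_TOKEN_NULL || sep == LLAMA_TOKEN_NULL) {
        // a feature that needs all three (e.g. reranking) has to be disabled here
    }
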
@@ -884,7 +886,7 @@ struct common_init_result common_init_from_params(common_params & params) {

     auto cparams = common_context_params_to_llama(params);

-    llama_context * lctx = llama_new_context_with_model(model, cparams);
+    llama_context * lctx = llama_init_from_model(model, cparams);
     if (lctx == NULL) {
         LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_model_free(model);
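For context, a sketch of the renamed context-creation call under the same assumption of a loaded llama_model * model: the only change is llama_new_context_with_model becoming llama_init_from_model, with teardown still freeing the context before the model.

    llama_context_params cparams = llama_context_default_params();

    llama_context * lctx = llama_init_from_model(model, cparams);
    if (lctx == NULL) {
        llama_model_free(model);
        // report the error to the caller
    }

    // ... use lctx ...

    llama_free(lctx);        // free the context first
    llama_model_free(model); // then the model
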
@@ -898,7 +900,7 @@ struct common_init_result common_init_from_params(common_params & params) {

     if (!params.control_vectors.empty()) {
         if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
-        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+        if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_model_n_layer(model);

         const auto cvec = common_control_vector_load(params.control_vectors);
         if (cvec.n_embd == -1) {
@@ -908,12 +910,13 @@ struct common_init_result common_init_from_params(common_params & params) {
             return iparams;
         }

-        int err = llama_control_vector_apply(lctx,
-                                             cvec.data.data(),
-                                             cvec.data.size(),
-                                             cvec.n_embd,
-                                             params.control_vector_layer_start,
-                                             params.control_vector_layer_end);
+        int err = llama_apply_adapter_cvec(
+                lctx,
+                cvec.data.data(),
+                cvec.data.size(),
+                cvec.n_embd,
+                params.control_vector_layer_start,
+                params.control_vector_layer_end);
         if (err) {
             llama_free(lctx);
             llama_model_free(model);
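A sketch of the renamed control-vector call, assuming a context lctx and an embedding size from llama_model_n_embd (an assumed accessor, mirroring the llama_model_n_layer call used above). The buffer holds one n_embd-sized direction per targeted layer; passing a null buffer is expected to clear the current control vector, as with the old API.

    const int32_t n_embd  = llama_model_n_embd(model);  // assumed accessor
    const int32_t n_layer = llama_model_n_layer(model);

    std::vector<float> data((size_t) n_embd * n_layer, 0.0f); // placeholder directions

    const int err = llama_apply_adapter_cvec(lctx, data.data(), data.size(), n_embd, 1, n_layer);
    if (err) {
        // the context rejected the control vector (e.g. mismatched n_embd)
    }

    llama_apply_adapter_cvec(lctx, nullptr, 0, n_embd, 0, 0); // presumably disables the control vector
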
@@ -924,8 +927,8 @@ struct common_init_result common_init_from_params(common_params & params) {

     // load and optionally apply lora adapters
     for (auto & la : params.lora_adapters) {
-        llama_lora_adapter_ptr lora;
-        lora.reset(llama_lora_adapter_init(model, la.path.c_str()));
+        llama_adapter_lora_ptr lora;
+        lora.reset(llama_adapter_lora_init(model, la.path.c_str()));
         if (lora == nullptr) {
             LOG_ERR("%s: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
@@ -938,17 +941,17 @@ struct common_init_result common_init_from_params(common_params & params) {
     }

     if (!params.lora_init_without_apply) {
-        common_lora_adapters_apply(lctx, params.lora_adapters);
+        common_set_adapter_lora(lctx, params.lora_adapters);
     }

-    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
-        LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
+    if (params.sampling.ignore_eos && llama_vocab_eos(vocab) == LLAMA_TOKEN_NULL) {
+        LOG_WRN("%s: warning: vocab does not have an EOS token, ignoring --ignore-eos\n", __func__);
         params.sampling.ignore_eos = false;
     }

     if (params.sampling.ignore_eos) {
-        for (llama_token i = 0; i < llama_n_vocab(model); i++) {
-            if (llama_token_is_eog(model, i)) {
+        for (llama_token i = 0; i < llama_vocab_n_tokens(vocab); i++) {
+            if (llama_vocab_is_eog(vocab, i)) {
                 LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY);
                 params.sampling.logit_bias.push_back({i, -INFINITY});
             }
@@ -969,8 +972,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__);

         std::vector<llama_token> tmp;
-        llama_token bos = llama_token_bos(model);
-        llama_token eos = llama_token_eos(model);
+        llama_token bos = llama_vocab_bos(vocab);
+        llama_token eos = llama_vocab_eos(vocab);
+
         // some models (e.g. T5) don't have a BOS token
         if (bos != LLAMA_TOKEN_NULL) {
             tmp.push_back(bos);
@@ -1005,11 +1009,11 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }

-void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_info> & lora) {
-    llama_lora_adapter_clear(ctx);
+void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
+    llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
         if (la.scale != 0.0f) {
-            llama_lora_adapter_set(ctx, la.ptr, la.scale);
+            llama_set_adapter_lora(ctx, la.ptr, la.scale);
         }
     }
 }
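A sketch of the renamed LoRA adapter calls, assuming a loaded model and context; the adapter path is hypothetical. The adapter is attached per context with a scale and removed with the clear call.

    llama_adapter_lora * adapter = llama_adapter_lora_init(model, "/path/to/lora.gguf"); // hypothetical path
    if (adapter == nullptr) {
        // loading the adapter failed
    }

    llama_set_adapter_lora(lctx, adapter, 0.75f); // attach with a scale
    // ... run inference ...
    llama_clear_adapter_lora(lctx);               // detach all adapters from this context
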
@@ -1559,21 +1563,23 @@ std::vector<llama_token> common_tokenize(
         const std::string & text,
         bool add_special,
         bool parse_special) {
-    return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_tokenize(vocab, text, add_special, parse_special);
 }

 std::vector<llama_token> common_tokenize(
-    const struct llama_model * model,
+    const struct llama_vocab * vocab,
         const std::string & text,
         bool add_special,
         bool parse_special) {
     // upper limit for the number of tokens
     int n_tokens = text.length() + 2 * add_special;
     std::vector<llama_token> result(n_tokens);
-    n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+    n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
+        int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
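For reference, how a caller might use the new overload (illustrative, assuming a loaded model and a context lctx; the existing context-based overload keeps working and now forwards to this one):

    const llama_vocab * vocab = llama_model_get_vocab(model);

    // tokenize with the vocab directly ...
    std::vector<llama_token> toks = common_tokenize(vocab, "Hello world", /*add_special=*/true, /*parse_special=*/false);

    // ... or keep passing the context, which resolves the vocab internally
    std::vector<llama_token> same = common_tokenize(lctx, "Hello world", true, false);
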
@@ -1582,12 +1588,18 @@ std::vector<llama_token> common_tokenize(
 }

 std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_token_to_piece(vocab, token, special);
+}
+
+std::string common_token_to_piece(const struct llama_vocab * vocab, llama_token token, bool special) {
     std::string piece;
     piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
-    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
     if (n_chars < 0) {
         piece.resize(-n_chars);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
         GGML_ASSERT(check == -n_chars);
     }
     else {
@@ -1597,13 +1609,19 @@ std::string common_token_to_piece(const struct llama_context * ctx, llama_token
     return piece;
 }

-std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(const struct llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    const llama_model * model = llama_get_model(ctx);
+    const llama_vocab * vocab = llama_model_get_vocab(model);
+    return common_detokenize(vocab, tokens, special);
+}
+
+std::string common_detokenize(const struct llama_vocab * vocab, const std::vector<llama_token> & tokens, bool special) {
     std::string text;
     text.resize(std::max(text.capacity(), tokens.size()));
-    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    int32_t n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
     if (n_chars < 0) {
         text.resize(-n_chars);
-        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        n_chars = llama_detokenize(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
         GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
     }

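A small round-trip sketch over the new vocab-based helpers (illustrative; assumes a loaded model and the usual includes):

    const llama_vocab * vocab = llama_model_get_vocab(model);

    const std::vector<llama_token> tokens = common_tokenize(vocab, "The quick brown fox", true, false);

    for (const llama_token t : tokens) {
        LOG_INF("%d -> '%s'\n", t, common_token_to_piece(vocab, t, /*special=*/true).c_str());
    }

    const std::string text = common_detokenize(vocab, tokens, /*special=*/true);
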
@@ -1631,7 +1649,7 @@ std::string common_get_builtin_chat_template(const struct llama_model * model) {

 bool common_chat_verify_template(const std::string & tmpl) {
     llama_chat_message chat[] = {{"user", "test"}};
-    int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
     return res >= 0;
 }

@@ -1642,35 +1660,34 @@ std::string common_chat_apply_template(const struct llama_model * model,
     int alloc_size = 0;
     bool fallback = false; // indicate if we must fallback to default chatml
     std::vector<llama_chat_message> chat;
-    for (auto & msg : msgs) {
+    for (const auto & msg : msgs) {
         chat.push_back({msg.role.c_str(), msg.content.c_str()});
         alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
     }

-    const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
+    const char * ptr_tmpl = tmpl.empty() ? llama_model_chat_template(model) : tmpl.c_str();
     std::vector<char> buf(alloc_size);

     // run the first time to get the total output length
-    int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+    int32_t res = llama_chat_apply_template(ptr_tmpl, chat.data(), chat.size(), add_ass, buf.data(), buf.size());

     // error: chat template is not supported
     if (res < 0) {
         if (ptr_tmpl != nullptr) {
             // if the custom "tmpl" is not supported, we throw an error
             // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
             throw std::runtime_error("this custom template is not supported");
-        } else {
-            // If the built-in template is not supported, we default to chatml
-            res = llama_chat_apply_template(nullptr, "chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
-            fallback = true;
         }
+
+        // If the built-in template is not supported, we default to chatml
+        res = llama_chat_apply_template("chatml", chat.data(), chat.size(), add_ass, buf.data(), buf.size());
+        fallback = true;
     }

     // if it turns out that our buffer is too small, we resize it
     if ((size_t) res > buf.size()) {
         buf.resize(res);
         res = llama_chat_apply_template(
-            fallback ? nullptr : model,
             fallback ? "chatml" : ptr_tmpl,
             chat.data(), chat.size(), add_ass, buf.data(), buf.size());
     }
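To close the loop on the new template flow, a sketch under the same assumption of a loaded model (error handling is left to the caller): the template string now comes from llama_model_chat_template(model), which may be null, and llama_chat_apply_template no longer takes the model at all.

    const char * tmpl = llama_model_chat_template(model); // may be nullptr if the GGUF carries no template

    llama_chat_message msgs[] = {
        {"system", "You are a helpful assistant."},
        {"user",   "Hello!"},
    };

    std::vector<char> buf(1024);
    int32_t res = llama_chat_apply_template(tmpl, msgs, 2, /*add_ass=*/true, buf.data(), buf.size());
    if (res > (int32_t) buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(tmpl, msgs, 2, true, buf.data(), buf.size());
    }

    const std::string prompt = res >= 0 ? std::string(buf.data(), res) : ""; // res < 0 means the template is unsupported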