Skip to content

Commit ae2e4f8

Browse files
committed
name the tokenizer methods cleaner: encode and decode
1 parent c74456f commit ae2e4f8

File tree

1 file changed

+4
-4
lines changed

1 file changed

+4
-4
lines changed

run.c

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -381,7 +381,7 @@ void free_tokenizer(Tokenizer* t) {
381381
free(t->vocab_scores);
382382
}
383383

384-
char* get_piece(Tokenizer* t, int prev_token, int token) {
384+
char* decode(Tokenizer* t, int prev_token, int token) {
385385
char *piece = t->vocab[token];
386386
// following BOS (1) token, sentencepiece decoder strips any leading whitespace (see PR #89)
387387
if (prev_token == 1 && piece[0] == ' ') { piece++; }
@@ -414,7 +414,7 @@ int str_lookup(char *str, TokenIndex *sorted_vocab, int vocab_size) {
414414
return res != NULL ? res->id : -1;
415415
}
416416

417-
void bpe_encode(Tokenizer* t, char *text, int *tokens, int *n_tokens) {
417+
void encode(Tokenizer* t, char *text, int *tokens, int *n_tokens) {
418418
// encode the string text (input) into an upper-bound preallocated tokens[] array
419419

420420
// sort vocabulary
@@ -694,7 +694,7 @@ int main(int argc, char *argv[]) {
694694
int num_prompt_tokens = 0;
695695
if (prompt != NULL) {
696696
prompt_tokens = (int*)malloc((strlen(prompt)+1) * sizeof(int));
697-
bpe_encode(&tokenizer, prompt, prompt_tokens, &num_prompt_tokens);
697+
encode(&tokenizer, prompt, prompt_tokens, &num_prompt_tokens);
698698
}
699699

700700
// start the main loop
@@ -737,7 +737,7 @@ int main(int argc, char *argv[]) {
737737
if (next == 1) { break; }
738738

739739
// print the token as string, decode it with the Tokenizer object
740-
char* piece = get_piece(&tokenizer, token, next);
740+
char* piece = decode(&tokenizer, token, next);
741741
printf("%s", piece);
742742
fflush(stdout);
743743
token = next;

0 commit comments

Comments
 (0)