From 4e4b99b5441e8c7377847215b86ceb39486ccb18 Mon Sep 17 00:00:00 2001
From: Vulcan <93451215+trholding@users.noreply.github.com>
Date: Mon, 31 Jul 2023 17:00:25 +0530
Subject: [PATCH 1/6] run.c - Output buffering

Previously, much of the run time was spent writing to the screen, which is relatively slow.

With output buffering enabled, groups of computed tokens are written to a buffer, which is relatively fast, and the buffer is then flushed periodically to the screen/console, so more work can be performed between writes.

Testing with the smallest model, an interactive tokens/s speed-up ranging from ~14% on standard builds to ~84% on OpenMP builds was achieved.

Usage:

run <checkpoint_file> [temperature] [steps] [prompt] [buffer_tokens]

Where buffer_tokens is the number of tokens to be buffered.

Multiples of 2 seem to be ideal. 64 worked well for my use case on a low end machine.

The speed up may depend on model size and OS.

Example:

./run model.bin 0 0 "A car" 64
---
 run.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/run.c b/run.c
index d8f153eb..4ec08546 100644
--- a/run.c
+++ b/run.c
@@ -455,10 +455,11 @@ int main(int argc, char *argv[]) {
     float temperature = 0.9f; // e.g. 1.0, or 0.0
     int steps = 256;          // max number of steps to run for, 0: use seq_len
     char *prompt = NULL;      // prompt string
+    int buffertokens = 1;     // output token buffer size
 
     // 'checkpoint' is necessary arg
     if (argc < 2) {
-        printf("Usage: %s <checkpoint_file> [temperature] [steps] [prompt]\n", argv[0]);
+        printf("Usage: %s <checkpoint_file> [temperature] [steps] [prompt] [buffer_tokens]\n", argv[0]);
         return 1;
     }
     if (argc >= 2) {
@@ -474,6 +475,9 @@ int main(int argc, char *argv[]) {
     if (argc >= 5) {
         prompt = argv[4];
     }
+    if (argc >= 6) {
+        buffertokens = atoi(argv[5]);
+    }
 
     // seed rng with time. if you want deterministic behavior use temperature 0.0
     rng_seed = (unsigned int)time(NULL);
@@ -543,7 +547,11 @@ int main(int argc, char *argv[]) {
     int next;        // will store the next token in the sequence
     int token = 1;   // init with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
     int pos = 0;     // position in the sequence
+    int bufferflush = 1; // buffer flush after token counter 
+    char outbuff[2048]; // used for output buffering              
+    memset( outbuff, '\0', sizeof( outbuff )); // clear buffer area
     printf("<s>\n"); // explicit print the initial BOS token for stylistic symmetry reasons
+    setvbuf(stdout, outbuff, _IOFBF, 2048); // setup output buffering
     while (pos < steps) {
 
         // forward the transformer to get logits for the next token
@@ -570,7 +578,7 @@ int main(int argc, char *argv[]) {
         // following BOS token (1), sentencepiece decoder strips any leading whitespace (see PR #89)
         char *token_str = (token == 1 && vocab[next][0] == ' ') ? vocab[next]+1 : vocab[next];
         printf("%s", token_str);
-        fflush(stdout);
+        if (bufferflush==pos) { fflush(stdout); bufferflush+=buffertokens; } // flush after every n tokens
 
         // advance forward
         token = next;

From 4b4b90d57217e93cc164b536be0ecbee37cf4bcf Mon Sep 17 00:00:00 2001
From: Vulcan <93451215+trholding@users.noreply.github.com>
Date: Wed, 2 Aug 2023 22:20:52 +0530
Subject: [PATCH 2/6] Update run.c - Buffer size fix for edge cases

Increased output buffer from 2048 to 4096

This fixes an output buffer overflow and garbled output that occurred in rare cases where the output is larger than the buffer, such as when blocks of token sequences are repeated during inference.

./run stories110M.bin 0  0  "A big dog" 256
---
 run.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/run.c b/run.c
index 7b7691ee..fe258341 100644
--- a/run.c
+++ b/run.c
@@ -548,10 +548,10 @@ int main(int argc, char *argv[]) {
     int token = 1;   // init with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
     int pos = 0;     // position in the sequence
     int bufferflush = 1; // buffer flush after token counter 
-    char outbuff[2048]; // used for output buffering              
+    char outbuff[4096]; // used for output buffering              
     memset( outbuff, '\0', sizeof( outbuff )); // clear buffer area
     printf("<s>\n"); // explicit print the initial BOS token for stylistic symmetry reasons
-    setvbuf(stdout, outbuff, _IOFBF, 2048); // setup output buffering
+    setvbuf(stdout, outbuff, _IOFBF, 4096); // setup output buffering
     while (pos < steps) {
 
         // forward the transformer to get logits for the next token

From 24dddcc4466cf0705b949fdfa55d5013e3ca4346 Mon Sep 17 00:00:00 2001
From: Vulcan <93451215+trholding@users.noreply.github.com>
Date: Thu, 3 Aug 2023 02:24:40 +0530
Subject: [PATCH 3/6] run.c - remove memset

Making outbuff static zero-initializes it automatically, so the memset can be removed.

Ref: https://github.com/karpathy/llama2.c/pull/193#discussion_r1282272178
---
 run.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/run.c b/run.c
index fe258341..8f9ca5f1 100644
--- a/run.c
+++ b/run.c
@@ -548,8 +548,7 @@ int main(int argc, char *argv[]) {
     int token = 1;   // init with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
     int pos = 0;     // position in the sequence
     int bufferflush = 1; // buffer flush after token counter 
-    char outbuff[4096]; // used for output buffering              
-    memset( outbuff, '\0', sizeof( outbuff )); // clear buffer area
+    static char outbuff[4096]; // used for output buffering              
     printf("<s>\n"); // explicit print the initial BOS token for stylistic symmetry reasons
     setvbuf(stdout, outbuff, _IOFBF, 4096); // setup output buffering
     while (pos < steps) {
@@ -578,7 +577,7 @@ int main(int argc, char *argv[]) {
         // following BOS token (1), sentencepiece decoder strips any leading whitespace (see PR #89)
         char *token_str = (token == 1 && vocab[next][0] == ' ') ? vocab[next]+1 : vocab[next];
         printf("%s", token_str);
-        if (bufferflush==pos) { fflush(stdout); bufferflush+=buffertokens; } // flush after every n tokens
+        if (bufferflush==pos && strlen(outbuff)<=4096) { fflush(stdout); bufferflush+=buffertokens; } // flush after every n tokens
 
         // advance forward
         token = next;

From e9bf38301f1e832f7479287fb48e6f35bfab3d24 Mon Sep 17 00:00:00 2001
From: Vulcan <93451215+trholding@users.noreply.github.com>
Date: Thu, 3 Aug 2023 22:08:45 +0530
Subject: [PATCH 4/6] run.c - fixed output buffering

Fixed a buffer overflow bug by changing the hardcoded size of outbuff to one computed at runtime.
---
 run.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/run.c b/run.c
index 8f9ca5f1..403edd8c 100644
--- a/run.c
+++ b/run.c
@@ -455,7 +455,7 @@ int main(int argc, char *argv[]) {
     float temperature = 0.9f; // e.g. 1.0, or 0.0
     int steps = 256;          // max number of steps to run for, 0: use seq_len
     char *prompt = NULL;      // prompt string
-    int buffertokens = 1;     // output token buffer size
+    int buffertokens = 1;     // number of tokens to buffer before flushing to screen
 
     // 'checkpoint' is necessary arg
     if (argc < 2) {
@@ -547,10 +547,15 @@ int main(int argc, char *argv[]) {
     int next;        // will store the next token in the sequence
     int token = 1;   // init with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
     int pos = 0;     // position in the sequence
-    int bufferflush = 1; // buffer flush after token counter 
-    static char outbuff[4096]; // used for output buffering              
+    int bufferflush = 1; // token counter for flushing buffer
+    char outbuff[config.seq_len * (6 + 2)]; // buffersize is context length * average size of subwords + margin
     printf("<s>\n"); // explicit print the initial BOS token for stylistic symmetry reasons
-    setvbuf(stdout, outbuff, _IOFBF, 4096); // setup output buffering
+
+    // setvbuf is used to buffer output into outbuff instead of flushing to screen directly
+    if (setvbuf(stdout, outbuff, _IOFBF, sizeof(outbuff)) != 0) {
+    puts("Error: Buffer allocation!"); exit(EXIT_FAILURE);
+    }
+
     while (pos < steps) {
 
         // forward the transformer to get logits for the next token
@@ -576,8 +581,10 @@ int main(int argc, char *argv[]) {
 
         // following BOS token (1), sentencepiece decoder strips any leading whitespace (see PR #89)
         char *token_str = (token == 1 && vocab[next][0] == ' ') ? vocab[next]+1 : vocab[next];
+        
         printf("%s", token_str);
-        if (bufferflush==pos && strlen(outbuff)<=4096) { fflush(stdout); bufferflush+=buffertokens; } // flush after every n tokens
+        // flush output to screen after the defined number of buffertokens have accumulated
+        if (bufferflush==pos) { fflush(stdout); bufferflush+=buffertokens; } 
 
         // advance forward
         token = next;

From 4681774ccbc6e8c8b2f625efd58961051f43f331 Mon Sep 17 00:00:00 2001
From: Vulcan <93451215+trholding@users.noreply.github.com>
Date: Thu, 3 Aug 2023 23:13:55 +0530
Subject: [PATCH 5/6] run.c - fix windows builds

MSVC does not support variable-length arrays (VLAs).

outbuff size is now a hard coded value:

Context size of Meta Llama 2 * (avg token length + margin)

4096  * ( 6 + 2  )

meh
---
 run.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.c b/run.c
index 403edd8c..5c604856 100644
--- a/run.c
+++ b/run.c
@@ -548,7 +548,7 @@ int main(int argc, char *argv[]) {
     int token = 1;   // init with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
     int pos = 0;     // position in the sequence
     int bufferflush = 1; // token counter for flushing buffer
-    char outbuff[config.seq_len * (6 + 2)]; // buffersize is context length * average size of subwords + margin
+    char outbuff[4096 * (6 + 2)] ; // buffersize is context length * average size of subwords + margin
     printf("<s>\n"); // explicit print the initial BOS token for stylistic symmetry reasons
 
     // setvbuf is used to buffer output into outbuff instead of flushing to screen directly

From 4cf79e83da2f24c8e31ef1bef06efaab8a9c2280 Mon Sep 17 00:00:00 2001
From: Vulcan <93451215+trholding@users.noreply.github.com>
Date: Sun, 6 Aug 2023 02:02:38 +0530
Subject: [PATCH 6/6] Update run.c - changed outbuff char array to static

---
 run.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run.c b/run.c
index 5c604856..957f2f8e 100644
--- a/run.c
+++ b/run.c
@@ -548,7 +548,7 @@ int main(int argc, char *argv[]) {
     int token = 1;   // init with token 1 (=BOS), as done in Llama-2 sentencepiece tokenizer
     int pos = 0;     // position in the sequence
     int bufferflush = 1; // token counter for flushing buffer
-    char outbuff[4096 * (6 + 2)] ; // buffersize is context length * average size of subwords + margin
+    static char outbuff[4096 * (6 + 2)] ; // buffersize is context length * average size of subwords + margin
     printf("<s>\n"); // explicit print the initial BOS token for stylistic symmetry reasons
 
     // setvbuf is used to buffer output into outbuff instead of flushing to screen directly