
Commit 81eda38

Improve KV cache code for torch.compile (#705)
* Improve KV cache code for torch.compile
* cleanup
* cleanup
1 parent 6522be9 · commit 81eda38

File tree

8 files changed, +595 −317 lines

8 files changed

+595
-317
lines changed

ch04/03_kv-cache/gpt_ch04.py

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=Fal
         self.dropout = nn.Dropout(dropout)
         self.register_buffer(
             "mask",
-            torch.triu(torch.ones(context_length, context_length),diagonal=1),
+            torch.triu(torch.ones(context_length, context_length), diagonal=1),
             persistent=False
         )

ch05/07_gpt_to_llama/README.md

Lines changed: 6 additions & 6 deletions
@@ -236,14 +236,14 @@ token_ids = generate_text_simple(
 )
 ```

-Note that the peak memory usage is only listed for Nvidia CUDA devices, as it is easier to calculate. However, the memory usage on other devices is likely similar as it uses a similar precision format, and the KV cache storage dominates here for the generated 150-token text (however, different devices may implement matrix multiplication differently and may result in different peak memory requirements).
+Note that the peak memory usage is only listed for Nvidia CUDA devices, as it is easier to calculate. However, the memory usage on other devices is likely similar as it uses a similar precision format, and the KV cache storage results in even lower memory usage here for the generated 150-token text (however, different devices may implement matrix multiplication differently and may result in different peak memory requirements; and KV-cache memory may increase prohibitively for longer context lengths).

 | Model       | Mode              | Hardware        | Tokens/sec | GPU Memory (VRAM) |
-|-------------|-------------------|-----------------|------------|-------------------|
+| ----------- | ----------------- | --------------- | ---------- | ----------------- |
 | Llama3Model | Regular           | Mac Mini M4 CPU | 1          | -                 |
 | Llama3Model | Regular compiled  | Mac Mini M4 CPU | -          | -                 |
-| Llama3Model | KV cache          | Mac Mini M4 CPU | 62         | -                 |
-| Llama3Model | KV cache compiled | Mac Mini M4 CPU | -          | -                 |
+| Llama3Model | KV cache          | Mac Mini M4 CPU | 68         | -                 |
+| Llama3Model | KV cache compiled | Mac Mini M4 CPU | 86         | -                 |
 |             |                   |                 |            |                   |
 | Llama3Model | Regular           | Mac Mini M4 GPU | 15         | -                 |
 | Llama3Model | Regular compiled  | Mac Mini M4 GPU | -          | -                 |
@@ -252,7 +252,7 @@ Note that the peak memory usage is only listed for Nvidia CUDA devices, as it is
 |             |                   |                 |            |                   |
 | Llama3Model | Regular           | Nvidia A100 GPU | 42         | 2.91 GB           |
 | Llama3Model | Regular compiled  | Nvidia A100 GPU | 170        | 3.12 GB           |
-| Llama3Model | KV cache          | Nvidia A100 GPU | 60         | 18.87 GB          |
-| Llama3Model | KV cache compiled | Nvidia A100 GPU | 59         | 19.12 GB          |
+| Llama3Model | KV cache          | Nvidia A100 GPU | 58         | 2.87 GB           |
+| Llama3Model | KV cache compiled | Nvidia A100 GPU | 161        | 3.61 GB           |

 Note that all settings above have been tested to produce the same text outputs.
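
As a rough intuition for the updated README remark about KV-cache memory and context length: each layer stores one key and one value vector per token and per KV head, so the cache grows linearly with the generated sequence. The snippet below is only an illustration with made-up dimensions, not the actual Llama 3 configuration used in this table:

```python
# Back-of-the-envelope KV-cache size estimate with illustrative (made-up) numbers;
# the real Llama 3 configuration differs.
n_layers = 16
n_kv_heads = 8
head_dim = 64
bytes_per_value = 2  # e.g., bfloat16


def kv_cache_bytes(seq_len):
    # 2x for keys and values, per layer, per KV head, per cached token
    return 2 * n_layers * n_kv_heads * head_dim * seq_len * bytes_per_value


print(f"{kv_cache_bytes(150) / 1e6:.1f} MB for 150 tokens")
print(f"{kv_cache_bytes(131_072) / 1e9:.1f} GB for a 131k-token context")
```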

ch05/11_qwen3/README.md

Lines changed: 6 additions & 6 deletions
@@ -209,23 +209,23 @@ token_ids = generate_text_simple(
 )
 ```

-Note that the peak memory usage is only listed for Nvidia CUDA devices, as it is easier to calculate. However, the memory usage on other devices is likely similar as it uses a similar precision format, and the KV cache storage dominates here for the generated 150-token text (however, different devices may implement matrix multiplication differently and may result in different peak memory requirements).
+Note that the peak memory usage is only listed for Nvidia CUDA devices, as it is easier to calculate. However, the memory usage on other devices is likely similar as it uses a similar precision format, and the KV cache storage results in even lower memory usage here for the generated 150-token text (however, different devices may implement matrix multiplication differently and may result in different peak memory requirements; and KV-cache memory may increase prohibitively for longer context lengths).

 | Model      | Mode              | Hardware        | Tokens/sec | GPU Memory (VRAM) |
 | ---------- | ----------------- | --------------- | ---------- | ----------------- |
 | Qwen3Model | Regular           | Mac Mini M4 CPU | 1          | -                 |
 | Qwen3Model | Regular compiled  | Mac Mini M4 CPU | 1          | -                 |
 | Qwen3Model | KV cache          | Mac Mini M4 CPU | 80         | -                 |
-| Qwen3Model | KV cache compiled | Mac Mini M4 CPU | 82         | -                 |
+| Qwen3Model | KV cache compiled | Mac Mini M4 CPU | 137        | -                 |
 |            |                   |                 |            |                   |
 | Qwen3Model | Regular           | Mac Mini M4 GPU | 21         | -                 |
 | Qwen3Model | Regular compiled  | Mac Mini M4 GPU | Error      | -                 |
-| Qwen3Model | KV cache          | Mac Mini M4 GPU | 32         | -                 |
+| Qwen3Model | KV cache          | Mac Mini M4 GPU | 28         | -                 |
 | Qwen3Model | KV cache compiled | Mac Mini M4 GPU | Error      | -                 |
 |            |                   |                 |            |                   |
-| Qwen3Model | Regular           | Nvidia A100 GPU | 25         | 1.49 GB           |
+| Qwen3Model | Regular           | Nvidia A100 GPU | 26         | 1.49 GB           |
 | Qwen3Model | Regular compiled  | Nvidia A100 GPU | 107        | 1.99 GB           |
-| Qwen3Model | KV cache          | Nvidia A100 GPU | 25         | 10.20 GB          |
-| Qwen3Model | KV cache compiled | Nvidia A100 GPU | 24         | 10.61 GB          |
+| Qwen3Model | KV cache          | Nvidia A100 GPU | 25         | 1.47 GB           |
+| Qwen3Model | KV cache compiled | Nvidia A100 GPU | 90         | 1.48 GB           |

 Note that all settings above have been tested to produce the same text outputs.
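
The "compiled" rows in both benchmark tables refer to wrapping the model with `torch.compile` before generation. The toy snippet below only sketches the general pattern (compile once, exclude the first warm-up call from timing); it is not the repository's benchmark script, and the stand-in module is made up:

```python
import time

import torch
import torch.nn as nn

# Toy stand-in model; the real benchmarks compile Qwen3Model / Llama3Model.
toy = nn.Sequential(nn.Linear(64, 256), nn.GELU(), nn.Linear(256, 64))
toy_compiled = torch.compile(toy)

x = torch.randn(1, 64)
_ = toy_compiled(x)  # warm-up: the first call triggers compilation

start = time.time()
for _ in range(100):
    _ = toy_compiled(x)
print(f"{100 / (time.time() - start):.1f} iterations/sec")
```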

pkg/llms_from_scratch/kv_cache/generate.py

Lines changed: 4 additions & 3 deletions
@@ -3,23 +3,24 @@
 # - https://www.manning.com/books/build-a-large-language-model-from-scratch
 # Code: https://github.com/rasbt/LLMs-from-scratch

+from .utils import KVCache
 import torch


 def generate_text_simple(model, idx, max_new_tokens, context_size=None, use_cache=True):
     model.eval()
-
     ctx_len = context_size or model.cfg["context_length"]
+    cache = KVCache(n_layers=model.cfg["n_layers"]) if use_cache else None

     with torch.no_grad():
         if use_cache:
             model.reset_kv_cache()
-            logits = model(idx[:, -ctx_len:], use_cache=True)
+            logits = model(idx[:, -ctx_len:], use_cache=True, cache=cache)

             for _ in range(max_new_tokens):
                 next_idx = logits[:, -1].argmax(dim=-1, keepdim=True)
                 idx = torch.cat([idx, next_idx], dim=1)
-                logits = model(next_idx, use_cache=True)
+                logits = model(next_idx, use_cache=True, cache=cache)
         else:
             for _ in range(max_new_tokens):
                 logits = model(idx[:, -ctx_len:], use_cache=False)
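
The `KVCache` container itself lives in `pkg/llms_from_scratch/kv_cache/utils.py`, which is not shown in this diff view. Based purely on how it is used in this commit (constructed with `n_layers`, read with `get(i)`, and written with `update(i, new_cache)`), a minimal sketch might look as follows; the actual implementation in the repository may differ:

```python
# Hypothetical sketch of the KVCache container used above; the real class is
# defined in pkg/llms_from_scratch/kv_cache/utils.py and may differ in detail.
class KVCache:
    def __init__(self, n_layers):
        # One (keys, values) slot per transformer block, initially empty
        self.cache = [None] * n_layers

    def get(self, layer_idx):
        # Returns the cached (keys, values) tuple for this layer, or None
        return self.cache[layer_idx]

    def update(self, layer_idx, value):
        # Stores the latest (keys, values) tuple produced by this layer
        self.cache[layer_idx] = value
```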

pkg/llms_from_scratch/kv_cache/gpt2.py

Lines changed: 43 additions & 140 deletions
@@ -3,6 +3,8 @@
 # - https://www.manning.com/books/build-a-large-language-model-from-scratch
 # Code: https://github.com/rasbt/LLMs-from-scratch

+from .utils import KVCache  # noqa: F401
+
 import torch
 import torch.nn as nn

@@ -11,7 +13,7 @@
 # Chapter 3
 #####################################
 class MultiHeadAttention(nn.Module):
-    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False, max_seq_len=None, window_size=None):
+    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
         super().__init__()
         assert d_out % num_heads == 0, "d_out must be divisible by num_heads"

@@ -25,80 +27,41 @@ def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=Fal
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)

-        ####################################################
-        # NEW
-        self.max_seq_len = max_seq_len or context_length
-        self.window_size = window_size or self.max_seq_len
-        self.register_buffer("cache_k", None, persistent=False)
-        self.register_buffer("cache_v", None, persistent=False)
-        ####################################################
-
-    def forward(self, x, use_cache=False):
+    def forward(self, x, use_cache=False, start_pos=0, cache=None):
         b, num_tokens, d_in = x.shape

-        keys_new = self.W_key(x)  # Shape: (b, num_tokens, d_out)
-        values_new = self.W_value(x)
+        keys = self.W_key(x)  # Shape: (b, num_tokens, d_out)
+        values = self.W_value(x)
         queries = self.W_query(x)

         # We implicitly split the matrix by adding a `num_heads` dimension
         # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
-        keys_new = keys_new.view(b, num_tokens, self.num_heads, self.head_dim)
-        values_new = values_new.view(b, num_tokens, self.num_heads, self.head_dim)
+        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
+        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
         queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

         # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
-        keys_new = keys_new.transpose(1, 2)
-        values_new = values_new.transpose(1, 2)
+        keys = keys.transpose(1, 2)
         queries = queries.transpose(1, 2)
+        values = values.transpose(1, 2)

-        ####################################################
-        # NEW
         if use_cache:
-            if self.cache_k is None or self.cache_k.size(0) != b:
-                self.cache_k = torch.zeros(b, self.num_heads,
-                                           self.window_size, self.head_dim,
-                                           device=x.device)
-                self.cache_v = torch.zeros_like(self.cache_k)
-                self.ptr_cur = 0  # pointer to next free slot
-
-            # if incoming chunk would overflow discard oldest tokens
-            if self.ptr_cur + num_tokens > self.window_size:
-                overflow = self.ptr_cur + num_tokens - self.window_size
-                # shift everything left by `overflow` (cheap view-copy)
-                self.cache_k[:, :, :-overflow, :] = self.cache_k[:, :, overflow:, :].clone()
-                self.cache_v[:, :, :-overflow, :] = self.cache_v[:, :, overflow:, :].clone()
-                self.ptr_cur -= overflow  # pointer after shift
-
-            self.cache_k[:, :, self.ptr_cur:self.ptr_cur + num_tokens, :] = keys_new
-            self.cache_v[:, :, self.ptr_cur:self.ptr_cur + num_tokens, :] = values_new
-            self.ptr_cur += num_tokens
-
-            keys = self.cache_k[:, :, :self.ptr_cur, :]
-            values = self.cache_v[:, :, :self.ptr_cur, :]
+            if cache is not None:
+                keys = torch.cat([cache[0], keys], dim=2)
+                values = torch.cat([cache[1], values], dim=2)
+            next_cache = (keys, values)
         else:
-            keys, values = keys_new, values_new
-            self.ptr_cur = 0  # keep pointer sane if you interleave modes
-        ####################################################
-        # Compute scaled dot-product attention (aka self-attention) with a causal mask
-        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head
+            next_cache = None

-        ####################################################
-        # NEW
-        K = attn_scores.size(-1)
+        seq_len = keys.size(2)
+        causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device), diagonal=1)
+        causal_mask = causal_mask[-num_tokens:, :][None, None, :, :]

-        if num_tokens == K:
-            # No cache → use the pre‑baked triangular mask slice
-            causal_mask = torch.triu(torch.ones(num_tokens, K, device=x.device, dtype=torch.bool), diagonal=1)
-        else:
-            # Cached: need to offset the diagonal by (K − num_tokens)
-            offset = K - num_tokens  # number of tokens already in cache before this chunk
-            row_idx = torch.arange(num_tokens, device=x.device).unsqueeze(1)  # (num_tokens, 1)
-            col_idx = torch.arange(K, device=x.device).unsqueeze(0)  # (1, K)
-            causal_mask = row_idx + offset < col_idx  # True where j > i+offset
-        ####################################################
+        # Compute scaled dot-product attention (aka self-attention) with a causal mask
+        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

         # Use the mask to fill attention scores
-        attn_scores.masked_fill_(causal_mask.unsqueeze(0).unsqueeze(0), -torch.inf)
+        attn_scores.masked_fill_(causal_mask, -torch.inf)

         attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
         attn_weights = self.dropout(attn_weights)
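
The new mask construction builds the full `(seq_len, seq_len)` triangular mask over all cached-plus-new key positions and then keeps only the rows that belong to the new query tokens, so the same static shape logic works for both prefill and single-token decoding. A toy illustration of that slicing, with made-up sizes (one new query token against five key positions):

```python
import torch

seq_len, num_tokens = 5, 1  # five keys in total, one new query token
full = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
causal_mask = full[-num_tokens:, :][None, None, :, :]  # shape (1, 1, 1, 5)

# The last row of the triangular mask is all False, so the newest query token
# may attend to every key position, including itself:
print(causal_mask)
# tensor([[[[False, False, False, False, False]]]])
```
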
@@ -110,13 +73,7 @@ def forward(self, x, use_cache=False)
         context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
         context_vec = self.out_proj(context_vec)  # optional projection

-        return context_vec
-
-    ####################################################
-    # NEW
-    def reset_cache(self):
-        self.cache_k, self.cache_v = None, None
-    ####################################################
+        return context_vec, next_cache


 #####################################
@@ -169,25 +126,17 @@ def __init__(self, cfg):
             context_length=cfg["context_length"],
             num_heads=cfg["n_heads"],
             dropout=cfg["drop_rate"],
-            qkv_bias=cfg["qkv_bias"],
-            window_size=cfg["kv_window_size"] if "kv_window_size" in cfg else cfg["context_length"]  # NEW
-        )
+            qkv_bias=cfg["qkv_bias"])
         self.ff = FeedForward(cfg)
         self.norm1 = LayerNorm(cfg["emb_dim"])
         self.norm2 = LayerNorm(cfg["emb_dim"])
         self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

-    def forward(self, x, use_cache=False):
+    def forward(self, x, use_cache=False, start_pos=0, cache=None):
         # Shortcut connection for attention block
         shortcut = x
         x = self.norm1(x)
-
-        # x = self.att(x)  # Shape [batch_size, num_tokens, emb_size]
-        ####################################################
-        # NEW
-        x = self.att(x, use_cache=use_cache)
-        ####################################################
-
+        x, next_cache = self.att(x, use_cache=use_cache, start_pos=start_pos, cache=cache)  # Shape [batch_size, num_tokens, emb_size]
         x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

@@ -198,7 +147,7 @@ def forward(self, x, use_cache=False):
         x = self.drop_shortcut(x)
         x = x + shortcut  # Add the original input back

-        return x
+        return x, next_cache


 class GPTModel(nn.Module):
@@ -208,80 +157,34 @@ def __init__(self, cfg):
         self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
         self.drop_emb = nn.Dropout(cfg["drop_rate"])

-        # self.trf_blocks = nn.Sequential(
-        #    *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
-        ####################################################
-        # NEW
-        self.trf_blocks = nn.ModuleList(
-            [TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
-
-        self.ptr_current_pos = 0
-        ####################################################
+        self.trf_blocks = nn.Sequential(
+            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

         self.final_norm = LayerNorm(cfg["emb_dim"])
         self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)
+        self.current_pos = 0

-    def forward(self, in_idx, use_cache=False):
+    def forward(self, in_idx, use_cache=False, cache=None):
         batch_size, seq_len = in_idx.shape
+        pos = torch.arange(self.current_pos, self.current_pos + seq_len, device=in_idx.device)
         tok_embeds = self.tok_emb(in_idx)
-
-        # pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
-
-        ####################################################
-        # NEW
+        pos_embeds = self.pos_emb(pos)
+        x = self.drop_emb(tok_embeds + pos_embeds)

         if use_cache:
-            pos_ids = torch.arange(self.ptr_current_pos, self.ptr_current_pos + seq_len, device=in_idx.device, dtype=torch.long)
-            self.ptr_current_pos += seq_len
+            start_pos = self.current_pos
+            self.current_pos += seq_len
         else:
-            pos_ids = torch.arange(0, seq_len, device=in_idx.device, dtype=torch.long)
-        pos_embeds = self.pos_emb(pos_ids).unsqueeze(0)
-        ####################################################
-
-        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
-        x = self.drop_emb(x)
+            start_pos = 0

-        # x = self.trf_blocks(x)
-        ####################################################
-        # NEW
-        for blk in self.trf_blocks:
-            x = blk(x, use_cache=use_cache)
-        ####################################################
+        next_cache = []
+        for i, block in enumerate(self.trf_blocks):
+            blk_cache = cache.get(i) if cache else None
+            x, new_cache = block(x, use_cache=use_cache, start_pos=start_pos, cache=blk_cache)
+            if cache:
+                cache.update(i, new_cache)
+            next_cache.append(new_cache)

         x = self.final_norm(x)
         logits = self.out_head(x)
         return logits
-
-    ####################################################
-    # NEW
-    def reset_kv_cache(self):
-        for blk in self.trf_blocks:
-            blk.att.reset_cache()
-        self.ptr_current_pos = 0
-    ####################################################
-
-
-def generate_text_simple(model, idx, max_new_tokens, context_size):
-    # idx is (B, T) array of indices in the current context
-    for _ in range(max_new_tokens):
-
-        # Crop current context if it exceeds the supported context size
-        # E.g., if LLM supports only 5 tokens, and the context size is 10
-        # then only the last 5 tokens are used as context
-        idx_cond = idx[:, -context_size:]
-
-        # Get the predictions
-        with torch.no_grad():
-            logits = model(idx_cond)
-
-        # Focus only on the last time step
-        # (batch, n_token, vocab_size) becomes (batch, vocab_size)
-        logits = logits[:, -1, :]
-
-        # Get the idx of the vocab entry with the highest logits value
-        idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch, 1)
-
-        # Append sampled index to the running sequence
-        idx = torch.cat((idx, idx_next), dim=1)  # (batch, n_tokens+1)
-
-    return idx
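
One detail worth noting: `generate_text_simple` above still calls `model.reset_kv_cache()`, while this diff removes the old per-layer `reset_cache` machinery from the attention module. With the new design, the cache tensors live in the externally created `KVCache` object, so the only model-side state left to rewind is `self.current_pos`; a matching reset method is not shown in this diff view and may be defined elsewhere, but it could be as small as `def reset_kv_cache(self): self.current_pos = 0`. The snippet below is a rough end-to-end usage sketch with a deliberately tiny made-up config; it assumes the `pkg/` directory maps to the installable `llms_from_scratch` package and that `GPTModel` exposes its config dict as `model.cfg`, which `generate_text_simple` reads:

```python
import torch

# Assumed import paths, based on the pkg/llms_from_scratch/ layout in this repo
from llms_from_scratch.kv_cache.gpt2 import GPTModel
from llms_from_scratch.kv_cache.generate import generate_text_simple

# Made-up toy config for illustration only; the real GPT-2 configs are larger.
cfg = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 128,
    "n_heads": 4,
    "n_layers": 2,
    "drop_rate": 0.0,
    "qkv_bias": False,
}

torch.manual_seed(123)
model = GPTModel(cfg)

prompt = torch.tensor([[1, 2, 3]])  # dummy token IDs, shape (batch=1, 3)
out = generate_text_simple(model, prompt, max_new_tokens=10, use_cache=True)
print(out.shape)  # expected: torch.Size([1, 13]) -> 3 prompt tokens + 10 generated
```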
