 from torch import nn, einsum, Tensor
 import torch.nn.functional as F
 
-from rotary_embedding_torch import RotaryEmbedding
 from beartype import beartype
 
 from collections import namedtuple
@@ -205,38 +204,39 @@ def speculative_decoding(
 
         # do a bunch of slicing and align everything to the right, including kv caches
 
-        max_num_rejected = num_rejected.amax()
-        seq_arange = torch.arange(out.shape[-1], device = device, dtype = torch.long)
-        seq_offset_indices = seq_arange + (max_num_rejected - num_rejected)[..., None]
-
         seq_lens -= num_rejected
         max_seq_len = seq_lens.amax()
+        curr_len = out.shape[-1]
 
-        if batch > 1:
-            out = F.pad(out, (0, max_num_rejected), value = pad_id)
-            out = out[batch_range, seq_offset_indices]
+        seq_arange = torch.arange(max_seq_len, device = device, dtype = torch.long) + (curr_len - max_seq_len)
+        seq_offset_indices = seq_arange - num_rejected[..., None]
 
-            cache = tuple(F.pad(t, (0, 0, 0, max_num_rejected), value = pad_id) for t in cache)
-            small_cache = tuple(F.pad(t, (0, 0, 0, max_num_rejected), value = pad_id) for t in small_cache)
+        cached_kv, _ = cache
+        small_cached_kv, _ = small_cache
 
-            cache = tuple(rearrange(t, 'b ... n d -> b n ... d') for t in cache)
-            small_cache = tuple(rearrange(t, 'b ... n d -> b n ... d') for t in small_cache)
+        if batch > 1:
+            small_cached_kv = F.pad(small_cached_kv, (0, 0, 0, 1))
 
-            cache = tuple(t[batch_range, seq_offset_indices] for t in cache)
-            small_cache = tuple(t[batch_range, seq_offset_indices] for t in small_cache)
+            out = out[batch_range, seq_offset_indices]
 
-            cache = tuple(rearrange(t, 'b n ... d -> b ... n d') for t in cache)
-            small_cache = tuple(rearrange(t, 'b n ... d -> b ... n d') for t in small_cache)
+            cached_kv = rearrange(cached_kv, 'b ... n d -> b n ... d')
+            small_cached_kv = rearrange(small_cached_kv, 'b ... n d -> b n ... d')
 
-            if out.shape[-1] > max_seq_len:
-                left_index = out.shape[-1] - max_seq_len
-                out = out[:, left_index:]
-                cache = tuple(t[..., left_index:, :] for t in cache)
-                small_cache = tuple(t[..., left_index:, :] for t in small_cache)
+            cached_kv = cached_kv[batch_range, seq_offset_indices]
+            small_cached_kv = small_cached_kv[batch_range, seq_offset_indices]
 
+            cached_kv = rearrange(cached_kv, 'b n ... d -> b ... n d')
+            small_cached_kv = rearrange(small_cached_kv, 'b n ... d -> b ... n d')
+
+            small_cached_kv = small_cached_kv[..., :-1, :]
         else:
-            # if batch size of 1, just slice to be equal to the lone int in seq_lens
-            out = out[..., :seq_lens.item()]
+            # if batch size of 1, just slice to max_seq_len
+            out = out[..., :max_seq_len]
+            cached_kv = cached_kv[..., :max_seq_len, :]
+            small_cached_kv = small_cached_kv[..., :max_seq_len, :]
+
+        cache = (cached_kv, None)
+        small_cache = (small_cached_kv, None)
 
         # sample the additional token, one of the tricks in the paper to better bound the worst case
 
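The hunk above replaces the old pad-then-truncate approach with a single right-aligned gather: one arange anchored at the right edge of `out`, shifted left per row by that row's `num_rejected`. A minimal standalone sketch of the gather, with toy tensors whose shapes and values are illustrative assumptions rather than the repo's:

import torch

# toy batch: two rows of token ids, with a different number of rejected draft tokens each
out = torch.tensor([
    [10, 11, 12, 13, 14, 15],
    [20, 21, 22, 23, 24, 25],
])
num_rejected = torch.tensor([1, 3])
seq_lens = out.shape[-1] - num_rejected      # kept length per row -> [5, 3]
max_seq_len = seq_lens.amax()                # 5
curr_len = out.shape[-1]                     # 6

batch_range = torch.arange(out.shape[0])[..., None]

# same indexing as the new code: one shared arange anchored at the right edge,
# shifted left per row by that row's number of rejections
seq_arange = torch.arange(max_seq_len) + (curr_len - max_seq_len)
seq_offset_indices = seq_arange - num_rejected[..., None]

aligned = out[batch_range, seq_offset_indices]
print(aligned)
# tensor([[10, 11, 12, 13, 14],
#         [24, 25, 20, 21, 22]])
# each row's accepted tokens end flush at the right edge; rows with more rejections
# index negatively and wrap, so their leading slots hold stale values beyond that row's seq_len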
@@ -364,37 +364,38 @@ def speculative_decoding_with_same_model(
 
         # do a bunch of slicing and align everything to the right, including kv caches
 
-        max_num_rejected = num_rejected.amax()
-        seq_arange = torch.arange(out.shape[-1], device = device, dtype = torch.long)
-        seq_offset_indices = seq_arange + (max_num_rejected - num_rejected)[..., None]
-
        seq_lens -= num_rejected
         max_seq_len = seq_lens.amax()
+        curr_len = out.shape[-1]
+
+        seq_arange = torch.arange(max_seq_len, device = device, dtype = torch.long) + (curr_len - max_seq_len)
+        seq_offset_indices = seq_arange - num_rejected[..., None]
+
+        cached_kv, _ = cache
+        small_cached_kv, _ = small_cache
 
         if batch > 1:
-            out = F.pad(out, (0, max_num_rejected), value = pad_id)
+            small_cached_kv = F.pad(small_cached_kv, (0, 0, 0, 1))
             out = out[batch_range, seq_offset_indices]
 
-            cache = tuple(F.pad(t, (0, 0, 0, max_num_rejected), value = pad_id) for t in cache)
-            small_cache = tuple(F.pad(t, (0, 0, 0, max_num_rejected), value = pad_id) for t in small_cache)
-
-            cache = tuple(rearrange(t, 'b ... n d -> b n ... d') for t in cache)
-            small_cache = tuple(rearrange(t, 'b ... n d -> b n ... d') for t in small_cache)
+            cached_kv = rearrange(cached_kv, 'b ... n d -> b n ... d')
+            small_cached_kv = rearrange(small_cached_kv, 'b ... n d -> b n ... d')
 
-            cache = tuple(t[batch_range, seq_offset_indices] for t in cache)
-            small_cache = tuple(t[batch_range, seq_offset_indices] for t in small_cache)
+            cached_kv = cached_kv[batch_range, seq_offset_indices]
+            small_cached_kv = small_cached_kv[batch_range, seq_offset_indices]
 
-            cache = tuple(rearrange(t, 'b n ... d -> b ... n d') for t in cache)
-            small_cache = tuple(rearrange(t, 'b n ... d -> b ... n d') for t in small_cache)
+            cached_kv = rearrange(cached_kv, 'b n ... d -> b ... n d')
+            small_cached_kv = rearrange(small_cached_kv, 'b n ... d -> b ... n d')
 
-            if out.shape[-1] > max_seq_len:
-                left_index = out.shape[-1] - max_seq_len
-                out = out[:, left_index:]
-                cache = tuple(t[..., left_index:, :] for t in cache)
-                small_cache = tuple(t[..., left_index:, :] for t in small_cache)
+            small_cached_kv = small_cached_kv[..., :-1, :]
         else:
-            # if batch size of 1, just slice to be equal to the lone int in seq_lens
-            out = out[..., :seq_lens.item()]
+            # if batch size of 1, just slice to max_seq_len
+            out = out[..., :max_seq_len]
+            cached_kv = cached_kv[..., :max_seq_len, :]
+            small_cached_kv = small_cached_kv[..., :max_seq_len, :]
+
+        cache = (cached_kv, None)
+        small_cache = (small_cached_kv, None)
 
         # sample the additional token, one of the tricks in the paper to better bound the worst case
 
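The context line about sampling the additional token refers to the acceptance and resampling rules of the speculative decoding paper: accept a drafted token with probability min(1, p/q), resample a rejection from the renormalized residual max(0, p - q), and, when the whole drafted block is accepted, still draw one extra token from the large model's distribution. A hedged sketch of those rules; the function and variable names below are mine, not the repo's:

import torch

def accept_or_resample(p_large, p_small, drafted):
    # p_large, p_small: (vocab,) probabilities from the large and small model at one position
    # drafted: token id proposed by the small model
    accept_prob = (p_large[drafted] / p_small[drafted]).clamp(max = 1.)

    if torch.rand(()) < accept_prob:
        return drafted, True

    # on rejection, sample the replacement from the renormalized residual max(0, p - q)
    residual = (p_large - p_small).clamp(min = 0.)
    residual = residual / residual.sum()
    return torch.multinomial(residual, 1).squeeze(), False

# toy distributions over a 5-token vocabulary (illustrative numbers only)
p_large = torch.tensor([0.1, 0.4, 0.2, 0.2, 0.1])
p_small = torch.tensor([0.3, 0.1, 0.3, 0.2, 0.1])

token, accepted = accept_or_resample(p_large, p_small, drafted = torch.tensor(1))

# if the whole drafted block is accepted, one additional token is drawn from the large
# model's distribution at the position after the block, so each verification round
# always yields at least one token from the large model
extra = torch.multinomial(p_large, 1)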
@@ -414,17 +415,6 @@ def speculative_decoding_with_same_model(
 
     return out[..., prompt_seq_len:], total_accepted / num_steps
 
-# norm
-
-class RMSNorm(Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.scale = dim ** 0.5
-        self.gamma = nn.Parameter(torch.ones(dim))
-
-    def forward(self, x):
-        return F.normalize(x, dim = -1) * self.scale * self.gamma
-
 # attention and feedforward
 
 class CausalAttention(Module):
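The RMSNorm class deleted above (l2-normalize, rescale by sqrt(dim), learned gain) computes the same normalization as the built-in `nn.RMSNorm` that the following hunks switch to, available in PyTorch 2.4 and later, up to epsilon handling. A quick equivalence check, with illustrative shapes:

import torch
from torch import nn
import torch.nn.functional as F

dim = 512
x = torch.randn(2, 16, dim)

# the deleted hand-rolled norm: x / ||x||_2 * sqrt(dim) * gamma == x / rms(x) * gamma
gamma = torch.ones(dim)
manual = F.normalize(x, dim = -1) * (dim ** 0.5) * gamma

# the built-in replacement (requires PyTorch >= 2.4), weight initialized to ones
builtin = nn.RMSNorm(dim)(x)

print(torch.allclose(manual, builtin, atol = 1e-5))  # expected True, up to epsilon differences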
@@ -440,7 +430,7 @@ def __init__(
         self.heads = heads
         dim_inner = dim_head * heads
 
-        self.norm = RMSNorm(dim)
+        self.norm = nn.RMSNorm(dim)
 
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias = False)
         self.to_out = nn.Linear(dim_inner, dim, bias = False)
@@ -492,7 +482,7 @@ def forward(
 def FeedForward(dim, mult = 4):
     dim_inner = dim * mult
     return nn.Sequential(
-        RMSNorm(dim),
+        nn.RMSNorm(dim),
         nn.Linear(dim, dim_inner),
         nn.GELU(),
         nn.Linear(dim_inner, dim)
@@ -529,7 +519,7 @@ def __init__(
             ]))
 
         self.to_logits = nn.Sequential(
-            RMSNorm(dim),
+            nn.RMSNorm(dim),
            nn.Linear(dim, num_tokens, bias = False)
         )
 