Hi, I have been trying to implement a naive transformer using JAX and Flax. I have managed to implement something that looks like it works. The problem is that when I train the model, after 2-3 epochs it starts predicting the same token - "the" - no matter what the input is. I have checked for exploding gradients, parameters not being updated, and all transformer blocks pointing to the same instance (a rough sketch of these checks is included after the training code below), but none of those seems to be the case. The dataset I'm using is WikiText-2-raw.

Here are my functions regarding training:
```python
@nnx.jit
def train_step(model: Transformer, state: optax.OptState, inputs: jax.Array, targets: jax.Array):
    def loss_fn(model: Transformer, inputs: jax.Array, targets: jax.Array):
        logits = model(inputs)
        batch, seq_len, _ = logits.shape
        targets = jnp.reshape(targets, (batch * seq_len))
        logits = jnp.reshape(logits, (batch * seq_len, vocab_size))
        loss_fn = optax.softmax_cross_entropy_with_integer_labels
        loss = loss_fn(logits, targets)
        return jnp.mean(loss)

    loss, grad = nnx.value_and_grad(loss_fn)(model, inputs, targets)
    updates, state = optimizer.update(grad, state)
    params = optax.apply_updates(nnx.state(model, nnx.Param), updates)
    nnx.update(model, params)
    return loss, state


@nnx.jit
def valid_step(model: Transformer, inputs: jax.Array, targets: jax.Array):
    def loss_fn(model: Transformer, inputs: jax.Array, targets: jax.Array):
        logits = model(inputs)
        batch, seq_len, _ = logits.shape
        targets = jnp.reshape(targets, (batch * seq_len))
        logits = jnp.reshape(logits, (batch * seq_len, vocab_size))
        loss_fn = optax.softmax_cross_entropy_with_integer_labels
        loss = loss_fn(logits, targets)
        return jnp.mean(loss)

    loss = loss_fn(model, inputs, targets)
    return loss
```
```python
num_epochs = 20
epoch_loss = []
epoch_val = []
state = init_state

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))

for epoch in range(num_epochs):
    train_loss = []
    model.train()
    for input, target in data_generator(trainset, batch // 2, mngr.next()):
        loss, state = train_step(model=model, state=state, inputs=input, targets=target)
        train_loss.append(float(loss))
    epoch_loss = jnp.mean(jnp.array(train_loss))
    print(f'Training Loss = {epoch_loss} at epoch {epoch+1}')

    input_text = "Before the morning of 1 September had passed , reports coming in to US 2nd Division headquarters made it clear that North Koreans had penetrated to the north @-@ south Changnyong @-@ Yongsan road and cut the division in two ; the 38th and 23d Infantry Regiments with the bulk of the division artillery in the north were separated from the division headquarters and the 9th Infantry Regiment in the south . Keiser decided that this situation made it advisable to control and direct the divided division as two special forces . Accordingly , he placed the division artillery commander , Brigadier General Loyal M. Haynes , in command of the northern group ."
    generate_text(model=model, tokenizer=tokenizer, seq_len=seq_len, input_text=input_text)

    valid_loss = []
    for input, target in data_generator(valset, 1, mngr.next()):
        loss = valid_step(model=model, inputs=input, targets=target)
        valid_loss.append(float(loss))
    epoch_loss = jnp.mean(jnp.array(valid_loss))
    print(f'Validation Loss = {epoch_loss} at epoch {epoch+1}')

    ax1.plot(train_loss, label=f'Epoch {epoch+1}')
    ax2.plot(valid_loss, label=f'Epoch {epoch+1}')

ax1.set_title('Training loss')
ax1.set_ylabel('Loss')
ax1.legend()
ax2.set_title('Validation loss')
ax2.set_ylabel('Loss')
ax2.legend()
plt.tight_layout()
plt.show()
```
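For context, here is a rough sketch of how the sanity checks mentioned at the top can be done. This is illustrative, not the exact cells from my notebook: `model.blocks` is a placeholder for however the blocks are stored in the Transformer, `grad` is the gradient pytree returned by nnx.value_and_grad, and the parameter snapshots are assumed to flatten to JAX arrays under jax.tree_util.

```python
import jax
import jax.numpy as jnp


def global_grad_norm(grad):
    """L2 norm over every leaf of the gradient pytree; should stay bounded."""
    leaves = jax.tree_util.tree_leaves(grad)
    return jnp.sqrt(sum(jnp.sum(jnp.square(g)) for g in leaves))


def max_param_change(params_before, params_after):
    """Largest absolute change across all parameters; should be > 0 after a step."""
    diffs = jax.tree_util.tree_map(lambda a, b: jnp.max(jnp.abs(a - b)),
                                   params_before, params_after)
    return max(float(d) for d in jax.tree_util.tree_leaves(diffs))


def blocks_are_distinct(model):
    """Every transformer block should be a separate Python object, not one shared instance."""
    ids = [id(block) for block in model.blocks]  # `blocks` is a placeholder attribute name
    return len(ids) == len(set(ids))
```

Checks along these lines are what led me to rule out exploding gradients, frozen parameters, and shared block instances.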
Now, the problem is that the model seems to converge to predicting the same token no matter the input (it looks like it is the most frequent token in the dataset, since the token is "the"). I have been trying to figure out why this happens but cannot seem to work it out. I have attached my Jupyter Notebook with the whole code; it contains some other interesting debugging data, like visualizations of some parameter matrices as they are updated during training. The model seems to be updating just fine, yet I cannot understand why training converges to this degenerate behavior. Any help is greatly appreciated 🙏 All cells after saving and restoring the model are my debugging cells.
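For what it's worth, this is roughly how the "most frequent token" hypothesis can be checked. It is an illustrative sketch, not code from the notebook: it assumes `trainset` is a flat sequence of token ids, that `tokenizer.decode` maps a list of ids back to text, and it reuses `data_generator`, `batch`, and `mngr` from the training loop above (with the generator yielding (inputs, targets) pairs).

```python
from collections import Counter

import jax.numpy as jnp

# Which token dominates the corpus? (assumes `trainset` is a flat sequence of token ids)
token_counts = Counter(int(t) for t in trainset)
top_id, top_count = token_counts.most_common(1)[0]
print('Most frequent token in the corpus:', repr(tokenizer.decode([top_id])), top_count)

# What does the trained model predict at the last position of a batch from the generator?
inputs, _ = next(data_generator(trainset, batch // 2, mngr.next()))
logits = model(inputs)                          # (batch, seq_len, vocab_size)
predicted = jnp.argmax(logits[:, -1, :], axis=-1)
print('Predicted next tokens:', [repr(tokenizer.decode([int(i)])) for i in predicted])
```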
The link to the ipynb file - Notebook