AI-Hypercomputer · Cjkkkk · Jun 4, 2025 · bvandermoon · Jun 4, 2025 · gobbleturk
@@ -888,7 +888,7 @@ def cudnn_jax_flash_attention(
     if model_mode == MODEL_MODE_AUTOREGRESSIVE:
       lengths = jnp.sum(decoder_segment_ids, axis=-1)
 
-      return dot_product_attention(
+      output, lse = dot_product_attention(
           query,
           key,
           value,
@@ -901,7 +901,7 @@ def cudnn_jax_flash_attention(
           return_residual=True
       )
     else:
-      return dot_product_attention(
+      output, lse = dot_product_attention(
           query,
           key,
           value,
@@ -911,6 +911,9 @@ def cudnn_jax_flash_attention(
           qkv_layout="BTNH",
           return_residual=True
       )
+    output = checkpoint_name(output, "context")
+    lse = checkpoint_name(lse, "context")
+    return output, lse
 
   def compute_local_attention(
       self, attn_weights: Array, value: Array | KVTensor, q_seq_len: int, model_mode: str