Update cudnn-frontend to 1.0.3 to fix cuDNN v9 SDPA NaNs (#650)
* Update cudnn-frontend to 1.0.3 to fix cuDNN v9 NaNs

Signed-off-by: Charlene Yang <[email protected]>

* Make d_out contiguous in the backward pass

Signed-off-by: Charlene Yang <[email protected]>

* Remove the cudnnDestroy call and let PyTorch manage the cuDNN handle

Signed-off-by: Charlene Yang <[email protected]>

* Update transformer_engine/pytorch/attention.py

Co-authored-by: Tim Moon <[email protected]>
Signed-off-by: cyanguwa <[email protected]>

* Update transformer_engine/pytorch/attention.py

Co-authored-by: Tim Moon <[email protected]>
Signed-off-by: cyanguwa <[email protected]>

* Update transformer_engine/pytorch/attention.py

Co-authored-by: Tim Moon <[email protected]>
Signed-off-by: cyanguwa <[email protected]>

---------

Signed-off-by: Charlene Yang <[email protected]>
Signed-off-by: cyanguwa <[email protected]>
Co-authored-by: Tim Moon <[email protected]>
2 people authored and ptrendx committed Feb 3, 2024
1 parent df9c29e commit 5b90b7f
Showing 3 changed files with 4 additions and 6 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/cudnn-frontend
Submodule cudnn-frontend updated 49 files
+1 −1 CMakeLists.txt
+5 −0 README.FE.1.0.md
+7 −1 README.md
+0 −2 include/cudnn_backend_base.h
+3 −1 include/cudnn_frontend.h
+3 −3 include/cudnn_frontend/cudnn_interface.h
+4 −2 include/cudnn_frontend/node/batchnorm.h
+5 −2 include/cudnn_frontend/node/batchnorm_inference.h
+4 −2 include/cudnn_frontend/node/bn_finalize.h
+4 −2 include/cudnn_frontend/node/conv_dgrad.h
+5 −2 include/cudnn_frontend/node/conv_fprop.h
+4 −2 include/cudnn_frontend/node/conv_wgrad.h
+4 −2 include/cudnn_frontend/node/dbn.h
+5 −2 include/cudnn_frontend/node/dbn_weight.h
+4 −2 include/cudnn_frontend/node/dln.h
+4 −2 include/cudnn_frontend/node/genstats.h
+8 −4 include/cudnn_frontend/node/instancenorm.h
+5 −2 include/cudnn_frontend/node/layernorm.h
+5 −2 include/cudnn_frontend/node/matmul.h
+4 −2 include/cudnn_frontend/node/pointwise.h
+4 −2 include/cudnn_frontend/node/reduction.h
+5 −2 include/cudnn_frontend/node/reshape.h
+9 −4 include/cudnn_frontend/node/rmsnorm.h
+4 −2 include/cudnn_frontend/node/rng.h
+119 −4 include/cudnn_frontend/node/scaled_dot_product_flash_attention.h
+27 −4 include/cudnn_frontend/node_interface.h
+0 −3 include/cudnn_frontend_ConvDesc.h
+0 −3 include/cudnn_frontend_Engine.h
+0 −3 include/cudnn_frontend_EngineConfig.h
+0 −1 include/cudnn_frontend_EngineFallbackList.h
+0 −3 include/cudnn_frontend_ExecutionPlan.h
+0 −2 include/cudnn_frontend_Filters.h
+0 −3 include/cudnn_frontend_Heuristics.h
+0 −3 include/cudnn_frontend_MatMulDesc.h
+0 −3 include/cudnn_frontend_Operation.h
+0 −3 include/cudnn_frontend_OperationGraph.h
+0 −3 include/cudnn_frontend_PointWiseDesc.h
+0 −3 include/cudnn_frontend_ReductionDesc.h
+0 −3 include/cudnn_frontend_Resample.h
+0 −3 include/cudnn_frontend_Rng.h
+0 −3 include/cudnn_frontend_VariantPack.h
+9 −5 python_bindings/properties.cpp
+0 −3 samples/legacy_samples/conv_sample.h
+26 −18 samples/legacy_samples/cpu_references.h
+1 −1 samples/legacy_samples/norm_samples.cpp
+9 −9 samples/legacy_samples/test_list.cpp
+4 −2 samples/python/test_conv_bias.py
+21 −9 samples/python/test_mhas.py
+1 −1 setup.py
5 changes: 0 additions & 5 deletions transformer_engine/common/fused_attn/utils.h
@@ -152,11 +152,6 @@ class cudnnExecutionPlanManager {
   }
 
   ~cudnnExecutionPlanManager() {
-    static thread_local std::once_flag flag;
-    std::call_once(flag, [&] {
-      if (handle_ != nullptr) {
-        cudnnDestroy(handle_);
-      }});
   }
 
  private:
3 changes: 3 additions & 0 deletions transformer_engine/pytorch/attention.py
@@ -1733,6 +1733,7 @@ def forward(ctx, is_training, max_seqlen, cu_seqlens, qkv, qkv_dtype, attn_bias,
 
     @staticmethod
     def backward(ctx, d_out):
+        d_out = d_out.contiguous()
         qkv, out, cu_seqlens = ctx.saved_tensors
         if not ctx.aux_ctx_tensors[0].is_contiguous():
             ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous()
@@ -1802,6 +1803,7 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql
 
     @staticmethod
     def backward(ctx, d_out):
+        d_out = d_out.contiguous()
         q, kv, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors
         if not ctx.aux_ctx_tensors[0].is_contiguous():
             ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous()
@@ -1883,6 +1885,7 @@ def forward(ctx, is_training, max_seqlen_q, max_seqlen_kv, cu_seqlens_q, cu_seql
 
     @staticmethod
     def backward(ctx, d_out):
+        d_out = d_out.contiguous()
         q, k, v, out, cu_seqlens_q, cu_seqlens_kv = ctx.saved_tensors
         if not ctx.aux_ctx_tensors[0].is_contiguous():
             ctx.aux_ctx_tensors[0] = ctx.aux_ctx_tensors[0].contiguous()
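All three backward methods apply the same one-line guard before handing d_out to the cuDNN fused-attention backward kernel, which expects a dense, contiguous gradient buffer. A minimal sketch of the failure mode in plain PyTorch, not this repository's FusedAttnFunc classes (the Square op below is purely illustrative):

import torch

class Square(torch.autograd.Function):
    # Stand-in for a fused op whose backward assumes contiguous gradients.
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * x

    @staticmethod
    def backward(ctx, d_out):
        # Autograd can deliver d_out with non-dense strides (expanded or
        # transposed upstream gradients); normalize it first, mirroring
        # the guard added in this commit.
        d_out = d_out.contiguous()
        (x,) = ctx.saved_tensors
        return 2 * x * d_out

x = torch.randn(4, 8, requires_grad=True)
y = Square.apply(x)
# Transposing downstream means the gradient reaching Square.backward
# carries non-contiguous strides; without the guard, a stride-oblivious
# kernel would read the wrong elements.
y.t().sum().backward()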
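As a closing usage note, not part of this commit: a hedged sanity-check sketch for the fix, assuming transformer_engine.pytorch.DotProductAttention, the NVTE_FLASH_ATTN / NVTE_FUSED_ATTN environment variables (read at import time) for backend selection, and a CUDA GPU with cuDNN v9. Shapes, dtypes, and parameter values are illustrative.

import os
# Assumed backend selectors; set before transformer_engine.pytorch is imported.
os.environ["NVTE_FLASH_ATTN"] = "0"
os.environ["NVTE_FUSED_ATTN"] = "1"

import torch
import transformer_engine.pytorch as te

seq, batch, heads, head_dim = 512, 2, 16, 64
attn = te.DotProductAttention(num_attention_heads=heads, kv_channels=head_dim,
                              attention_dropout=0.0)

# Default qkv layout is (seq, batch, heads, head_dim).
q, k, v = [torch.randn(seq, batch, heads, head_dim, dtype=torch.bfloat16,
                       device="cuda", requires_grad=True) for _ in range(3)]

out = attn(q, k, v)
out.sum().backward()
assert not torch.isnan(out).any(), "NaN in fused-attention forward output"
assert not torch.isnan(q.grad).any(), "NaN in fused-attention dgrad"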
