
Commit 3183842

MARD1NO and StrongSpoon authored
Dev attention[SiliconFlow] (#236)
* init attention
* triton 2.3.0 cannot use make_block_ptr like the tutorial; change to use the original tl.load
* fix some names and add GQA support
* support different seqlens for query and kv
* support attn bias
* add mask to prevent illegal memory access; add unittest
* add perf script
* add coverage test for attention
* refine test script
* fix perf script
* fix unittest; remove writing M
* add more configs and preload_v
* [bugfix]
* address review comments
* fix invalid config; set allow_tf32=false to improve perf; prune some configs
* add more dtypes
* stage3 slightly improved
* add missing and
* add more shapes
* remove fp32 dtype
* [Operator] early prune configs to be compatible with triton 2.2
* [bugfix] deliver positional args for triton v3

---------

Co-authored-by: strongspoon <[email protected]>
1 parent f562266 commit 3183842
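For orientation, here is a minimal usage sketch, assuming a CUDA build of FlagGems: flag_gems.enable() installs the ATen overrides registered in src/flag_gems/__init__.py below, after which torch.nn.functional.scaled_dot_product_attention is expected to dispatch to the new Triton kernel. The shapes and dtypes are illustrative only, not part of this commit.

import torch
import flag_gems

# Patch the registered ATen ops, including the new
# scaled_dot_product_attention entry added in this commit.
flag_gems.enable()

# Illustrative tensors in (batch, num_heads, seq_len, head_size) layout,
# matching the benchmark shapes added in benchmark/core_shapes.yaml.
q = torch.randn(4, 8, 1024, 128, device="cuda", dtype=torch.float16)
k = torch.randn(4, 8, 1024, 128, device="cuda", dtype=torch.float16)
v = torch.randn(4, 8, 1024, 128, device="cuda", dtype=torch.float16)

# With FlagGems enabled, this call is expected to route to the Triton
# attention kernel instead of the stock PyTorch implementation.
out = torch.nn.functional.scaled_dot_product_attention(q, k, v, is_causal=True)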

File tree

8 files changed: +533 -1 lines changed


benchmark/core_shapes.yaml (+8 lines)

@@ -170,3 +170,11 @@ ConvBenchmark:
     - [16, 32, 24, 24, 24, 3, 3, 1, 1, 2]
     - [16, 32, 24, 24, 24, 3, 3, 2, 2, 2]
     - [16, 32, 24, 24, 24, 3, 3, 1, 2, 2]
+
+AttentionBenchmark:
+  shapes:
+    - [4, 8, 512, 128]
+    - [4, 8, 1024, 128]
+    - [4, 8, 2048, 128]
+    - [4, 8, 3072, 128]
+    - [4, 8, 4096, 128]
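Each shape appears to follow the (batch, num_heads, seq_len, head_size) layout used by the perf test below. As a back-of-the-envelope sketch (the 2-FLOPs-per-multiply-add convention and the causal halving are standard approximations, not something stated in this commit), the largest shape works out as follows:

# Rough FLOP count for the largest AttentionBenchmark shape, assuming the
# (batch, num_heads, seq_len, head_size) layout of the perf test below.
batch, num_heads, seq_len, head_size = 4, 8, 4096, 128

# Two matmuls dominate (Q @ K^T and P @ V), each contributing
# 2 * seq_len^2 * head_size FLOPs per (batch, head) pair.
total_flops = batch * num_heads * 4 * seq_len**2 * head_size
causal_flops = total_flops / 2  # the causal mask skips roughly half the tiles

print(f"full: {total_flops / 1e9:.0f} GFLOPs, causal: {causal_flops / 1e9:.0f} GFLOPs")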

benchmark/test_attention_perf.py (new file, +35 lines)

@@ -0,0 +1,35 @@
+import pytest
+import torch
+
+from .performance_utils import GenericBenchmark
+
+
+class AttentionBenchmark(GenericBenchmark):
+    """
+    benchmark for attention
+    """
+
+    def set_more_shapes(self):
+        # self.shapes is a list of tuples, each containing four elements:
+        # (batch, num_heads, seq_len, head_size).
+        return None
+
+
+@pytest.mark.attention
+def test_perf_scaled_dot_product_attention():
+    def scaled_dot_product_attention_kwargs(shape, dtype, device):
+        query = torch.randn(shape, device=device, dtype=dtype)
+        key = torch.randn(shape, device=device, dtype=dtype)
+        value = torch.randn(shape, device=device, dtype=dtype)
+        yield query, key, value, None, 0.0, True
+
+    bench = AttentionBenchmark(
+        op_name="scaled_dot_product_attention",
+        input_fn=scaled_dot_product_attention_kwargs,
+        torch_op=torch.nn.functional.scaled_dot_product_attention,
+        dtypes=[
+            torch.float16,
+            torch.bfloat16,
+        ],
+    )
+    bench.run()
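The positional tuple yielded by scaled_dot_product_attention_kwargs lines up with the leading parameters of torch.nn.functional.scaled_dot_product_attention. The keyword form below is a sketch for readability only; the tensor sizes are stand-ins, not values from this commit.

import torch

# Stand-in tensors with the benchmark's layout: (batch, num_heads, seq_len, head_size).
query = torch.randn(4, 8, 512, 128, device="cuda", dtype=torch.float16)
key = torch.randn(4, 8, 512, 128, device="cuda", dtype=torch.float16)
value = torch.randn(4, 8, 512, 128, device="cuda", dtype=torch.float16)

# Keyword form of the yielded positional tuple (query, key, value, None, 0.0, True):
out = torch.nn.functional.scaled_dot_product_attention(
    query,
    key,
    value,
    attn_mask=None,   # no additive attention bias
    dropout_p=0.0,    # dropout disabled for benchmarking
    is_causal=True,   # causal masking enabled
)

Because of the @pytest.mark.attention marker, this benchmark can be selected on its own with pytest's -m attention filter.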

src/flag_gems/__init__.py (+5 lines)

@@ -146,6 +146,11 @@ def enable(lib=aten_lib, unused=None, registrar=registrar):
         ("prod.dim_int", prod_dim, Autograd.disable),
         ("sum", sum, Autograd.disable),
         ("sum.dim_IntList", sum_dim, Autograd.disable),
+        (
+            "scaled_dot_product_attention",
+            scaled_dot_product_attention,
+            Autograd.disable,
+        ),
         ("all", all, Autograd.disable),
         ("all.dim", all_dim, Autograd.disable),
         ("all.dims", all_dims, Autograd.disable),

src/flag_gems/ops/__init__.py (+2 lines)

@@ -6,6 +6,7 @@
 from .any import any, any_dim, any_dims
 from .arange import arange, arange_start
 from .argmax import argmax
+from .attention import scaled_dot_product_attention
 from .bitwise_and import (
     bitwise_and_scalar,
     bitwise_and_scalar_tensor,

@@ -274,6 +275,7 @@
     "repeat_interleave_self_int",
     "vstack",
     "repeat_interleave_tensor",
+    "scaled_dot_product_attention",
     "conv2d",
     "conv1d",
     "_conv_depthwise2d",
