{#
Copyright (c) Meta Platforms, Inc. and affiliates.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
#}

TASK: Optimize the following Triton kernel, using the hardware profiling analysis below, to improve its performance.

{% if gpu_specs %}
TARGET GPU:
{% if gpu_specs.name %}- GPU: {{ gpu_specs.name }}
{% endif %}
{% if gpu_specs.architecture %}- Architecture: {{ gpu_specs.architecture }}
{% endif %}
{% if gpu_specs.peak_memory_bw_gbps %}- Peak Memory Bandwidth: {{ gpu_specs.peak_memory_bw_gbps }} GB/s
{% endif %}
{% if gpu_specs.peak_fp32_tflops %}- Peak FP32: {{ gpu_specs.peak_fp32_tflops }} TFLOPS
{% endif %}
{% if gpu_specs.peak_fp16_tflops %}- Peak FP16: {{ gpu_specs.peak_fp16_tflops }} TFLOPS
{% endif %}
{% if gpu_specs.peak_bf16_tflops %}- Peak BF16: {{ gpu_specs.peak_bf16_tflops }} TFLOPS
{% endif %}
{% if gpu_specs.sm_count %}- SM Count: {{ gpu_specs.sm_count }}
{% endif %}
{% if gpu_specs.max_threads_per_sm %}- Max Threads per SM: {{ gpu_specs.max_threads_per_sm }}
{% endif %}
{% if gpu_specs.l1_cache_kb %}- L1 Cache per SM: {{ gpu_specs.l1_cache_kb }} KB
{% endif %}
{% if gpu_specs.l2_cache_mb %}- L2 Cache (Total): {{ gpu_specs.l2_cache_mb }} MB
{% endif %}
{% if gpu_specs.memory_gb %}- Memory: {{ gpu_specs.memory_gb }} GB {{ gpu_specs.memory_type | default('') }}
{% endif %}

{% endif %}
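{#
Illustrative only (not rendered): given a hypothetical gpu_specs dict such as
{"name": "NVIDIA A100-SXM4-80GB", "sm_count": 108, "l2_cache_mb": 40}, the
block above renders only the keys that are present, roughly as:

TARGET GPU:
- GPU: NVIDIA A100-SXM4-80GB
- SM Count: 108
- L2 Cache (Total): 40 MB
#}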
PROBLEM DESCRIPTION:
{{ problem_description }}
{% if pytorch_baseline_ms %}
PyTorch Eager baseline: {{ "%.4f"|format(pytorch_baseline_ms) }} ms
{% endif %}

CURRENT KERNEL IMPLEMENTATION:
```python
{{ kernel_code }}
```

OPTIMIZATION STRATEGY ({{ bottleneck_label }}):
Hardware profiling with NVIDIA Nsight Compute (NCU) identified the following bottleneck:
- Category: {{ bottleneck.category | default('unknown') }}
- Root Cause: {{ bottleneck.root_cause | default('N/A') }}
- Suggested Optimization: {{ bottleneck.suggestion | default('N/A') }}
- Expected Improvement: {{ bottleneck.expected_improvement | default('N/A') }}

{% if error_feedback %}
PREVIOUS ATTEMPT FAILED:
{{ error_feedback }}

{% endif %}
PERFORMANCE TARGET:
{% if target_ms %}
- Achieve at least a 1.25x speedup over PyTorch Eager (target: <= {{ "%.4f"|format(target_ms) }} ms)
{% else %}
- Achieve a 20-100% performance improvement over the baseline
{% endif %}
- Maintain numerical correctness (torch.allclose with atol=1e-4, rtol=1e-4)
- Preserve the public API (same inputs/outputs, shapes, dtypes)

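{#
Illustrative only (not rendered): a minimal sketch of the correctness check the
target above implies, assuming hypothetical `example_inputs` and a `reference_fn`
eager implementation; neither name is part of the real harness.

import torch

out = kernel_function(*example_inputs)  # optimized Triton path
ref = reference_fn(*example_inputs)     # PyTorch Eager reference
assert torch.allclose(out, ref, atol=1e-4, rtol=1e-4)
#}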
CRITICAL REQUIREMENTS:
1. Apply the optimization strategy described above to address the identified bottleneck
2. The implementation must be a complete, valid Python file
3. The entry point must be a function named 'kernel_function' that wraps the actual Triton kernel
4. Focus on the specific optimization while maintaining correctness
5. Keep the wrapper free of PyTorch compute primitives

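{#
Illustrative only (not rendered): a minimal sketch of the file shape the
requirements above describe, using a hypothetical elementwise-add kernel. The
kernel name `_add_kernel` and its argument list are placeholders, not part of
the real task.

import torch
import triton
import triton.language as tl

@triton.jit
def _add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    # Each program instance handles one BLOCK_SIZE-wide contiguous slice.
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)

def kernel_function(x, y):
    # Wrapper allocates the output and launches the kernel; no PyTorch
    # compute primitives beyond allocation.
    out = torch.empty_like(x)
    n_elements = out.numel()
    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)
    _add_kernel[grid](x, y, out, n_elements, BLOCK_SIZE=1024)
    return out
#}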
OUTPUT FORMAT:
1. Output complete optimized kernel code in ```python blocks
2. Include only: imports, Triton kernel (@triton.jit), wrapper function (kernel_function)
3. No testing code, benchmarks, or explanatory comments

Generate the complete optimized kernel implementation:
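{#
Illustrative only (not rendered): a minimal sketch of how this template might be
rendered, assuming it is saved as "optimize_kernel.j2" under a templates/
directory (both names are hypothetical) and that the caller supplies the
variables used above.

from jinja2 import Environment, FileSystemLoader

env = Environment(loader=FileSystemLoader("templates"))
template = env.get_template("optimize_kernel.j2")
prompt = template.render(
    gpu_specs={"name": "NVIDIA A100-SXM4-80GB", "sm_count": 108},
    problem_description="Elementwise add over large float32 tensors",
    pytorch_baseline_ms=1.2345,
    kernel_code="...",  # current Triton source, elided here
    bottleneck_label="memory-bound",
    bottleneck={
        "category": "memory",
        "root_cause": "uncoalesced global loads",
        "suggestion": "use contiguous block-wide accesses",
        "expected_improvement": "~1.3x",
    },
    error_feedback=None,
    target_ms=0.9876,
)
#}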