
Commit 72ac4d1

Author: Kaiming Cheng
Commit message: fix
1 parent: e7ba29a

File tree: 3 files changed (+116, -85 lines)

kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs.py

Lines changed: 12 additions & 66 deletions
@@ -22,71 +22,16 @@
 """

 import subprocess
-from typing import Any, Dict, Optional
-
-# GPU specifications database
-# Sources: NVIDIA official specifications, manufacturer datasheets
-GPU_SPECS_DATABASE = {
-    "NVIDIA A100": {
-        "name": "NVIDIA A100",
-        "architecture": "Ampere",
-        "peak_fp32_tflops": 19.5,
-        "peak_fp16_tflops": 312.0,
-        "peak_bf16_tflops": 312.0,
-        "peak_memory_bw_gbps": 1555,
-        "sm_count": 108,
-        "max_threads_per_sm": 2048,
-        "l1_cache_kb": 192,
-        "l2_cache_mb": 40,
-        "memory_gb": 40,
-        "memory_type": "HBM2e",
-    },
-    "NVIDIA H100": {
-        "name": "NVIDIA H100",
-        "architecture": "Hopper",
-        "peak_fp32_tflops": 51.0,
-        "peak_fp16_tflops": 989.0,
-        "peak_bf16_tflops": 989.0,
-        "peak_memory_bw_gbps": 3352,
-        "sm_count": 132,
-        "max_threads_per_sm": 2048,
-        "l1_cache_kb": 256,
-        "l2_cache_mb": 50,
-        "memory_gb": 80,
-        "memory_type": "HBM3",
-    },
-    "NVIDIA RTX 4090": {
-        "name": "NVIDIA RTX 4090",
-        "architecture": "Ada Lovelace",
-        "peak_fp32_tflops": 82.6,
-        "peak_fp16_tflops": 165.0,
-        "peak_bf16_tflops": 165.0,
-        "peak_memory_bw_gbps": 1008,
-        "sm_count": 128,
-        "max_threads_per_sm": 1536,
-        "l1_cache_kb": 128,
-        "l2_cache_mb": 72,
-        "memory_gb": 24,
-        "memory_type": "GDDR6X",
-    },
-    "NVIDIA RTX 5080": {
-        "name": "NVIDIA RTX 5080",
-        "architecture": "Blackwell",
-        "peak_fp32_tflops": 57.0,
-        "peak_fp16_tflops": 114.0,
-        "peak_bf16_tflops": 114.0,
-        "peak_memory_bw_gbps": 960,
-        "sm_count": 84,
-        "max_threads_per_sm": 1536,
-        "l1_cache_kb": 128,
-        "l2_cache_mb": 64,
-        "memory_gb": 16,
-        "memory_type": "GDDR7",
-    },
-}
-
-
-def query_gpu_name() -> Optional[str]:
+from typing import Any
+
+from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs_database import (
+    GPU_SPECS_DATABASE,
+)
+
+__all__ = ["GPU_SPECS_DATABASE", "query_gpu_name", "get_gpu_specs"]
+
+
+def query_gpu_name() -> str | None:
     """
     Query GPU name using nvidia-smi.

@@ -109,7 +54,7 @@ def query_gpu_name() -> Optional[str]:
     return None


-def get_gpu_specs(gpu_name: Optional[str] = None) -> Dict[str, Any]:
+def get_gpu_specs(gpu_name: str | None = None) -> dict[str, Any]:
     """
     Get GPU specifications for bottleneck analysis.

@@ -179,6 +124,7 @@ def get_gpu_specs(gpu_name: Optional[str] = None) -> Dict[str, Any]:
         print(f"\nDetected GPU: {detected_name}")
     else:
         print("\nNo GPU detected (nvidia-smi not available)")
+        exit()

     # Get specs
     specs = get_gpu_specs()
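
Because the table is re-exported and listed in `__all__`, existing `from ...gpu_specs import GPU_SPECS_DATABASE` callers keep working. As a minimal sketch of how these per-GPU peaks feed bottleneck classification, the roofline ridge-point arithmetic below is illustrative, not code from this commit:

```python
# Illustrative only: the ridge point is the arithmetic intensity (FLOP/byte)
# above which a kernel on this GPU is compute-bound rather than memory-bound.
from kernel_perf_agent.kernel_opt.diagnose_prompt.gpu_specs import (
    GPU_SPECS_DATABASE,  # still importable here via the re-export
)

h100 = GPU_SPECS_DATABASE["NVIDIA H100"]
peak_flops = h100["peak_fp16_tflops"] * 1e12    # FLOP/s
peak_bytes = h100["peak_memory_bw_gbps"] * 1e9  # bytes/s (table lists GB/s)
print(f"fp16 ridge point: {peak_flops / peak_bytes:.0f} FLOP/byte")  # ~295
```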
kernel_perf_agent/kernel_opt/diagnose_prompt/gpu_specs_database.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+GPU Specifications Database
+
+This module contains the GPU hardware specifications database used for
+performance analysis and bottleneck identification. Separated into its
+own file to allow easier module overriding.
+
+Sources: NVIDIA official specifications, manufacturer datasheets
+"""
+
+GPU_SPECS_DATABASE: dict[str, dict[str, object]] = {
+    "NVIDIA A100": {
+        "name": "NVIDIA A100",
+        "architecture": "Ampere",
+        "peak_fp32_tflops": 19.5,
+        "peak_fp16_tflops": 312.0,
+        "peak_bf16_tflops": 312.0,
+        "peak_memory_bw_gbps": 1555,
+        "sm_count": 108,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 192,
+        "l2_cache_mb": 40,
+        "memory_gb": 40,
+        "memory_type": "HBM2e",
+    },
+    "NVIDIA H100": {
+        "name": "NVIDIA H100",
+        "architecture": "Hopper",
+        "peak_fp32_tflops": 51.0,
+        "peak_fp16_tflops": 989.0,
+        "peak_bf16_tflops": 989.0,
+        "peak_memory_bw_gbps": 3352,
+        "sm_count": 132,
+        "max_threads_per_sm": 2048,
+        "l1_cache_kb": 256,
+        "l2_cache_mb": 50,
+        "memory_gb": 80,
+        "memory_type": "HBM3",
+    },
+    "NVIDIA RTX 4090": {
+        "name": "NVIDIA RTX 4090",
+        "architecture": "Ada Lovelace",
+        "peak_fp32_tflops": 82.6,
+        "peak_fp16_tflops": 165.0,
+        "peak_bf16_tflops": 165.0,
+        "peak_memory_bw_gbps": 1008,
+        "sm_count": 128,
+        "max_threads_per_sm": 1536,
+        "l1_cache_kb": 128,
+        "l2_cache_mb": 72,
+        "memory_gb": 24,
+        "memory_type": "GDDR6X",
+    },
+    "NVIDIA RTX 5080": {
+        "name": "NVIDIA RTX 5080",
+        "architecture": "Blackwell",
+        "peak_fp32_tflops": 57.0,
+        "peak_fp16_tflops": 114.0,
+        "peak_bf16_tflops": 114.0,
+        "peak_memory_bw_gbps": 960,
+        "sm_count": 84,
+        "max_threads_per_sm": 1536,
+        "l1_cache_kb": 128,
+        "l2_cache_mb": 64,
+        "memory_gb": 16,
+        "memory_type": "GDDR7",
+    },
+}
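
The docstring gives the reason for the split: easier module overriding. A sketch of what that could look like downstream (the A100 80GB entry and its 2039 GB/s figure are our assumption for illustration, not part of this commit):

```python
import copy

from kernel_perf_agent.kernel_opt.diagnose_prompt import gpu_specs_database

# Hypothetical override: clone the shipped A100 40GB entry and patch only the
# fields that differ for the 80GB part (values assumed, not from this diff).
a100_80 = copy.deepcopy(gpu_specs_database.GPU_SPECS_DATABASE["NVIDIA A100"])
a100_80["name"] = "NVIDIA A100 80GB"
a100_80["memory_gb"] = 80
a100_80["peak_memory_bw_gbps"] = 2039
gpu_specs_database.GPU_SPECS_DATABASE["NVIDIA A100 80GB"] = a100_80
```

Because gpu_specs.py imports the dict object itself, an in-place mutation like this is visible through both modules.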

kernel_perf_agent/kernel_opt/diagnose_prompt/judger_prompts.py

Lines changed: 22 additions & 19 deletions
@@ -32,7 +32,7 @@
 Metric definitions are in metric_schema.py.
 """

-from typing import Any, Callable, Dict, List, Optional, Tuple
+from typing import Any, Callable

 from .metric_schema import GPU_MEMORY_FIELDS, GPU_SPEC_FIELDS, NCU_METRIC_SECTIONS

@@ -42,23 +42,26 @@
 # =============================================================================


-def render_problem_description(problem_description: str) -> List[str]:
+def render_problem_description(problem_description: str) -> list[str]:
     """Render the problem description section."""
     return ["## Problem Description", "", problem_description]


-def render_kernel_code(kernel_code: str, language: str = "python") -> List[str]:
+def render_kernel_code(kernel_code: str, language: str = "python") -> list[str]:
     """Render the kernel code section with syntax highlighting."""
     return ["", "## Current Kernel Code", "", f"```{language}", kernel_code, "```"]


-def render_gpu_specs(gpu_specs: Dict[str, Any]) -> List[str]:
+def render_gpu_specs(gpu_specs: dict[str, Any]) -> list[str]:
     """Render the GPU hardware specifications section."""
     lines = ["", "## GPU Hardware Specifications", ""]

     for label, key, unit in GPU_SPEC_FIELDS:
         value = gpu_specs.get(key, "N/A")
-        lines.append(f"- **{label}:** {value}{unit}")
+        if value == "N/A":
+            lines.append(f"- **{label}:** N/A")
+        else:
+            lines.append(f"- **{label}:** {value}{unit}")

     for label, size_key, type_key, size_unit in GPU_MEMORY_FIELDS:
         size_value = gpu_specs.get(size_key, "N/A")

@@ -69,9 +72,9 @@ def render_gpu_specs(gpu_specs: Dict[str, Any]) -> List[str]:


 def render_ncu_metrics(
-    ncu_metrics: Dict[str, Any],
+    ncu_metrics: dict[str, Any],
     get_metric_fn: Callable[[str, str], str],
-) -> List[str]:
+) -> list[str]:
     """Render the NCU profiling metrics section."""
     lines = ["", "## NCU Profiling Metrics"]

@@ -85,7 +88,7 @@
     return lines


-def render_task_instructions() -> List[str]:
+def render_task_instructions() -> list[str]:
     """Render the task instructions section for dual-bottleneck analysis."""
     return [
         "",

@@ -102,7 +105,7 @@ def render_task_instructions() -> List[str]:
     ]


-def create_metric_getter(kernel_metrics: Dict[str, Any]) -> Callable[[str, str], str]:
+def create_metric_getter(kernel_metrics: dict[str, Any]) -> Callable[[str, str], str]:
     """Create a metric getter function for a specific kernel's metrics."""

     def get_metric(key: str, default: str = "N/A") -> str:

@@ -172,9 +175,9 @@ def get_metric(key: str, default: str = "N/A") -> str:
 def build_judge_optimization_prompt(
     kernel_code: str,
     problem_description: str,
-    ncu_metrics: Dict[str, Any],
-    gpu_specs: Dict[str, Any],
-) -> Tuple[str, str]:
+    ncu_metrics: dict[str, Any],
+    gpu_specs: dict[str, Any],
+) -> tuple[str, str]:
     """
     Build system and user prompts for Judge to analyze bottleneck.

@@ -209,7 +212,7 @@
         raise ValueError("NCU metrics are empty - cannot build judge prompt")

     # Extract first kernel's metrics for the metric getter
-    first_kernel = list(ncu_metrics.values())[0] if ncu_metrics else {}
+    first_kernel = list(ncu_metrics.values())[0]
     get_metric_fn = create_metric_getter(first_kernel)

     # Build user prompt using modular section renderers

@@ -226,7 +229,7 @@
     return JUDGE_SYSTEM_PROMPT, user_prompt


-def extract_judge_response(response_text: str) -> Optional[Dict[str, Any]]:
+def extract_judge_response(response_text: str) -> dict[str, Any] | None:
     """
     Extract and parse JSON from Judge LLM response.

@@ -302,7 +305,7 @@ def extract_judge_response(response_text: str) -> Optional[Dict[str, Any]]:
     return None


-def validate_judge_response(analysis: Dict[str, Any]) -> bool:
+def validate_judge_response(analysis: dict[str, Any]) -> bool:
     """Validate that Judge response contains required dual-bottleneck fields."""
     if "bottleneck_1" not in analysis or "bottleneck_2" not in analysis:
         return False

@@ -311,12 +314,12 @@
 ) and _validate_bottleneck_entry(analysis["bottleneck_2"])


-VALID_CATEGORIES = frozenset(
-    ["memory-bound", "compute-bound", "occupancy-limited", "latency-bound"]
-)
+VALID_CATEGORIES = {
+    "memory-bound", "compute-bound", "occupancy-limited", "latency-bound"
+}


-def _validate_bottleneck_entry(bottleneck: Dict[str, Any]) -> bool:
+def _validate_bottleneck_entry(bottleneck: dict[str, Any]) -> bool:
     """Validate a single bottleneck entry."""
     required = [
         "category",
