wandb · alex-woosuk-kwon · Sep 29, 2025 · Sep 30, 2025 · Oct 1, 2025 · Oct 1, 2025
diff --git a/configs/config-solar-pro2-250909-high-reasoning.yaml b/configs/config-solar-pro2-250909-high-reasoning.yaml
@@ -0,0 +1,24 @@
+wandb:
+  run_name: 'upstage/solar-pro2-250909-high-reasoning'
+  project: 'nejumi-leaderboard-4-upstage'
+
+api: upstage
+batch_size: 32
+testmode: true  # Run a lightweight test with a small number of questions
+inference_interval: 0.1  # 100ms delay between API calls to prevent rate limiting
+
+model:
+  pretrained_model_name_or_path: solar-pro2
+  bfcl_model_id: "solar-pro2"  
+  base_model: "unknown"
+  size_category: api
+  size: null
+  release_date: 09/09/2025
+
+generator:
+  reasoning_effort: "high"
+
+# BFCL-specific generator configuration (used by UpstageHandler)
+bfcl:
+  generator_config:
+    reasoning_effort: "high"
diff --git a/configs/config-solar-pro2-250909-minimal-reasoning.yaml b/configs/config-solar-pro2-250909-minimal-reasoning.yaml
@@ -0,0 +1,44 @@
+wandb:
+  run_name: 'upstage/solar-pro2-250909-minimal-reasoning'
+  project: 'nejumi-leaderboard-4-upstage'
+
+api: upstage
+batch_size: 32
+testmode: true  # Run a lightweight test with a small number of questions
+inference_interval: 0.1  # 100ms delay between API calls to prevent rate limiting
+
+# Override the run configuration so that only swebench, mtbench, and toxicity are evaluated.
+# REMOVE this override once testing is done.
+run:
+  bfcl: false
+  swebench: true
+  mtbench: true
+  jbbq: false
+  toxicity: true
+  jtruthfulqa: false
+  hle: false
+  hallulens: false
+  arc_agi: false
+  m_ifeval: false
+  jaster: false
+  jmmlu_robustness: false
+  aggregate: false
+
+model:
+  pretrained_model_name_or_path: solar-pro2
+  bfcl_model_id: "solar-pro2"  
+  base_model: "unknown"
+  size_category: api
+  size: null
+  release_date: 09/09/2025
+
+generator:
+  reasoning_effort: "minimal"
+
+# BFCL-specific generator configuration (used by UpstageHandler)
+bfcl:
+  generator_config:
+    reasoning_effort: "minimal"
+
+swebench:
+  evaluation_method: 'docker'
diff --git a/scripts/evaluator/evaluate_utils/bfcl_pkg/SUPPORTED_MODELS.md b/scripts/evaluator/evaluate_utils/bfcl_pkg/SUPPORTED_MODELS.md
@@ -163,7 +163,7 @@ These unified handlers eliminate the need to configure individual model-specific
 | xLAM-2-3b-fc-r                                 | Function Calling | Self-hosted 💻 | Salesforce/xLAM-2-3b-fc-r                                   |
 | xLAM-2-70b-fc-r                                | Function Calling | Self-hosted 💻 | Salesforce/Llama-xLAM-2-70b-fc-r                            |
 | xLAM-2-8b-fc-r                                 | Function Calling | Self-hosted 💻 | Salesforce/Llama-xLAM-2-8b-fc-r                             |
-| Upstage (Generic Handler)                      | Function Calling | Upstage        | upstage-FC                                                  |
+| Upstage (Generic Handler)                      | Prompt           | Upstage        | solar-pro2                                         |
 | PLaMo-2.0-Prime                                | Function Calling | Preferred AI   | PLaMo-2.0-Prime-FC                                         |
 | PLaMo-2.0-Prime                                | Prompt           | Preferred AI   | PLaMo-2.0-Prime                                            |
 

diff --git a/scripts/evaluator/evaluate_utils/bfcl_pkg/bfcl/constants/model_config.py b/scripts/evaluator/evaluate_utils/bfcl_pkg/bfcl/constants/model_config.py
@@ -1241,6 +1241,18 @@ class ModelConfig:
         is_fc_model=False,
         underscore_to_dot=False,
     ),
+    "solar-pro2": ModelConfig(
+        model_name="solar-pro2",
+        display_name="solar-pro2 (Prompt)",
+        url="https://console.upstage.ai/api/chat",
+        org="Upstage",
+        license="Proprietary",
+        model_handler=UpstageHandler,
+        input_price=None,  # Add pricing information if available
+        output_price=None,
+        is_fc_model=False,
+        underscore_to_dot=False,
+    ),
 }
 
 # Inference through local hosting

diff --git a/scripts/evaluator/evaluate_utils/bfcl_pkg/bfcl/constants/supported_models.py b/scripts/evaluator/evaluate_utils/bfcl_pkg/bfcl/constants/supported_models.py
@@ -103,7 +103,7 @@
     "qwq-32b-FC",
     "qwq-32b",
     "xiaoming-14B",
-    "upstage-FC",
+    "solar-pro2",
     "deepseek-ai/DeepSeek-R1",
     "google/gemma-3-1b-it",
     "google/gemma-3-4b-it",