
Commit 03b5399

test(utils.py): handle scenario where text tokens + reasoning tokens set, but reasoning tokens not charged separately (#10165)

* test(utils.py): handle scenario where text tokens + reasoning tokens set, but reasoning tokens not charged separately

  Addresses #10141 (comment)

* fix(vertex_and_google_ai_studio.py): only set content if non-empty str
1 parent 99db1b7 · commit 03b5399

5 files changed: +90 -49 lines changed

litellm/litellm_core_utils/llm_cost_calc/utils.py (+20 -12)
@@ -265,9 +265,10 @@ def generic_cost_per_token(
     )
 
     ## CALCULATE OUTPUT COST
-    text_tokens = usage.completion_tokens
+    text_tokens = 0
     audio_tokens = 0
     reasoning_tokens = 0
+    is_text_tokens_total = False
     if usage.completion_tokens_details is not None:
         audio_tokens = (
             cast(
@@ -281,7 +282,7 @@ def generic_cost_per_token(
                 Optional[int],
                 getattr(usage.completion_tokens_details, "text_tokens", None),
             )
-            or usage.completion_tokens  # default to completion tokens, if this field is not set
+            or 0  # default to completion tokens, if this field is not set
         )
         reasoning_tokens = (
             cast(
@@ -290,6 +291,11 @@ def generic_cost_per_token(
             )
             or 0
         )
+
+    if text_tokens == 0:
+        text_tokens = usage.completion_tokens
+    if text_tokens == usage.completion_tokens:
+        is_text_tokens_total = True
     ## TEXT COST
     completion_cost = float(text_tokens) * completion_base_cost
 
@@ -302,19 +308,21 @@ def generic_cost_per_token(
     )
 
     ## AUDIO COST
-    if (
-        _output_cost_per_audio_token is not None
-        and audio_tokens is not None
-        and audio_tokens > 0
-    ):
+    if not is_text_tokens_total and audio_tokens is not None and audio_tokens > 0:
+        _output_cost_per_audio_token = (
+            _output_cost_per_audio_token
+            if _output_cost_per_audio_token is not None
+            else completion_base_cost
+        )
         completion_cost += float(audio_tokens) * _output_cost_per_audio_token
 
     ## REASONING COST
-    if (
-        _output_cost_per_reasoning_token is not None
-        and reasoning_tokens
-        and reasoning_tokens > 0
-    ):
+    if not is_text_tokens_total and reasoning_tokens and reasoning_tokens > 0:
+        _output_cost_per_reasoning_token = (
+            _output_cost_per_reasoning_token
+            if _output_cost_per_reasoning_token is not None
+            else completion_base_cost
+        )
         completion_cost += float(reasoning_tokens) * _output_cost_per_reasoning_token
 
     return prompt_cost, completion_cost
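
For readers skimming the diff: the behavioral change is that reasoning tokens are no longer silently dropped when a model defines no output_cost_per_reasoning_token; they now fall back to the base completion rate, and the new is_text_tokens_total flag skips the add-ons when text_tokens already equals completion_tokens, so the same tokens are never billed twice. A minimal standalone sketch of that output-side logic (illustrative only; output_cost_sketch is not a litellm function, and the real code also handles audio tokens and tiered pricing):

from typing import Optional

def output_cost_sketch(
    completion_tokens: int,
    text_tokens: Optional[int],
    reasoning_tokens: Optional[int],
    output_cost_per_token: float,
    output_cost_per_reasoning_token: Optional[float] = None,
) -> float:
    # Mirrors the post-patch flow of generic_cost_per_token (output side only).
    text = text_tokens or 0
    reasoning = reasoning_tokens or 0
    if text == 0:
        text = completion_tokens  # no text-token detail reported: bill everything as text
    # If text already covers the whole completion, skip the reasoning add-on
    # so the same tokens are not billed twice.
    is_text_tokens_total = text == completion_tokens
    cost = float(text) * output_cost_per_token
    if not is_text_tokens_total and reasoning > 0:
        rate = (
            output_cost_per_reasoning_token
            if output_cost_per_reasoning_token is not None
            else output_cost_per_token  # new fallback: charge at the base rate
        )
        cost += float(reasoning) * rate
    return cost

# o1-mini-style usage from the new test: 626 text + 952 reasoning = 1578 completion
# tokens, with no separate reasoning price. The total equals the full completion
# count billed at the base rate (rate here is illustrative).
assert round(output_cost_sketch(1578, 626, 952, 1e-6), 12) == round(1578 * 1e-6, 12)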

litellm/llms/vertex_ai/gemini/vertex_and_google_ai_studio_gemini.py (+9 -8)
@@ -587,14 +587,15 @@ def get_assistant_content_message(
                 _content_str += "data:{};base64,{}".format(
                     part["inlineData"]["mimeType"], part["inlineData"]["data"]
                 )
-            if part.get("thought") is True:
-                if reasoning_content_str is None:
-                    reasoning_content_str = ""
-                reasoning_content_str += _content_str
-            else:
-                if content_str is None:
-                    content_str = ""
-                content_str += _content_str
+            if len(_content_str) > 0:
+                if part.get("thought") is True:
+                    if reasoning_content_str is None:
+                        reasoning_content_str = ""
+                    reasoning_content_str += _content_str
+                else:
+                    if content_str is None:
+                        content_str = ""
+                    content_str += _content_str
 
         return content_str, reasoning_content_str
 
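
The second fix is easiest to see with a tool-call-only response: a functionCall part contributes no text, so _content_str stays empty, and before this commit the else branch still ran and coerced content_str from None to "". A compressed sketch of the guarded flow (plain dicts stand in for HttpxPartType; assistant_content_sketch is illustrative, not litellm API, and omits the inlineData handling):

from typing import List, Optional, Tuple

def assistant_content_sketch(parts: List[dict]) -> Tuple[Optional[str], Optional[str]]:
    content_str: Optional[str] = None
    reasoning_content_str: Optional[str] = None
    for part in parts:
        _content_str = ""
        if "text" in part:
            _content_str += part["text"]
        # A functionCall-only part adds nothing, so _content_str stays "".
        if len(_content_str) > 0:  # the guard this commit adds
            if part.get("thought") is True:
                reasoning_content_str = (reasoning_content_str or "") + _content_str
            else:
                content_str = (content_str or "") + _content_str
    return content_str, reasoning_content_str

# Tool-call-only response: both values stay None instead of content becoming "".
assert assistant_content_sketch(
    [{"functionCall": {"name": "get_current_weather", "arguments": "{}"}}]
) == (None, None)
assert assistant_content_sketch([{"text": "hi"}]) == ("hi", None)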

litellm/model_prices_and_context_window_backup.json (-29)
@@ -4979,35 +4979,6 @@
         "supports_tool_choice": true
     },
     "gemini-2.5-pro-exp-03-25": {
-        "max_tokens": 65536,
-        "max_input_tokens": 1048576,
-        "max_output_tokens": 65536,
-        "max_images_per_prompt": 3000,
-        "max_videos_per_prompt": 10,
-        "max_video_length": 1,
-        "max_audio_length_hours": 8.4,
-        "max_audio_per_prompt": 1,
-        "max_pdf_size_mb": 30,
-        "input_cost_per_token": 0,
-        "input_cost_per_token_above_200k_tokens": 0,
-        "output_cost_per_token": 0,
-        "output_cost_per_token_above_200k_tokens": 0,
-        "litellm_provider": "vertex_ai-language-models",
-        "mode": "chat",
-        "supports_system_messages": true,
-        "supports_function_calling": true,
-        "supports_vision": true,
-        "supports_audio_input": true,
-        "supports_video_input": true,
-        "supports_pdf_input": true,
-        "supports_response_schema": true,
-        "supports_tool_choice": true,
-        "supported_endpoints": ["/v1/chat/completions", "/v1/completions"],
-        "supported_modalities": ["text", "image", "audio", "video"],
-        "supported_output_modalities": ["text"],
-        "source": "https://cloud.google.com/vertex-ai/generative-ai/pricing"
-    },
-    "gemini-2.5-pro-preview-03-25": {
         "max_tokens": 65536,
         "max_input_tokens": 1048576,
         "max_output_tokens": 65536,

tests/litellm/litellm_core_utils/llm_cost_calc/test_llm_cost_calc_utils.py (+41)
@@ -26,6 +26,47 @@
 from litellm.types.utils import Usage
 
 
+def test_reasoning_tokens_no_price_set():
+    model = "o1-mini"
+    custom_llm_provider = "openai"
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+    model_cost_map = litellm.model_cost[model]
+    usage = Usage(
+        completion_tokens=1578,
+        prompt_tokens=17,
+        total_tokens=1595,
+        completion_tokens_details=CompletionTokensDetailsWrapper(
+            accepted_prediction_tokens=None,
+            audio_tokens=None,
+            reasoning_tokens=952,
+            rejected_prediction_tokens=None,
+            text_tokens=626,
+        ),
+        prompt_tokens_details=PromptTokensDetailsWrapper(
+            audio_tokens=None, cached_tokens=None, text_tokens=17, image_tokens=None
+        ),
+    )
+    prompt_cost, completion_cost = generic_cost_per_token(
+        model=model,
+        usage=usage,
+        custom_llm_provider="openai",
+    )
+    assert round(prompt_cost, 10) == round(
+        model_cost_map["input_cost_per_token"] * usage.prompt_tokens,
+        10,
+    )
+    print(f"completion_cost: {completion_cost}")
+    expected_completion_cost = (
+        model_cost_map["output_cost_per_token"] * usage.completion_tokens
+    )
+    print(f"expected_completion_cost: {expected_completion_cost}")
+    assert round(completion_cost, 10) == round(
+        expected_completion_cost,
+        10,
+    )
+
+
 def test_reasoning_tokens_gemini():
     model = "gemini-2.5-flash-preview-04-17"
     custom_llm_provider = "gemini"
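
The numbers in this test are chosen so that text_tokens (626) plus reasoning_tokens (952) exactly equals completion_tokens (1578). Because o1-mini defines no output_cost_per_reasoning_token, the new fallback bills both slices at output_cost_per_token, so the expected total is simply 1578 * output_cost_per_token; the pre-patch code billed only the 626 text tokens and dropped the reasoning slice entirely.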

tests/litellm/llms/vertex_ai/gemini/test_vertex_and_google_ai_studio_gemini.py (+20)
@@ -239,3 +239,23 @@ def test_vertex_ai_thinking_output_part():
     content, reasoning_content = v.get_assistant_content_message(parts=parts)
     assert content == "Hello world"
     assert reasoning_content == "I'm thinking..."
+
+
+def test_vertex_ai_empty_content():
+    from litellm.llms.vertex_ai.gemini.vertex_and_google_ai_studio_gemini import (
+        VertexGeminiConfig,
+    )
+    from litellm.types.llms.vertex_ai import HttpxPartType
+
+    v = VertexGeminiConfig()
+    parts = [
+        HttpxPartType(
+            functionCall={
+                "name": "get_current_weather",
+                "arguments": "{}",
+            },
+        ),
+    ]
+    content, reasoning_content = v.get_assistant_content_message(parts=parts)
+    assert content is None
+    assert reasoning_content is None
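
This test is the regression check for the second bullet of the commit message: a parts list containing only a functionCall used to flip content_str from None to "", which downstream callers could misread as an empty assistant message; with the non-empty guard, both content and reasoning_content stay None.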
