Fix memory step model output in ToolCallingAgent (#1156)

keetrap · aymeric-roucher · commit 4e71e9b44856 · 2025-04-10T18:19:41.000+02:00
diff --git a/examples/smolagents_benchmark/run.py b/examples/smolagents_benchmark/run.py
@@ -141,12 +141,12 @@ def answer_single_question(example, model, answers_file, action_type):
     try:
         if action_type == "vanilla":
             answer = agent([{"role": "user", "content": augmented_question}]).content
-            token_count = agent.last_output_token_count
+            token_counts = agent.monitor.get_total_token_counts()
             intermediate_steps = answer
         else:
             # Run agent 🚀
             answer = str(agent.run(augmented_question))
-            token_count = agent.monitor.get_total_token_counts()
+            token_counts = agent.monitor.get_total_token_counts()
             # Remove memory from logs to make them more compact.
             for step in agent.memory.steps:
                 if isinstance(step, ActionStep):
@@ -157,6 +157,8 @@ def answer_single_question(example, model, answers_file, action_type):
     except Exception as e:
         print("Error on ", augmented_question, e)
         intermediate_steps = []
+        token_counts = {"input": 0, "output": 0}
+        answer = str(e)
     end_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
     annotated_example = {
         "model_id": model.model_id,
@@ -169,7 +171,7 @@ def answer_single_question(example, model, answers_file, action_type):
         "intermediate_steps": intermediate_steps,
         "start_time": start_time,
         "end_time": end_time,
-        "token_counts": token_count,
+        "token_counts": token_counts,
     }
     append_answer(annotated_example, answers_file)
 
@@ -233,7 +235,7 @@ def answer_questions(
             max_completion_tokens=8192,
         )
     else:
-        model = HfApiModel(model_id=args.model_id, provider="together", max_tokens=8192)
+        model = HfApiModel(model_id=args.model_id, max_tokens=8192)
 
     answer_questions(
         eval_ds,
diff --git a/examples/smolagents_benchmark/score.ipynb b/examples/smolagents_benchmark/score.ipynb
diff --git a/src/smolagents/agents.py b/src/smolagents/agents.py
@@ -994,7 +994,7 @@ def step(self, memory_step: ActionStep) -> Union[None, Any]:
         tool_call = model_message.tool_calls[0]
         tool_name, tool_call_id = tool_call.function.name, tool_call.id
         tool_arguments = tool_call.function.arguments
-
+        memory_step.model_output = str(f"Called Tool: '{tool_name}' with arguments: {tool_arguments}")
         memory_step.tool_calls = [ToolCall(name=tool_name, arguments=tool_arguments, id=tool_call_id)]
 
         # Execute
diff --git a/src/smolagents/utils.py b/src/smolagents/utils.py
@@ -186,7 +186,7 @@ def parse_code_blobs(text: str) -> str:
     Raises:
         ValueError: If no valid code block is found in the text.
     """
-    pattern = r"```(?:py|python)?\n(.*?)\n```"
+    pattern = r"```(?:py|python)?\s*\n(.*?)\n```"
     matches = re.findall(pattern, text, re.DOTALL)
     if matches:
         return "\n\n".join(match.strip() for match in matches)
diff --git a/tests/test_agents.py b/tests/test_agents.py
@@ -317,7 +317,7 @@ def test_fake_toolcalling_agent(self):
         assert "7.2904" in output
         assert agent.memory.steps[0].task == "What is 2 multiplied by 3.6452?"
         assert "7.2904" in agent.memory.steps[1].observations
-        assert agent.memory.steps[2].model_output is None
+        assert agent.memory.steps[2].model_output == "Called Tool: 'final_answer' with arguments: {'answer': '7.2904'}"
 
     def test_toolcalling_agent_handles_image_tool_outputs(self, shared_datadir):
         import PIL.Image
@@ -495,6 +495,15 @@ def test_replay_shows_logs(self):
         assert 'final_answer("got' in str_output
         assert "```<end_code>" in str_output
 
+        agent = ToolCallingAgent(tools=[PythonInterpreterTool()], model=FakeToolCallModel(), verbosity_level=0)
+        agent.run("What is 2 multiplied by 3.6452?")
+        with agent.logger.console.capture() as capture:
+            agent.replay()
+        str_output = capture.get().replace("\n", "")
+        assert "Called" in str_output
+        assert "Tool" in str_output
+        assert "arguments" in str_output
+
     def test_code_nontrivial_final_answer_works(self):
         def fake_code_model_final_answer(messages, stop_sequences=None, grammar=None):
             return ChatMessage(
diff --git a/tests/test_monitoring.py b/tests/test_monitoring.py
@@ -160,7 +160,7 @@ def test_streaming_agent_image_output(self):
             )
         )
 
-        self.assertEqual(len(outputs), 5)
+        self.assertEqual(len(outputs), 6)
         final_message = outputs[-1]
         self.assertEqual(final_message.role, "assistant")
         self.assertIsInstance(final_message.content, dict)
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -113,25 +113,14 @@ def test_parse_code_blobs(self):
         output = parse_code_blobs(code_blob)
         assert output == code_blob
 
-    def test_multiple_code_blobs(self):
-        test_input = """Here's a function that adds numbers:
-```python
-def add(a, b):
-    return a + b
-```
-And here's a function that multiplies them:
-```py
-def multiply(a, b):
-    return a * b
-```"""
+        # Allow whitespaces after header
+        output = parse_code_blobs("```py    \ncode_a\n````")
+        assert output == "code_a"
 
-        expected_output = """def add(a, b):
-    return a + b
-
-def multiply(a, b):
-    return a * b"""
+    def test_multiple_code_blobs(self):
+        test_input = "```\nFoo\n```\n\n```py\ncode_a\n````\n\n```python\ncode_b\n```"
         result = parse_code_blobs(test_input)
-        assert result == expected_output
+        assert result == "Foo\n\ncode_a\n\ncode_b"
 
 
 @pytest.fixture(scope="function")

Original file line number	Diff line number	Diff line change
`@@ -160,7 +160,7 @@ def test_streaming_agent_image_output(self):`
`160`	`160`	`)`
`161`	`161`	`)`
`162`	`162`
`163`		`- self.assertEqual(len(outputs), 5)`
	`163`	`+ self.assertEqual(len(outputs), 6)`
`164`	`164`	`final_message = outputs[-1]`
`165`	`165`	`self.assertEqual(final_message.role, "assistant")`
`166`	`166`	`self.assertIsInstance(final_message.content, dict)`