
Commit 0595454

Add expected_output to the evaluation request
1 parent 72d2feb commit 0595454

File tree

src/root_signals_mcp/client.py
src/root_signals_mcp/evaluator.py
src/root_signals_mcp/schema.py
src/root_signals_mcp/test/test_evaluator.py

4 files changed, +23 -11 lines changed


src/root_signals_mcp/client.py

Lines changed: 16 additions & 2 deletions
@@ -120,7 +120,12 @@ async def list_evaluators(self) -> list[dict[str, Any]]:
         return result.get("evaluators", [])  # type: ignore
 
     async def run_evaluation(
-        self, evaluator_id: str, request: str, response: str, contexts: list[str] | None = None
+        self,
+        evaluator_id: str,
+        request: str,
+        response: str,
+        contexts: list[str] | None = None,
+        expected_output: str | None = None,
     ) -> dict[str, Any]:
         """Run a standard evaluation using a RootSignals evaluator by ID.
@@ -129,6 +134,7 @@ async def run_evaluation(
             request: The user request/query
             response: The model's response to evaluate
             contexts: Optional list of contexts (policy files, examples, etc.) used for generation. Only used for evaluators that require contexts.
+            expected_output: Optional expected LLM response. Only used for evaluators that require expected output.
 
         Returns:
             Evaluation result with score and justification
@@ -138,12 +144,18 @@ async def run_evaluation(
             "request": request,
             "response": response,
             "contexts": contexts,
+            "expected_output": expected_output,
         }
 
         return await self.call_tool("run_evaluation", arguments)
 
     async def run_evaluation_by_name(
-        self, evaluator_name: str, request: str, response: str, contexts: list[str] | None = None
+        self,
+        evaluator_name: str,
+        request: str,
+        response: str,
+        contexts: list[str] | None = None,
+        expected_output: str | None = None,
     ) -> dict[str, Any]:
         """Run a standard evaluation using a RootSignals evaluator by name.
@@ -152,6 +164,7 @@ async def run_evaluation_by_name(
             request: The user request/query
             response: The model's response to evaluate
             contexts: Optional list of contexts (policy files, examples, etc.) used for generation. Only used for evaluators that require contexts.
+            expected_output: Optional expected LLM response. Only used for evaluators that require expected output.
 
         Returns:
             Evaluation result with score and justification
@@ -161,6 +174,7 @@ async def run_evaluation_by_name(
             "request": request,
             "response": response,
             "contexts": contexts,
+            "expected_output": expected_output,
         }
 
         return await self.call_tool("run_evaluation_by_name", arguments)
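For orientation, here is a minimal usage sketch of the updated client call. The client class name and its construction are assumptions (neither is shown in this diff); only the run_evaluation parameters, including the new expected_output, come from the hunks above.

    # Hypothetical usage sketch; class name and construction are assumptions.
    import asyncio

    from root_signals_mcp.client import RootSignalsMCPClient  # assumed class name


    async def main() -> None:
        client = RootSignalsMCPClient()  # construction details not shown in this commit
        result = await client.run_evaluation(
            evaluator_id="evaluator-123",  # placeholder ID
            request="What is the capital of France?",
            response="Paris is the capital of France.",
            contexts=None,  # omit for non-RAG evaluators
            expected_output="Paris",  # new: gold-standard answer for evaluators that need it
        )
        print(result)  # evaluation result with score and justification


    asyncio.run(main())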

src/root_signals_mcp/evaluator.py

Lines changed: 2 additions & 0 deletions
@@ -116,6 +116,7 @@ async def run_evaluation(self, request: EvaluationRequest) -> EvaluationResponse
             request=request.request,
             response=request.response,
             contexts=request.contexts,
+            expected_output=request.expected_output,
         )
 
         return result
@@ -147,6 +148,7 @@ async def run_evaluation_by_name(self, request: EvaluationRequestByName) -> Eval
             request=request.request,
             response=request.response,
             contexts=request.contexts,
+            expected_output=request.expected_output,
         )
 
         return result
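The service layer now reads request.expected_output, which implies the EvaluationRequest and EvaluationRequestByName models carry the field. A rough sketch of that shape follows; only the field names visible in the hunks are taken from the commit, while the base class, the presence of evaluator_id, and the defaults are guesses.

    # Sketch of the implied request model; base class, extra fields, and defaults are assumptions.
    from pydantic import BaseModel


    class EvaluationRequest(BaseModel):
        evaluator_id: str  # assumed; not visible in the hunks above
        request: str
        response: str
        contexts: list[str] | None = None
        expected_output: str | None = None  # new optional field forwarded by the service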

src/root_signals_mcp/schema.py

Lines changed: 1 addition & 9 deletions
@@ -171,14 +171,6 @@ class RequiredInput(BaseModel):
     items: ArrayInputItem | None = None
 
 
-INPUTS_DESCRIPTION = """
-Schema defining the input parameters required for running the evaluator (run_evaluation parameters).
-If contexts are required, it means this is a RAG evaluator and you must pass contexts such as policy files, examples, etc.
-If expected_output is required, it means this is a gold standard output evaluator and you must pass the expected output.
-Request and response are required for almost all evaluators. Request is the user query and response is the model's response to evaluate.
-"""
-
-
 class EvaluatorInfo(BaseRootSignalsModel):
     """
     Model for evaluator information.
@@ -192,7 +184,7 @@ class EvaluatorInfo(BaseRootSignalsModel):
     intent: str | None = Field(None, description="Intent of the evaluator")
     inputs: dict[str, RequiredInput] = Field(
         ...,
-        description=INPUTS_DESCRIPTION,
+        description="Schema defining the input parameters required for running the evaluator (run_evaluation parameters).",
     )
 
     @property
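With INPUTS_DESCRIPTION removed, per-evaluator input requirements remain discoverable through EvaluatorInfo.inputs. A small illustrative helper, assuming inputs is keyed by parameter name as the dict[str, RequiredInput] annotation suggests:

    # Illustrative helper; assumes EvaluatorInfo.inputs is keyed by parameter name.
    def needs_expected_output(evaluator_info) -> bool:
        """Return True if the evaluator declares expected_output among its inputs."""
        return "expected_output" in evaluator_info.inputs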

src/root_signals_mcp/test/test_evaluator.py

Lines changed: 4 additions & 0 deletions
@@ -161,6 +161,7 @@ async def test_run_evaluation_passes_correct_parameters(mock_api_client: MagicMo
         request="Test request",
         response="Test response",
         contexts=["Test context"],
+        expected_output="Test expected output",
     )
 
     result = await service.run_evaluation(request)
@@ -170,6 +171,7 @@ async def test_run_evaluation_passes_correct_parameters(mock_api_client: MagicMo
         request="Test request",
         response="Test response",
         contexts=["Test context"],
+        expected_output="Test expected output",
     )
 
     assert result.evaluator_name == "Test Evaluator"
@@ -195,6 +197,7 @@ async def test_run_evaluation_by_name_passes_correct_parameters(mock_api_client:
         request="Test request",
         response="Test response",
         contexts=["Test context"],
+        expected_output="Test expected output",
     )
 
     result = await service.run_evaluation_by_name(request)
@@ -204,6 +207,7 @@ async def test_run_evaluation_by_name_passes_correct_parameters(mock_api_client:
         request="Test request",
         response="Test response",
         contexts=["Test context"],
+        expected_output="Test expected output",
     )
 
     assert result.evaluator_name == "Test Evaluator"
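These tests check that expected_output set on the request model is forwarded unchanged to the underlying API client for both evaluation paths. A condensed, self-contained illustration of that forwarding check using only unittest.mock; the service and the awaited method name here are illustrative, not taken from the repository.

    # Self-contained illustration of the forwarding assertion these tests rely on;
    # the awaited method name is illustrative only.
    import asyncio
    from unittest.mock import AsyncMock


    async def demo() -> None:
        api_client = AsyncMock()

        # Stand-in for the service forwarding its request fields to the API client.
        await api_client.run_evaluator(
            request="Test request",
            response="Test response",
            contexts=["Test context"],
            expected_output="Test expected output",
        )

        api_client.run_evaluator.assert_awaited_once_with(
            request="Test request",
            response="Test response",
            contexts=["Test context"],
            expected_output="Test expected output",
        )


    asyncio.run(demo())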
