
Commit 5525325: Add expected_output to the evaluation request

Parent: 29fa78a

2 files changed: +22, -12 lines

src/root_signals_mcp/client.py (16 additions, 2 deletions)

```diff
@@ -120,7 +120,12 @@ async def list_evaluators(self) -> list[dict[str, Any]]:
         return result.get("evaluators", [])  # type: ignore

     async def run_evaluation(
-        self, evaluator_id: str, request: str, response: str, contexts: list[str] | None = None
+        self,
+        evaluator_id: str,
+        request: str,
+        response: str,
+        contexts: list[str] | None = None,
+        expected_output: str | None = None,
     ) -> dict[str, Any]:
         """Run a standard evaluation using a RootSignals evaluator by ID.

@@ -129,6 +134,7 @@ async def run_evaluation(
             request: The user request/query
             response: The model's response to evaluate
             contexts: Optional list of contexts (policy files, examples, etc.) used for generation. Only used for evaluators that require contexts.
+            expected_output: Optional expected LLM response. Only used for evaluators that require expected output.

         Returns:
             Evaluation result with score and justification
@@ -138,12 +144,18 @@ async def run_evaluation(
             "request": request,
             "response": response,
             "contexts": contexts,
+            "expected_output": expected_output,
         }

         return await self.call_tool("run_evaluation", arguments)

     async def run_evaluation_by_name(
-        self, evaluator_name: str, request: str, response: str, contexts: list[str] | None = None
+        self,
+        evaluator_name: str,
+        request: str,
+        response: str,
+        contexts: list[str] | None = None,
+        expected_output: str | None = None,
     ) -> dict[str, Any]:
         """Run a standard evaluation using a RootSignals evaluator by name.

@@ -152,6 +164,7 @@ async def run_evaluation_by_name(
             request: The user request/query
             response: The model's response to evaluate
             contexts: Optional list of contexts (policy files, examples, etc.) used for generation. Only used for evaluators that require contexts.
+            expected_output: Optional expected LLM response. Only used for evaluators that require expected output.

         Returns:
             Evaluation result with score and justification
@@ -161,6 +174,7 @@ async def run_evaluation_by_name(
             "request": request,
             "response": response,
             "contexts": contexts,
+            "expected_output": expected_output,
         }

         return await self.call_tool("run_evaluation_by_name", arguments)
```
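Both evaluation helpers now accept the optional expected_output argument. A minimal usage sketch follows; the client class name, its construction, and the evaluator ID are assumptions for illustration, only run_evaluation's signature comes from the diff above:

```python
import asyncio

from root_signals_mcp.client import RootSignalsMCPClient  # assumed class name


async def main() -> None:
    client = RootSignalsMCPClient()  # construction details are assumed
    result = await client.run_evaluation(
        evaluator_id="example-evaluator-id",  # placeholder, not a real ID
        request="What is the capital of France?",
        response="The capital of France is Paris.",
        expected_output="Paris",  # new optional gold-standard answer
    )
    # Per the docstring, the result carries a score and justification.
    print(result)


asyncio.run(main())
```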

src/root_signals_mcp/schema.py (6 additions, 10 deletions)

```diff
@@ -73,7 +73,11 @@ class BaseEvaluationRequest(BaseRootSignalsModel):
     response: str = Field(..., description="The AI assistant's response to evaluate")
     contexts: list[str] = Field(
         default=[],
-        description="List of required context strings for evaluation. This is only used for RAG evaluators that require contexts to be sent",
+        description="List of required context strings for evaluation. Used only for evaluators that have 'contexts' defined in their inputs.",
+    )
+    expected_output: str | None = Field(
+        default=None,
+        description="The expected LLM response. Used only for evaluators that have 'expected_output' defined in their inputs.",
     )

     @field_validator("request", "response")
@@ -167,14 +171,6 @@ class RequiredInput(BaseModel):
     items: ArrayInputItem | None = None


-INPUTS_DESCRIPTION = """
-Schema defining the input parameters required for running the evaluator (run_evaluation parameters).
-If 'contexts' is defined, it means this is a RAG evaluator and contexts should include policy files, examples, etc.
-If 'expected_output' is defined, it means this is a gold standard output evaluator and the expected output should be passed.
-Most evaluators require both request (the user query) and response (the model's response to evaluate) without them being explicitly defined.
-"""
-
-
 class EvaluatorInfo(BaseRootSignalsModel):
     """
     Model for evaluator information.
@@ -188,7 +184,7 @@ class EvaluatorInfo(BaseRootSignalsModel):
     intent: str | None = Field(None, description="Intent of the evaluator")
     inputs: dict[str, RequiredInput] = Field(
         ...,
-        description=INPUTS_DESCRIPTION,
+        description="Schema defining the input parameters required for running the evaluator (run_evaluation parameters).",
     )
```