Skip to content

Commit 0ecec82

Browse files
authored
Fix conversational metrics and metrics page pagination (#1421)
* fix(frontend): increase metrics fetch limit to prevent missing backend types

  Metrics page only fetched 100 items, causing orgs with many metrics to miss older DeepEval, Rhesis, and Ragas entries in the backend type tabs.

* fix(frontend): paginate metrics fetch to show all backend type tabs

  The metrics page fetched only the first 100 metrics, but the backend caps at 100 per request. Orgs with 100+ metrics lost DeepEval, Rhesis, and Ragas tabs since older metrics were outside the first page. Added getAllMetrics() that paginates through all pages.

* feat(frontend): add client-side pagination to metrics grid

* fix(metrics): thread conversation_history through evaluator pipeline

  Conversational metrics require conversation_history in evaluate(), but the evaluator only passed single-turn params, causing a TypeError. Add a conversation_history param to MetricEvaluator.evaluate(), introspect for conversation_history and goal in _call_metric_with_introspection(), build ConversationHistory from conversation_summary in multi-turn evaluation, and register ConversationalJudge in the native factory. Add an introspection completeness guard test that scans all metric classes to ensure every required evaluate() param is handled by the evaluator.

* style(frontend): fix indentation in metrics grid map callback

* fix(frontend): add pagination robustness guards

  - Break the getAllMetrics loop on an empty response to prevent infinite fetching
  - Clamp the page index when the filtered list shrinks (e.g. after a delete)

* fix(metrics): initialize _conversation_history in MetricEvaluator.__init__
1 parent 52a95cb commit 0ecec82

File tree

8 files changed

+492
-129
lines changed

8 files changed

+492
-129
lines changed

apps/backend/src/rhesis/backend/metrics/evaluator.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from rhesis.backend.metrics.score_evaluator import ScoreEvaluator
2020
from rhesis.backend.metrics.utils import diagnose_invalid_metric
2121
from rhesis.sdk.metrics import BaseMetric, MetricConfig, MetricResult
22+
from rhesis.sdk.metrics.conversational.types import ConversationHistory
2223
from rhesis.sdk.metrics.utils import backend_config_to_sdk_config
2324

2425
# Use inline factory creation to avoid circular imports
@@ -63,6 +64,7 @@ def __init__(
6364
self.model = model # Store default model for passing to metrics
6465
self.db = db # Database session for fetching metric-specific models
6566
self.organization_id = organization_id # For secure model lookups
67+
self._conversation_history: Optional[ConversationHistory] = None
6668

6769
@staticmethod
6870
def _get_config_value(
@@ -141,6 +143,7 @@ def evaluate(
141143
context: List[str],
142144
metrics: List[Union[Dict[str, Any], MetricConfig, MetricModel]],
143145
max_workers: int = 5,
146+
conversation_history: Optional[ConversationHistory] = None,
144147
) -> Dict[str, Any]:
145148
"""
146149
Compute metrics using the configured backends in parallel.
@@ -176,6 +179,9 @@ def evaluate(
176179
Returns:
177180
Dictionary containing scores and details for each metric
178181
"""
182+
# Store conversation history for conversational metrics
183+
self._conversation_history = conversation_history
184+
179185
if not metrics:
180186
logger.warning("No metrics provided for evaluation")
181187
return {}
@@ -892,6 +898,10 @@ def _call_metric_with_introspection(
892898
kwargs["expected_output"] = expected_output
893899
if "context" in params:
894900
kwargs["context"] = context
901+
if "conversation_history" in params and self._conversation_history is not None:
902+
kwargs["conversation_history"] = self._conversation_history
903+
if "goal" in params:
904+
kwargs["goal"] = input_text
895905

896906
logger.debug(f"Calling metric '{metric.name}' with parameters: {list(kwargs.keys())}")
897907

apps/backend/src/rhesis/backend/tasks/execution/evaluation.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from rhesis.backend.metrics.evaluator import MetricEvaluator
2020
from rhesis.backend.tasks.execution.constants import MetricScope
2121
from rhesis.sdk.metrics import MetricConfig
22+
from rhesis.sdk.metrics.conversational.types import ConversationHistory
2223

2324
from .response_extractor import extract_response_with_fallback
2425

@@ -134,28 +135,32 @@ def evaluate_multi_turn_metrics(
134135
return {}
135136

136137
# Evaluate each metric on the conversation using the MetricEvaluator
137-
# For multi-turn, we reconstruct the conversation as a single prompt/response
138-
# pair and evaluate with the standard evaluator pipeline.
139-
# This approach reuses the existing evaluator infrastructure.
140138
metrics_evaluator = MetricEvaluator(model=model, db=db, organization_id=organization_id)
141139

142-
# Build a prompt/response pair from the conversation for evaluation
140+
# Build ConversationHistory from conversation_summary for conversational metrics
141+
conversation_summary = stored_output.get("conversation_summary", [])
142+
messages = []
143143
conversation_text = ""
144-
for turn in stored_output.get("conversation_summary", []):
144+
for turn in conversation_summary:
145145
penelope_msg = turn.get("penelope_message", "")
146146
target_resp = turn.get("target_response", "")
147147
if penelope_msg:
148+
messages.append({"role": "user", "content": penelope_msg})
148149
conversation_text += f"User: {penelope_msg}\n"
149150
if target_resp:
151+
messages.append({"role": "assistant", "content": target_resp})
150152
conversation_text += f"Assistant: {target_resp}\n"
151153

154+
conversation_history = ConversationHistory.from_messages(messages) if messages else None
155+
152156
try:
153157
results = metrics_evaluator.evaluate(
154158
input_text=goal,
155159
output_text=conversation_text.strip(),
156160
expected_output="",
157161
context=[],
158162
metrics=metric_configs,
163+
conversation_history=conversation_history,
159164
)
160165
except Exception as e:
161166
logger.warning(f"Error evaluating multi-turn metrics: {str(e)}")

apps/frontend/src/app/(protected)/metrics/components/MetricsClient.tsx

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,7 @@ export default function MetricsClientComponent({
136136
sort_by: 'created_at',
137137
sort_order: 'desc',
138138
}),
139-
metricsClient.getMetrics({
140-
skip: 0,
141-
limit: 100,
139+
metricsClient.getAllMetrics({
142140
sort_by: 'created_at',
143141
sort_order: 'desc',
144142
}),
@@ -148,7 +146,7 @@ export default function MetricsClientComponent({
148146
const behaviorsData = behaviorsWithMetricsData;
149147

150148
// Use all metrics from the dedicated metrics endpoint
151-
const metricsData = allMetricsData.data || [];
149+
const metricsData = allMetricsData;
152150

153151
// Add behavior IDs to each metric for compatibility
154152
const metricsWithBehaviors = metricsData.map(metric => {

0 commit comments

Comments (0)