diff --git a/vertexai/preview/evaluation/_evaluation.py b/vertexai/preview/evaluation/_evaluation.py
index 37394a2684..54b2e51536 100644
--- a/vertexai/preview/evaluation/_evaluation.py
+++ b/vertexai/preview/evaluation/_evaluation.py
@@ -317,11 +317,31 @@ def _aggregate_summary_metrics(
             f"{metric.metric_name}/{constants.MetricResult.PAIRWISE_CHOICE_KEY}"
         )
         if pairwise_choice_col_name in metrics_table:
+            candidate_model_win_rate_choices = [
+                "CANDIDATE",
+                utils.RATING_TO_VERDICT["B>>A"],
+                utils.RATING_TO_VERDICT["A<<B"],
+                utils.RATING_TO_VERDICT["B>A"],
+                utils.RATING_TO_VERDICT["A<B"],
+            ]
+            baseline_model_win_rate_choices = [
+                "BASELINE",
+                utils.RATING_TO_VERDICT["A>>B"],
+                utils.RATING_TO_VERDICT["B<<A"],
+                utils.RATING_TO_VERDICT["B<A"],
+                utils.RATING_TO_VERDICT["A>B"],
+            ]
             summary_metrics[
                 f"{metric.metric_name}/candidate_model_win_rate"
-            ] = (metrics_table[pairwise_choice_col_name] == "CANDIDATE").mean()
+            ] = (
+                metrics_table[pairwise_choice_col_name].isin(
+                    candidate_model_win_rate_choices
+                )
+            ).mean()
             summary_metrics[f"{metric.metric_name}/baseline_model_win_rate"] = (
-                metrics_table[pairwise_choice_col_name] == "BASELINE"
+                metrics_table[pairwise_choice_col_name].isin(
+                    baseline_model_win_rate_choices
+                )
             ).mean()
         else:
             score_col_name = f"{str(metric)}/{constants.MetricResult.SCORE_KEY}"
diff --git a/vertexai/preview/evaluation/utils.py b/vertexai/preview/evaluation/utils.py
index 6427df107a..cdce8ad250 100644
--- a/vertexai/preview/evaluation/utils.py
+++ b/vertexai/preview/evaluation/utils.py
@@ -73,7 +73,7 @@
 )
 
 _SXS_RATING_REGEX = re.compile(r"\[\[(SxSRating:[AB<>=]+)\]\]", re.DOTALL)
-_RATING_TO_VERDICT = {
+RATING_TO_VERDICT = {
     "B>>A": "Candidate response is better than the baseline response.",
     "A<<B": "Candiate response is better than the baseline response.",
     "B>A": "Candidate response is slightly better than the baseline response.",
@@ -554,8 +554,8 @@ def parse_pairwise_rubric_result(
 
     return {
         "pairwise_choice": (
-            _RATING_TO_VERDICT[rating_str]
-            if rating_str in _RATING_TO_VERDICT
+            RATING_TO_VERDICT[rating_str]
+            if rating_str in RATING_TO_VERDICT
             else rating_str
         ),
         "score": (