
Commit d442c42

vertex-sdk-bot authored and copybara-github committed
feat: Update win rate calculation in GenAI Evaluation for rubric based evaluation
PiperOrigin-RevId: 747474971
1 parent 64386e9 commit d442c42

File tree

vertexai/preview/evaluation/_evaluation.py
vertexai/preview/evaluation/utils.py

2 files changed: +25 -5 lines changed


vertexai/preview/evaluation/_evaluation.py

+22 -2

@@ -317,11 +317,31 @@ def _aggregate_summary_metrics(
             f"{metric.metric_name}/{constants.MetricResult.PAIRWISE_CHOICE_KEY}"
         )
         if pairwise_choice_col_name in metrics_table:
+            candidate_model_win_rate_choices = [
+                "CANDIDATE",
+                utils.RATING_TO_VERDICT["B>>A"],
+                utils.RATING_TO_VERDICT["A<<B"],
+                utils.RATING_TO_VERDICT["B>A"],
+                utils.RATING_TO_VERDICT["A<B"],
+            ]
+            baseline_model_win_rate_choices = [
+                "BASELINE",
+                utils.RATING_TO_VERDICT["A>>B"],
+                utils.RATING_TO_VERDICT["B<<A"],
+                utils.RATING_TO_VERDICT["B<A"],
+                utils.RATING_TO_VERDICT["A>B"],
+            ]
             summary_metrics[
                 f"{metric.metric_name}/candidate_model_win_rate"
-            ] = (metrics_table[pairwise_choice_col_name] == "CANDIDATE").mean()
+            ] = (
+                metrics_table[pairwise_choice_col_name].isin(
+                    candidate_model_win_rate_choices
+                )
+            ).mean()
             summary_metrics[f"{metric.metric_name}/baseline_model_win_rate"] = (
-                metrics_table[pairwise_choice_col_name] == "BASELINE"
+                metrics_table[pairwise_choice_col_name].isin(
+                    baseline_model_win_rate_choices
+                )
             ).mean()
         else:
             score_col_name = f"{str(metric)}/{constants.MetricResult.SCORE_KEY}"
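
For context, this change makes rubric-based verdicts count toward the win rates: previously only the literal "CANDIDATE"/"BASELINE" choices were tallied, so rows whose pairwise choice held a RATING_TO_VERDICT sentence were never counted as wins for either side. Below is a minimal sketch of the new aggregation, assuming a pandas metrics table; only the verdict sentences are taken from the diff, while the metric name, column name, and sample rows are illustrative assumptions.

# Minimal sketch, not part of the commit: what the new .isin() aggregation does.
import pandas as pd

# Illustrative column name; the real one is built from the metric name and
# constants.MetricResult.PAIRWISE_CHOICE_KEY.
pairwise_choice_col_name = "pairwise_summarization_quality/pairwise_choice"

metrics_table = pd.DataFrame(
    {
        pairwise_choice_col_name: [
            "CANDIDATE",  # classic pairwise autorater choice
            "Candidate response is better than the baseline response.",  # rubric verdict
            "Candidate response is slightly better than the baseline response.",  # rubric verdict
            "BASELINE",  # baseline win
        ]
    }
)

# Trimmed stand-in for candidate_model_win_rate_choices in the diff: the literal
# "CANDIDATE" plus two of the RATING_TO_VERDICT sentences.
candidate_model_win_rate_choices = [
    "CANDIDATE",
    "Candidate response is better than the baseline response.",
    "Candidate response is slightly better than the baseline response.",
]

# New behavior: rubric verdict strings count toward the candidate win rate.
new_win_rate = (
    metrics_table[pairwise_choice_col_name].isin(candidate_model_win_rate_choices)
).mean()

# Old behavior: only the literal "CANDIDATE" choice counted, so rubric-based
# rows could never register as a win.
old_win_rate = (metrics_table[pairwise_choice_col_name] == "CANDIDATE").mean()

print(new_win_rate)  # 0.75
print(old_win_rate)  # 0.25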

vertexai/preview/evaluation/utils.py

+3 -3

@@ -73,7 +73,7 @@
 )
 _SXS_RATING_REGEX = re.compile(r"\[\[(SxSRating:[AB<>=]+)\]\]", re.DOTALL)
 
-_RATING_TO_VERDICT = {
+RATING_TO_VERDICT = {
     "B>>A": "Candidate response is better than the baseline response.",
     "A<<B": "Candiate response is better than the baseline response.",
     "B>A": "Candidate response is slightly better than the baseline response.",
@@ -554,8 +554,8 @@ def parse_pairwise_rubric_result(
 
     return {
         "pairwise_choice": (
-            _RATING_TO_VERDICT[rating_str]
-            if rating_str in _RATING_TO_VERDICT
+            RATING_TO_VERDICT[rating_str]
+            if rating_str in RATING_TO_VERDICT
             else rating_str
         ),
         "score": (

0 commit comments