
Commit 45def5c

Support mixed metrics for the skl interface. (#11536)
1 parent eabb5ed commit 45def5c

File tree: 3 files changed, +83 -7 lines changed

python-package/xgboost/sklearn.py

Lines changed: 36 additions & 7 deletions
@@ -1,5 +1,6 @@
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-lines
 """Scikit-Learn Wrapper interface for XGBoost."""
+import collections
 import copy
 import json
 import os
@@ -432,7 +433,7 @@ def task(i: int) -> float:
         - ``one_output_per_tree``: One model for each target.
         - ``multi_output_tree``: Use multi-target trees.

-    eval_metric : {Optional[Union[str, List[str], Callable]]}
+    eval_metric : {Optional[Union[str, List[Union[str, Callable]], Callable]]}

         .. versionadded:: 1.6.0

@@ -763,7 +764,7 @@ def __init__(
         max_cat_to_onehot: Optional[int] = None,
         max_cat_threshold: Optional[int] = None,
         multi_strategy: Optional[str] = None,
-        eval_metric: Optional[Union[str, List[str], Callable]] = None,
+        eval_metric: Optional[Union[str, List[Union[str, Callable]], Callable]] = None,
         early_stopping_rounds: Optional[int] = None,
         callbacks: Optional[List[TrainingCallback]] = None,
         **kwargs: Any,
@@ -1103,14 +1104,42 @@ def _duplicated(parameter: str) -> None:

         # - configure callable evaluation metric
         metric: Optional[Metric] = None
+
+        def custom_metric(m: Callable) -> Metric:
+            if self._get_type() == "ranker":
+                wrapped = ltr_metric_decorator(m, self.n_jobs)
+            else:
+                wrapped = _metric_decorator(m)
+            return wrapped
+
+        def invalid_type(m: Any) -> None:
+            msg = f"Invalid type for the `eval_metric`: {type(m)}"
+            raise TypeError(msg)
+
         if self.eval_metric is not None:
             if callable(self.eval_metric):
-                if self._get_type() == "ranker":
-                    metric = ltr_metric_decorator(self.eval_metric, self.n_jobs)
-                else:
-                    metric = _metric_decorator(self.eval_metric)
-            else:
+                metric = custom_metric(self.eval_metric)
+            elif isinstance(self.eval_metric, str):
                 params.update({"eval_metric": self.eval_metric})
+            else:
+                # A sequence of metrics
+                if not isinstance(self.eval_metric, collections.abc.Sequence):
+                    invalid_type(self.eval_metric)
+                # Could be a list of strings or callables
+                builtin_metrics: List[str] = []
+                for m in self.eval_metric:
+                    if callable(m):
+                        if metric is not None:
+                            raise NotImplementedError(
+                                "Using multiple custom metrics is not yet supported."
+                            )
+                        metric = custom_metric(m)
+                    elif isinstance(m, str):
+                        builtin_metrics.append(m)
+                    else:
+                        invalid_type(m)
+                if builtin_metrics:
+                    params.update({"eval_metric": builtin_metrics})

         if feature_weights is not None:
             _deprecated("feature_weights")
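For context, a minimal usage sketch of what the change above enables on the scikit-learn estimator, mirroring the new test added in this commit; the dataset and the choice of hinge_loss are illustrative only:

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.metrics import hinge_loss

X, y = make_classification(random_state=2025)

# A builtin metric name and an sklearn-style callable can now be mixed in one
# list: the string is forwarded to the Booster's eval_metric parameter, while
# the callable is wrapped and handed over as a custom metric.
clf = xgb.XGBClassifier(eval_metric=["logloss", hinge_loss], n_estimators=2)
clf.fit(X, y, eval_set=[(X, y)])

# Both metrics are recorded in the evaluation history for the first eval set,
# so the keys should include 'logloss' and 'hinge_loss'.
print(clf.evals_result()["validation_0"].keys())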

tests/python/test_with_sklearn.py

Lines changed: 23 additions & 0 deletions
@@ -1397,6 +1397,29 @@ def merror(y_true: np.ndarray, predt: np.ndarray):
     clf.fit(X, y, eval_set=[(X, y)])


+def test_mixed_metrics() -> None:
+    from sklearn.datasets import make_classification
+    from sklearn.metrics import hamming_loss, hinge_loss, log_loss
+
+    X, y = make_classification(random_state=2025)
+
+    clf = xgb.XGBClassifier(eval_metric=["logloss", hinge_loss], n_estimators=2)
+    clf.fit(X, y, eval_set=[(X, y)])
+    results = clf.evals_result()["validation_0"]
+    assert "logloss" in results
+    assert "hinge_loss" in results
+
+    clf = xgb.XGBClassifier(eval_metric=[hamming_loss, log_loss], n_estimators=2)
+    with pytest.raises(
+        NotImplementedError, match="multiple custom metrics is not yet supported."
+    ):
+        clf.fit(X, y, eval_set=[(X, y)])
+
+    clf = xgb.XGBClassifier(eval_metric=[123, log_loss], n_estimators=2)
+    with pytest.raises(TypeError, match="Invalid type for the `eval_metric`"):
+        clf.fit(X, y, eval_set=[(X, y)])
+
+
 def test_weighted_evaluation_metric():
     from sklearn.datasets import make_hastie_10_2
     from sklearn.metrics import log_loss

tests/test_distributed/test_with_dask/test_with_dask.py

Lines changed: 24 additions & 0 deletions
@@ -1677,6 +1677,30 @@ def sqr(
         results_custom = reg.evals_result()
         tm.non_increasing(results_custom["validation_0"]["rmse"])

+    @pytest.mark.skipif(**tm.no_sklearn())
+    def test_custom_metrics(self, client: "Client") -> None:
+        from sklearn.datasets import make_classification
+        from sklearn.metrics import hamming_loss, hinge_loss, log_loss
+
+        Xn, yn = make_classification(random_state=2025)
+        X, y = da.array(Xn), da.array(yn)
+
+        clf = dxgb.DaskXGBClassifier(
+            eval_metric=["logloss", hinge_loss], n_estimators=2
+        )
+        clf.fit(X, y, eval_set=[(X, y)])
+        results = clf.evals_result()["validation_0"]
+        assert "logloss" in results
+        assert "hinge_loss" in results
+
+        clf = dxgb.DaskXGBClassifier(
+            eval_metric=[hamming_loss, log_loss], n_estimators=2
+        )
+        with pytest.raises(
+            NotImplementedError, match="multiple custom metrics is not yet supported."
+        ):
+            clf.fit(X, y, eval_set=[(X, y)])
+
     def test_no_duplicated_partition(self) -> None:
         """Assert each worker has the correct amount of data, and DMatrix initialization
         doesn't generate unnecessary copies of data.
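For completeness, a minimal sketch of exercising the same Dask code path outside the test suite. The LocalCluster setup, worker count, chunk sizes, and explicit client attachment are assumptions for illustration and are not part of this commit:

import dask.array as da
import xgboost.dask as dxgb
from dask.distributed import Client, LocalCluster
from sklearn.datasets import make_classification
from sklearn.metrics import hinge_loss

if __name__ == "__main__":
    with LocalCluster(n_workers=2, threads_per_worker=1) as cluster:
        with Client(cluster) as client:
            Xn, yn = make_classification(random_state=2025)
            X = da.from_array(Xn, chunks=(50, Xn.shape[1]))
            y = da.from_array(yn, chunks=50)

            # The Dask estimator accepts the same mixed list of builtin metric
            # names and sklearn-style callables.
            clf = dxgb.DaskXGBClassifier(
                eval_metric=["logloss", hinge_loss], n_estimators=2
            )
            clf.client = client  # attach the client explicitly
            clf.fit(X, y, eval_set=[(X, y)])
            print(clf.evals_result()["validation_0"].keys())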
