[enc] Add a cat accessor to the booster. #11568


Merged 6 commits on Jul 17, 2025
32 changes: 20 additions & 12 deletions include/xgboost/gbm.h
@@ -24,6 +24,7 @@ namespace xgboost {
class Json;
class FeatureMap;
class ObjFunction;
class CatContainer;

struct Context;
struct LearnerModelParam;
@@ -135,12 +136,12 @@ class GradientBooster : public Model, public Configurable {
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) = 0;

/*!
* \brief dump the model in the requested format
* \param fmap feature map that may help give interpretations of feature
* \param with_stats extra statistics while dumping model
* \param format the format to dump the model in
* \return a vector of dump for boosters.
/**
* @brief dump the model in the requested format
* @param fmap feature map that may help give interpretations of feature
* @param with_stats extra statistics while dumping model
* @param format the format to dump the model in
* @return a vector of dump for boosters.
*/
[[nodiscard]] virtual std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const = 0;
@@ -149,12 +150,19 @@ class GradientBooster : public Model, public Configurable {
common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
std::vector<float>* scores) const = 0;
/*!
* \brief create a gradient booster from given name
* \param name name of gradient booster
* \param generic_param Pointer to runtime parameters
* \param learner_model_param pointer to global model parameters
* \return The created booster.
/**
* @brief Getter for categories.
*/
[[nodiscard]] virtual CatContainer const* Cats() const {
LOG(FATAL) << "Retrieving categories is not supported by the current booster.";
return nullptr;
}
/**
* @brief create a gradient booster from given name
* @param name name of gradient booster
* @param generic_param Pointer to runtime parameters
* @param learner_model_param pointer to global model parameters
* @return The created booster.
*/
static GradientBooster* Create(const std::string& name, Context const* ctx,
LearnerModelParam const* learner_model_param);
24 changes: 14 additions & 10 deletions include/xgboost/learner.h
@@ -1,6 +1,6 @@
/**
* Copyright 2015-2025, XGBoost Contributors
* \file learner.h
*
* \brief Learner interface that integrates objective, gbm and evaluation together.
* This is the user facing XGBoost training module.
* \author Tianqi Chen
@@ -35,6 +35,7 @@ class Json;
struct XGBAPIThreadLocalEntry;
template <typename T>
class HostDeviceVector;
class CatContainer;

enum class PredictionType : std::uint8_t { // NOLINT
kValue = 0,
@@ -167,11 +168,11 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
*/
virtual void SetParam(const std::string& key, const std::string& value) = 0;

/*!
* \brief Get the number of features of the booster.
* \return number of features
/**
* @brief Get the number of features of the booster.
* @return The number of features
*/
virtual uint32_t GetNumFeature() const = 0;
virtual bst_feature_t GetNumFeature() const = 0;

/*!
* \brief Set additional attribute to the Booster.
@@ -221,16 +222,19 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
* \param fn Output feature types
*/
virtual void GetFeatureTypes(std::vector<std::string>* ft) const = 0;

/**
* \brief Slice the model.
* @brief Getter for categories.
*/
[[nodiscard]] virtual CatContainer const* Cats() const = 0;
/**
* @brief Slice the model.
*
* See InplacePredict for layer parameters.
*
* \param step step size between slice.
* \param out_of_bound Return true if end layer is out of bound.
* @param step step size between slice.
* @param out_of_bound Return true if end layer is out of bound.
*
* \return a sliced model.
* @return a sliced model.
*/
virtual Learner* Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step,
bool* out_of_bound) = 0;
132 changes: 80 additions & 52 deletions python-package/xgboost/core.py
@@ -779,6 +779,64 @@ def inner_f(*args: Any, **kwargs: Any) -> _T:
_deprecate_positional_args = require_keyword_args(False)


def _get_categories(
cfn: Callable[[ctypes.c_char_p], int],
feature_names: Optional[FeatureNames],
n_features: int,
) -> Optional[Dict[str, "pa.DictionaryArray"]]:
if not is_pyarrow_available():
raise ImportError("`pyarrow` is required for exporting categories.")

if TYPE_CHECKING:
import pyarrow as pa
else:
pa = import_pyarrow()

fnames = feature_names
if fnames is None:
fnames = [str(i) for i in range(n_features)]

results: Dict[str, "pa.DictionaryArray"] = {}

ret = ctypes.c_char_p()
_check_call(cfn(ret))
if ret.value is None:
return None

retstr = ret.value.decode() # pylint: disable=no-member
jcats = json.loads(retstr)
assert isinstance(jcats, list) and len(jcats) == n_features

for fidx in range(n_features):
f_jcats = jcats[fidx]
if f_jcats is None:
# Numeric data
results[fnames[fidx]] = None
continue

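# No "offsets" field: numeric categories, serialized as a plain array interface.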
if "offsets" not in f_jcats:
values = from_array_interface(f_jcats)
pa_values = pa.Array.from_pandas(values)
results[fnames[fidx]] = pa_values
continue

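# String categories: Arrow-style variable-length layout, split into
# "offsets" and "values" buffers.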
joffsets = f_jcats["offsets"]
jvalues = f_jcats["values"]
offsets = from_array_interface(joffsets, True)
values = from_array_interface(jvalues, True)
pa_offsets = pa.array(offsets).buffers()
pa_values = pa.array(values).buffers()
assert (
pa_offsets[0] is None and pa_values[0] is None
), "Should not have null mask."
pa_dict = pa.StringArray.from_buffers(
len(offsets) - 1, pa_offsets[1], pa_values[1]
)
results[fnames[fidx]] = pa_dict

return results
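

For reference, a minimal standalone sketch of the Arrow buffer round-trip that `_get_categories` performs for string categories; the sample data here is illustrative, not taken from the PR:

```python
import numpy as np
import pyarrow as pa

# Arrow stores variable-length strings as one flat character buffer plus an
# int32 offsets buffer; [0, 3) -> "abc", [3, 6) -> "def".
offsets = np.array([0, 3, 6], dtype=np.int32)
values = np.frombuffer(b"abcdef", dtype=np.int8)

pa_offsets = pa.array(offsets).buffers()
pa_values = pa.array(values).buffers()
# buffers()[0] is the validity bitmap; it is None when no entries are null.
assert pa_offsets[0] is None and pa_values[0] is None

arr = pa.StringArray.from_buffers(len(offsets) - 1, pa_offsets[1], pa_values[1])
assert arr.to_pylist() == ["abc", "def"]
```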


@unique
class DataSplitMode(IntEnum):
"""Supported data split mode for DMatrix."""
@@ -1299,58 +1357,11 @@ def get_categories(self) -> Optional[Dict[str, "pa.DictionaryArray"]]:
.. versionadded:: 3.1.0

"""
if not is_pyarrow_available():
raise ImportError("`pyarrow` is required for exporting categories.")

if TYPE_CHECKING:
import pyarrow as pa
else:
pa = import_pyarrow()

n_features = self.num_col()
fnames = self.feature_names
if fnames is None:
fnames = [str(i) for i in range(n_features)]

results: Dict[str, "pa.DictionaryArray"] = {}

ret = ctypes.c_char_p()
_check_call(_LIB.XGBDMatrixGetCategories(self.handle, ctypes.byref(ret)))
if ret.value is None:
return None

retstr = ret.value.decode() # pylint: disable=no-member
jcats = json.loads(retstr)
assert isinstance(jcats, list) and len(jcats) == n_features

for fidx in range(n_features):
f_jcats = jcats[fidx]
if f_jcats is None:
# Numeric data
results[fnames[fidx]] = None
continue

if "offsets" not in f_jcats:
values = from_array_interface(f_jcats)
pa_values = pa.Array.from_pandas(values)
results[fnames[fidx]] = pa_values
continue

joffsets = f_jcats["offsets"]
jvalues = f_jcats["values"]
offsets = from_array_interface(joffsets, True)
values = from_array_interface(jvalues, True)
pa_offsets = pa.array(offsets).buffers()
pa_values = pa.array(values).buffers()
assert (
pa_offsets[0] is None and pa_values[0] is None
), "Should not have null mask."
pa_dict = pa.StringArray.from_buffers(
len(offsets) - 1, pa_offsets[1], pa_values[1]
)
results[fnames[fidx]] = pa_dict

return results
return _get_categories(
lambda ret: _LIB.XGBDMatrixGetCategories(self.handle, ctypes.byref(ret)),
self.feature_names,
self.num_col(),
)
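

A minimal usage sketch for the `DMatrix` accessor, assuming a pandas categorical input (the printed output is illustrative):

```python
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"c": pd.Categorical(["cdef", "abc", "abc"])})
Xy = xgb.DMatrix(df, enable_categorical=True)

cats = Xy.get_categories()
# One entry per feature; purely numeric features map to None.
assert cats is not None
print(cats["c"].to_pylist())  # ['abc', 'cdef'] -- pandas sorts the categories
```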

def num_row(self) -> int:
"""Get the number of rows in the DMatrix."""
@@ -2312,6 +2323,23 @@ def feature_names(self) -> Optional[FeatureNames]:
def feature_names(self, features: Optional[FeatureNames]) -> None:
self._set_feature_info(features, "feature_name")

def get_categories(self) -> Optional[Dict[str, "pa.DictionaryArray"]]:
"""Get the categories in the dataset using `pyarrow`. Returns `None` if there's
no categorical features.

.. warning::

This function is still a work in progress.

.. versionadded:: 3.1.0

"""
return _get_categories(
lambda ret: _LIB.XGBoosterGetCategories(self.handle, ctypes.byref(ret)),
self.feature_names,
self.num_features(),
)
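

And a round-trip sketch for the booster accessor, mirroring what the `comp_booster` test helper below verifies (hyperparameters are illustrative):

```python
import numpy as np
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"c": pd.Categorical(["a", "b", "a", "b"])})
y = np.array([0.0, 1.0, 0.0, 1.0])
Xy = xgb.DMatrix(df, label=y, enable_categorical=True)

bst = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=1)
# The booster carries the training-time categories alongside the model.
cats = bst.get_categories()
assert cats is not None and cats["c"].to_pylist() == ["a", "b"]
```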

def set_param(
self,
params: Union[Dict, Iterable[Tuple[str, Any]], str],
42 changes: 40 additions & 2 deletions python-package/xgboost/testing/ordinal.py
@@ -14,7 +14,12 @@
from ..core import DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
from ..data import _lazy_load_cudf_is_cat
from ..training import train
from .data import IteratorForTest, is_pd_cat_dtype, make_categorical
from .data import (
IteratorForTest,
is_pd_cat_dtype,
make_batches,
make_categorical,
)


def get_df_impl(device: str) -> Tuple[Type, Type]:
@@ -50,10 +55,24 @@ def assert_allclose(device: str, a: Any, b: Any) -> None:
cp.testing.assert_allclose(a, b)


def comp_booster(device: Literal["cpu", "cuda"], Xy: DMatrix, booster: str) -> None:
"""Compare the results from DMatrix and Booster."""
cats = Xy.get_categories()
assert cats is not None

rng = np.random.default_rng(2025)
Xy.set_label(rng.normal(size=Xy.num_row()))
bst = train({"booster": booster, "device": device}, Xy, 1)
cats_bst = bst.get_categories()
assert cats_bst is not None
for k, v in cats_bst.items():
assert v == cats[k]


def run_cat_container(device: Literal["cpu", "cuda"]) -> None:
"""Basic tests for the container class used by the DMatrix."""

def run_dispatch(device: str, DMatrixT: Type) -> None:
def run_dispatch(device: Literal["cpu", "cuda"], DMatrixT: Type) -> None:
Df, _ = get_df_impl(device)
# Basic test with a single feature
df = Df({"c": ["cdef", "abc"]}, dtype="category")
@@ -86,10 +105,16 @@ def run_dispatch(device: str, DMatrixT: Type) -> None:
assert_allclose(device, csr.indptr, np.array([0, 1, 1, 2, 3]))
assert_allclose(device, csr.indices, np.array([0, 0, 0]))

comp_booster(device, Xy, "gbtree")
comp_booster(device, Xy, "dart")

# Test with explicit null-terminated strings.
df = Df({"c": ["cdef", None, "abc", "abc\0"]}, dtype="category")
Xy = DMatrixT(df, enable_categorical=True)

comp_booster(device, Xy, "gbtree")
comp_booster(device, Xy, "dart")

for dm in (DMatrix, QuantileDMatrix):
run_dispatch(device, dm)

@@ -129,6 +154,7 @@ def check(Xy: DMatrix, X: pd.DataFrame) -> None:
assert cats[fname] is None

if not hasattr(Xy, "ref"): # not quantile DMatrix.
assert not isinstance(Xy, QuantileDMatrix)
with tempfile.TemporaryDirectory() as tmpdir:
fname = os.path.join(tmpdir, "DMatrix.binary")
Xy.save_binary(fname)
@@ -144,6 +170,9 @@ def check(Xy: DMatrix, X: pd.DataFrame) -> None:
else:
assert v_0.to_pylist() == v_1.to_pylist()

comp_booster(device, Xy, "gbtree")
comp_booster(device, Xy, "dart")

def run_dispatch(DMatrixT: Type) -> None:
# full str type
X, y = make_categorical(
Expand Down Expand Up @@ -216,6 +245,15 @@ def run_dispatch(DMatrixT: Type) -> None:
for dm in (DMatrix, QuantileDMatrix):
run_dispatch(dm)

batches = make_batches(
n_samples_per_batch=128, n_features=4, n_batches=1, use_cupy=device == "cuda"
)
X, y, w = map(lambda x: x[0], batches)
Xy = DMatrix(X, y, weight=w)
assert Xy.get_categories() is None
Xy = QuantileDMatrix(X, y, weight=w)
assert Xy.get_categories() is None


def run_cat_container_iter(device: Literal["cpu", "cuda"]) -> None:
"""Test the categories container for iterator-based inputs."""