[enc] Add a cat accessor to the booster. #11568


Merged 6 commits on Jul 17, 2025
32 changes: 20 additions & 12 deletions include/xgboost/gbm.h
@@ -24,6 +24,7 @@ namespace xgboost {
class Json;
class FeatureMap;
class ObjFunction;
class CatContainer;

struct Context;
struct LearnerModelParam;
@@ -135,12 +136,12 @@ class GradientBooster : public Model, public Configurable {
bst_layer_t layer_begin, bst_layer_t layer_end,
bool approximate) = 0;

/*!
* \brief dump the model in the requested format
* \param fmap feature map that may help give interpretations of feature
* \param with_stats extra statistics while dumping model
* \param format the format to dump the model in
* \return a vector of dump for boosters.
/**
* @brief dump the model in the requested format
* @param fmap feature map that may help give interpretations of feature
* @param with_stats extra statistics while dumping model
* @param format the format to dump the model in
* @return a vector of dump for boosters.
*/
[[nodiscard]] virtual std::vector<std::string> DumpModel(const FeatureMap& fmap, bool with_stats,
std::string format) const = 0;
@@ -149,12 +150,19 @@ class GradientBooster : public Model, public Configurable {
common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
std::vector<float>* scores) const = 0;
/*!
* \brief create a gradient booster from given name
* \param name name of gradient booster
* \param generic_param Pointer to runtime parameters
* \param learner_model_param pointer to global model parameters
* \return The created booster.
/**
* @brief Getter for categories.
*/
[[nodiscard]] virtual CatContainer const* Cats() const {
LOG(FATAL) << "Retrieving categories is not supported by the current booster.";
return nullptr;
}
/**
* @brief create a gradient booster from given name
* @param name name of gradient booster
* @param generic_param Pointer to runtime parameters
* @param learner_model_param pointer to global model parameters
* @return The created booster.
*/
static GradientBooster* Create(const std::string& name, Context const* ctx,
LearnerModelParam const* learner_model_param);
24 changes: 14 additions & 10 deletions include/xgboost/learner.h
@@ -1,6 +1,6 @@
/**
* Copyright 2015-2025, XGBoost Contributors
* \file learner.h
*
* \brief Learner interface that integrates objective, gbm and evaluation together.
* This is the user facing XGBoost training module.
* \author Tianqi Chen
@@ -35,6 +35,7 @@ class Json;
struct XGBAPIThreadLocalEntry;
template <typename T>
class HostDeviceVector;
class CatContainer;

enum class PredictionType : std::uint8_t { // NOLINT
kValue = 0,
@@ -167,11 +168,11 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
*/
virtual void SetParam(const std::string& key, const std::string& value) = 0;

/*!
* \brief Get the number of features of the booster.
* \return number of features
/**
* @brief Get the number of features of the booster.
* @return The number of features
*/
virtual uint32_t GetNumFeature() const = 0;
virtual bst_feature_t GetNumFeature() const = 0;

/*!
* \brief Set additional attribute to the Booster.
@@ -221,16 +222,19 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
* \param fn Output feature types
*/
virtual void GetFeatureTypes(std::vector<std::string>* ft) const = 0;

/**
* \brief Slice the model.
* @brief Getter for categories.
*/
[[nodiscard]] virtual CatContainer const* Cats() const = 0;
/**
* @brief Slice the model.
*
* See InplacePredict for layer parameters.
*
* \param step step size between slice.
* \param out_of_bound Return true if end layer is out of bound.
* @param step step size between slice.
* @param out_of_bound Return true if end layer is out of bound.
*
* \return a sliced model.
* @return a sliced model.
*/
virtual Learner* Slice(bst_layer_t begin, bst_layer_t end, bst_layer_t step,
bool* out_of_bound) = 0;
132 changes: 80 additions & 52 deletions python-package/xgboost/core.py
@@ -779,6 +779,64 @@ def inner_f(*args: Any, **kwargs: Any) -> _T:
_deprecate_positional_args = require_keyword_args(False)


def _get_categories(
cfn: Callable[[ctypes.c_char_p], int],
feature_names: Optional[FeatureNames],
n_features: int,
) -> Optional[Dict[str, "pa.DictionaryArray"]]:
if not is_pyarrow_available():
raise ImportError("`pyarrow` is required for exporting categories.")

if TYPE_CHECKING:
import pyarrow as pa
else:
pa = import_pyarrow()

fnames = feature_names
if fnames is None:
fnames = [str(i) for i in range(n_features)]

results: Dict[str, "pa.DictionaryArray"] = {}

ret = ctypes.c_char_p()
_check_call(cfn(ret))
if ret.value is None:
return None

retstr = ret.value.decode() # pylint: disable=no-member
jcats = json.loads(retstr)
assert isinstance(jcats, list) and len(jcats) == n_features

for fidx in range(n_features):
f_jcats = jcats[fidx]
if f_jcats is None:
# Numeric data
results[fnames[fidx]] = None
continue

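# No "offsets" field: numeric categories, serialized as a plain array interface.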
if "offsets" not in f_jcats:
values = from_array_interface(f_jcats)
pa_values = pa.Array.from_pandas(values)
results[fnames[fidx]] = pa_values
continue

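# String categories: Arrow-style variable-length layout, split into
# "offsets" and "values" buffers.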
joffsets = f_jcats["offsets"]
jvalues = f_jcats["values"]
offsets = from_array_interface(joffsets, True)
values = from_array_interface(jvalues, True)
pa_offsets = pa.array(offsets).buffers()
pa_values = pa.array(values).buffers()
assert (
pa_offsets[0] is None and pa_values[0] is None
), "Should not have null mask."
pa_dict = pa.StringArray.from_buffers(
len(offsets) - 1, pa_offsets[1], pa_values[1]
)
results[fnames[fidx]] = pa_dict

return results
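

For reference, a minimal standalone sketch of the Arrow buffer round-trip that `_get_categories` performs for string categories; the sample data here is illustrative, not taken from the PR:

```python
import numpy as np
import pyarrow as pa

# Arrow stores variable-length strings as one flat character buffer plus an
# int32 offsets buffer; [0, 3) -> "abc", [3, 6) -> "def".
offsets = np.array([0, 3, 6], dtype=np.int32)
values = np.frombuffer(b"abcdef", dtype=np.int8)

pa_offsets = pa.array(offsets).buffers()
pa_values = pa.array(values).buffers()
# buffers()[0] is the validity bitmap; it is None when no entries are null.
assert pa_offsets[0] is None and pa_values[0] is None

arr = pa.StringArray.from_buffers(len(offsets) - 1, pa_offsets[1], pa_values[1])
assert arr.to_pylist() == ["abc", "def"]
```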


@unique
class DataSplitMode(IntEnum):
"""Supported data split mode for DMatrix."""
@@ -1299,58 +1357,11 @@ def get_categories(self) -> Optional[Dict[str, "pa.DictionaryArray"]]:
.. versionadded:: 3.1.0

"""
if not is_pyarrow_available():
raise ImportError("`pyarrow` is required for exporting categories.")

if TYPE_CHECKING:
import pyarrow as pa
else:
pa = import_pyarrow()

n_features = self.num_col()
fnames = self.feature_names
if fnames is None:
fnames = [str(i) for i in range(n_features)]

results: Dict[str, "pa.DictionaryArray"] = {}

ret = ctypes.c_char_p()
_check_call(_LIB.XGBDMatrixGetCategories(self.handle, ctypes.byref(ret)))
if ret.value is None:
return None

retstr = ret.value.decode() # pylint: disable=no-member
jcats = json.loads(retstr)
assert isinstance(jcats, list) and len(jcats) == n_features

for fidx in range(n_features):
f_jcats = jcats[fidx]
if f_jcats is None:
# Numeric data
results[fnames[fidx]] = None
continue

if "offsets" not in f_jcats:
values = from_array_interface(f_jcats)
pa_values = pa.Array.from_pandas(values)
results[fnames[fidx]] = pa_values
continue

joffsets = f_jcats["offsets"]
jvalues = f_jcats["values"]
offsets = from_array_interface(joffsets, True)
values = from_array_interface(jvalues, True)
pa_offsets = pa.array(offsets).buffers()
pa_values = pa.array(values).buffers()
assert (
pa_offsets[0] is None and pa_values[0] is None
), "Should not have null mask."
pa_dict = pa.StringArray.from_buffers(
len(offsets) - 1, pa_offsets[1], pa_values[1]
)
results[fnames[fidx]] = pa_dict

return results
return _get_categories(
lambda ret: _LIB.XGBDMatrixGetCategories(self.handle, ctypes.byref(ret)),
self.feature_names,
self.num_col(),
)
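

A minimal usage sketch for the `DMatrix` accessor, assuming a pandas categorical input (the printed output is illustrative):

```python
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"c": pd.Categorical(["cdef", "abc", "abc"])})
Xy = xgb.DMatrix(df, enable_categorical=True)

cats = Xy.get_categories()
# One entry per feature; purely numeric features map to None.
assert cats is not None
print(cats["c"].to_pylist())  # ['abc', 'cdef'] -- pandas sorts the categories
```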

def num_row(self) -> int:
"""Get the number of rows in the DMatrix."""
@@ -2312,6 +2323,23 @@ def feature_names(self) -> Optional[FeatureNames]:
def feature_names(self, features: Optional[FeatureNames]) -> None:
self._set_feature_info(features, "feature_name")

def get_categories(self) -> Optional[Dict[str, "pa.DictionaryArray"]]:
"""Get the categories in the dataset using `pyarrow`. Returns `None` if there's
no categorical features.

.. warning::

This function is still a work in progress.

.. versionadded:: 3.1.0

"""
return _get_categories(
lambda ret: _LIB.XGBoosterGetCategories(self.handle, ctypes.byref(ret)),
self.feature_names,
self.num_features(),
)
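

And a round-trip sketch for the booster accessor, mirroring what the `comp_booster` test helper below verifies (hyperparameters are illustrative):

```python
import numpy as np
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({"c": pd.Categorical(["a", "b", "a", "b"])})
y = np.array([0.0, 1.0, 0.0, 1.0])
Xy = xgb.DMatrix(df, label=y, enable_categorical=True)

bst = xgb.train({"tree_method": "hist"}, Xy, num_boost_round=1)
# The booster carries the training-time categories alongside the model.
cats = bst.get_categories()
assert cats is not None and cats["c"].to_pylist() == ["a", "b"]
```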

def set_param(
self,
params: Union[Dict, Iterable[Tuple[str, Any]], str],
42 changes: 40 additions & 2 deletions python-package/xgboost/testing/ordinal.py
@@ -14,7 +14,12 @@
from ..core import DMatrix, ExtMemQuantileDMatrix, QuantileDMatrix
from ..data import _lazy_load_cudf_is_cat
from ..training import train
from .data import IteratorForTest, is_pd_cat_dtype, make_categorical
from .data import (
IteratorForTest,
is_pd_cat_dtype,
make_batches,
make_categorical,
)


def get_df_impl(device: str) -> Tuple[Type, Type]:
@@ -50,10 +55,24 @@ def assert_allclose(device: str, a: Any, b: Any) -> None:
cp.testing.assert_allclose(a, b)


def comp_booster(device: Literal["cpu", "cuda"], Xy: DMatrix, booster: str) -> None:
"""Compare the results from DMatrix and Booster."""
cats = Xy.get_categories()
assert cats is not None

rng = np.random.default_rng(2025)
Xy.set_label(rng.normal(size=Xy.num_row()))
bst = train({"booster": booster, "device": device}, Xy, 1)
cats_bst = bst.get_categories()
assert cats_bst is not None
for k, v in cats_bst.items():
assert v == cats[k]


def run_cat_container(device: Literal["cpu", "cuda"]) -> None:
"""Basic tests for the container class used by the DMatrix."""

def run_dispatch(device: str, DMatrixT: Type) -> None:
def run_dispatch(device: Literal["cpu", "cuda"], DMatrixT: Type) -> None:
Df, _ = get_df_impl(device)
# Basic test with a single feature
df = Df({"c": ["cdef", "abc"]}, dtype="category")
@@ -86,10 +105,16 @@ def run_dispatch(device: str, DMatrixT: Type) -> None:
assert_allclose(device, csr.indptr, np.array([0, 1, 1, 2, 3]))
assert_allclose(device, csr.indices, np.array([0, 0, 0]))

comp_booster(device, Xy, "gbtree")
comp_booster(device, Xy, "dart")

# Test with explicit null-terminated strings.
df = Df({"c": ["cdef", None, "abc", "abc\0"]}, dtype="category")
Xy = DMatrixT(df, enable_categorical=True)

comp_booster(device, Xy, "gbtree")
comp_booster(device, Xy, "dart")

for dm in (DMatrix, QuantileDMatrix):
run_dispatch(device, dm)

@@ -129,6 +154,7 @@ def check(Xy: DMatrix, X: pd.DataFrame) -> None:
assert cats[fname] is None

if not hasattr(Xy, "ref"): # not quantile DMatrix.
assert not isinstance(Xy, QuantileDMatrix)
with tempfile.TemporaryDirectory() as tmpdir:
fname = os.path.join(tmpdir, "DMatrix.binary")
Xy.save_binary(fname)
@@ -144,6 +170,9 @@ def check(Xy: DMatrix, X: pd.DataFrame) -> None:
else:
assert v_0.to_pylist() == v_1.to_pylist()

comp_booster(device, Xy, "gbtree")
comp_booster(device, Xy, "dart")

def run_dispatch(DMatrixT: Type) -> None:
# full str type
X, y = make_categorical(
Expand Down Expand Up @@ -216,6 +245,15 @@ def run_dispatch(DMatrixT: Type) -> None:
for dm in (DMatrix, QuantileDMatrix):
run_dispatch(dm)

batches = make_batches(
n_samples_per_batch=128, n_features=4, n_batches=1, use_cupy=device == "cuda"
)
X, y, w = map(lambda x: x[0], batches)
Xy = DMatrix(X, y, weight=w)
assert Xy.get_categories() is None
Xy = QuantileDMatrix(X, y, weight=w)
assert Xy.get_categories() is None


def run_cat_container_iter(device: Literal["cpu", "cuda"]) -> None:
"""Test the categories container for iterator-based inputs."""