Skip to content

Does XGBoost_Ray make use of DataIter for the XGBoost External Memory feature? #304

@daviddwlee84

Description

@daviddwlee84

I want to load larger-than-memory data, i.e. data that exceeds the combined memory of the entire cluster.

To be specific, I want to take advantage of the features described in "Using XGBoost External Memory Version" (xgboost 2.1.0-dev documentation) and "Experimental support for external memory" (xgboost 2.1.0-dev documentation).

I found RayDataIter, but it seems to be used only when legacy XGBoost is detected (< 1.5.0, I think, i.e. without DataIter).

# Feature-detect xgboost's DataIter. When the import fails, fall back to a
# plain `object` placeholder so that code referencing DataIter still resolves,
# and record the fallback in LEGACY_MATRIX.
try:
    from xgboost.core import DataIter

    LEGACY_MATRIX = False
except ImportError:
    DataIter = object
    LEGACY_MATRIX = True

def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
    """Build the xgboost matrix object for ``data`` from the loaded ``param``.

    Args:
        data: The Ray-side matrix description. Its type selects which xgboost
            matrix class is constructed, and its ``feature_names``,
            ``feature_types``, ``missing`` and ``enable_categorical``
            attributes are forwarded where applicable.
        param: Keyword arguments for the matrix constructor. Entries may be
            single values or lists of shards. This dict is mutated in place
            (entries are updated, wrapped in lists, or popped).

    Returns:
        The constructed matrix, after ``data.update_matrix_properties`` has
        been applied to it.
    """
    if QUANTILE_AVAILABLE and isinstance(data, RayQuantileDMatrix):
        # Multiple shards are concatenated into single objects first.
        if isinstance(param["data"], list):
            qdm_param = _prepare_dmatrix_params(param)
            param.update(qdm_param)
        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical
        matrix = xgb.QuantileDMatrix(**param)

    # NOTE(review): this is a separate `if`, not an `elif`. Unless
    # RayQuantileDMatrix instances also satisfy the check below, they fall
    # through to the `else` branch, which rebuilds `matrix` as a plain
    # xgb.DMatrix - confirm whether that is intended.
    if not LEGACY_MATRIX and isinstance(data, RayDeviceQuantileDMatrix):
        # If we only got a single data shard, create a list so we can
        # iterate over it
        if not isinstance(param["data"], list):
            param["data"] = [param["data"]]
        if not isinstance(param["label"], list):
            param["label"] = [param["label"]]
        if not isinstance(param["weight"], list):
            param["weight"] = [param["weight"]]
        if not isinstance(param["feature_weights"], list):
            param["feature_weights"] = [param["feature_weights"]]
        if not isinstance(param["qid"], list):
            param["qid"] = [param["qid"]]
        # Check base_margin itself: param["data"] has already been wrapped in
        # a list above, so testing it here would never be true and a single
        # base_margin shard would be left unwrapped.
        if not isinstance(param["base_margin"], list):
            param["base_margin"] = [param["base_margin"]]

        param["label_lower_bound"] = [None]
        param["label_upper_bound"] = [None]

        dm_param = {
            "feature_names": data.feature_names,
            "feature_types": data.feature_types,
            "missing": data.missing,
        }
        if data.enable_categorical is not None:
            dm_param["enable_categorical"] = data.enable_categorical
        param.update(dm_param)
        it = RayDataIter(**param)
        matrix = xgb.DeviceQuantileDMatrix(it, **dm_param)
    else:
        # Multiple shards are concatenated into single objects first.
        if isinstance(param["data"], list):
            dm_param = _prepare_dmatrix_params(param)
            param.update(dm_param)

        # These three are removed from the constructor arguments and, on
        # non-legacy xgboost, applied afterwards through set_info().
        ll = param.pop("label_lower_bound", None)
        lu = param.pop("label_upper_bound", None)
        fw = param.pop("feature_weights", None)

        if LEGACY_MATRIX:
            param.pop("base_margin", None)

        # Only pass qid when the installed xgb.DMatrix accepts it.
        if "qid" not in inspect.signature(xgb.DMatrix).parameters:
            param.pop("qid", None)

        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical

        matrix = xgb.DMatrix(**param)

        if not LEGACY_MATRIX:
            matrix.set_info(
                label_lower_bound=ll, label_upper_bound=lu, feature_weights=fw
            )

    data.update_matrix_properties(matrix)
    return matrix

It might be better to construct the XGBoost DMatrix from a custom DataIter instead of concatenating all the data at once.

# Call site in question: when the shards arrive as a list, param["data"] has
# already been replaced by their concatenation before this constructor runs.
matrix = xgb.DMatrix(**param)

def _prepare_dmatrix_params(param: Dict) -> Dict:
    """Concatenate every sharded matrix argument in ``param``.

    Args:
        param: Mapping that must contain all of the keys listed below; each
            value is passed to ``concat_dataframes``.

    Returns:
        A new dict with the same eight keys, each holding the result of
        ``concat_dataframes`` for that key. ``param`` itself is not modified.

    Raises:
        KeyError: If any of the expected keys is missing from ``param``.
    """
    # Order matches the original literal so lookups (and any KeyError) happen
    # in the same sequence.
    shard_keys = (
        "data",
        "label",
        "weight",
        "feature_weights",
        "qid",
        "base_margin",
        "label_lower_bound",
        "label_upper_bound",
    )
    return {key: concat_dataframes(param[key]) for key in shard_keys}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions