Description
I want to load larger-than-memory data, i.e. data whose size exceeds the total memory of the entire cluster.
To be specific, I want to take advantage of XGBoost's external memory support, described in "Using XGBoost External Memory Version" and "Experimental support for external memory" in the xgboost 2.1.0-dev documentation.
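For reference, the workflow from those docs builds the DMatrix from a DataIter that streams one batch at a time and lets XGBoost cache pages on disk. A minimal sketch (the ShardFileIter class, the load_shard helper, the shard file names, and the "label" column are placeholders I made up, not part of xgboost_ray):

    import os

    import pandas as pd
    import xgboost as xgb
    from xgboost.core import DataIter  # same import xgboost_ray probes for


    def load_shard(path):
        # Placeholder loader; assumes each shard has a "label" column.
        df = pd.read_parquet(path)
        return df.drop(columns=["label"]), df["label"]


    class ShardFileIter(DataIter):
        """Streams one shard per call so XGBoost never holds all data in RAM."""

        def __init__(self, shard_files):
            self._shard_files = shard_files
            self._it = 0
            # cache_prefix makes XGBoost build the external memory version.
            super().__init__(cache_prefix=os.path.join(".", "cache"))

        def next(self, input_data) -> int:
            if self._it == len(self._shard_files):
                return 0  # signal end of iteration
            X, y = load_shard(self._shard_files[self._it])
            input_data(data=X, label=y)
            self._it += 1
            return 1  # more batches to come

        def reset(self) -> None:
            self._it = 0


    it = ShardFileIter(["shard_0.parquet", "shard_1.parquet"])
    dtrain = xgb.DMatrix(it)  # external memory DMatrix built from the iterator
    booster = xgb.train({"tree_method": "hist"}, dtrain)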
I found RayDataIter, but it seems it is only used when a non-legacy XGBoost is detected (>= 1.5.0 I think, i.e. one that provides DataIter), and even then only for RayDeviceQuantileDMatrix.
xgboost_ray/xgboost_ray/matrix.py
Lines 43 to 49 in 9081780
try:
    from xgboost.core import DataIter
    LEGACY_MATRIX = False
except ImportError:
    DataIter = object
    LEGACY_MATRIX = True
xgboost_ray/xgboost_ray/main.py
Lines 365 to 431 in 9081780
def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
    if QUANTILE_AVAILABLE and isinstance(data, RayQuantileDMatrix):
        if isinstance(param["data"], list):
            qdm_param = _prepare_dmatrix_params(param)
            param.update(qdm_param)
        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical
        matrix = xgb.QuantileDMatrix(**param)
    if not LEGACY_MATRIX and isinstance(data, RayDeviceQuantileDMatrix):
        # If we only got a single data shard, create a list so we can
        # iterate over it
        if not isinstance(param["data"], list):
            param["data"] = [param["data"]]
        if not isinstance(param["label"], list):
            param["label"] = [param["label"]]
        if not isinstance(param["weight"], list):
            param["weight"] = [param["weight"]]
        if not isinstance(param["feature_weights"], list):
            param["feature_weights"] = [param["feature_weights"]]
        if not isinstance(param["qid"], list):
            param["qid"] = [param["qid"]]
        if not isinstance(param["data"], list):
            param["base_margin"] = [param["base_margin"]]

        param["label_lower_bound"] = [None]
        param["label_upper_bound"] = [None]

        dm_param = {
            "feature_names": data.feature_names,
            "feature_types": data.feature_types,
            "missing": data.missing,
        }
        if data.enable_categorical is not None:
            dm_param["enable_categorical"] = data.enable_categorical
        param.update(dm_param)

        it = RayDataIter(**param)
        matrix = xgb.DeviceQuantileDMatrix(it, **dm_param)
    else:
        if isinstance(param["data"], list):
            dm_param = _prepare_dmatrix_params(param)
            param.update(dm_param)

        ll = param.pop("label_lower_bound", None)
        lu = param.pop("label_upper_bound", None)
        fw = param.pop("feature_weights", None)

        if LEGACY_MATRIX:
            param.pop("base_margin", None)
        if "qid" not in inspect.signature(xgb.DMatrix).parameters:
            param.pop("qid", None)
        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical

        matrix = xgb.DMatrix(**param)

        if not LEGACY_MATRIX:
            matrix.set_info(
                label_lower_bound=ll, label_upper_bound=lu, feature_weights=fw
            )

    data.update_matrix_properties(matrix)
    return matrix
Maybe it would be better if we could construct the XGBoost DMatrix with a customized DataIter instead of concatenating all the data at once (see the rough sketch at the end of this issue).
xgboost_ray/xgboost_ray/main.py
Line 423 in 9081780
matrix = xgb.DMatrix(**param)
xgboost_ray/xgboost_ray/main.py
Lines 351 to 362 in 9081780
def _prepare_dmatrix_params(param: Dict) -> Dict:
    dm_param = {
        "data": concat_dataframes(param["data"]),
        "label": concat_dataframes(param["label"]),
        "weight": concat_dataframes(param["weight"]),
        "feature_weights": concat_dataframes(param["feature_weights"]),
        "qid": concat_dataframes(param["qid"]),
        "base_margin": concat_dataframes(param["base_margin"]),
        "label_lower_bound": concat_dataframes(param["label_lower_bound"]),
        "label_upper_bound": concat_dataframes(param["label_upper_bound"]),
    }
    return dm_param
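To make the proposal concrete, here is a rough sketch of what an iterator-based path could look like in place of the concatenating branch. Everything here is hypothetical (the _RayShardIter class and _get_dmatrix_from_shards helper are not part of xgboost_ray), it only handles data and label for brevity, and it reuses the RayDataIter idea but feeds a regular xgb.DMatrix so XGBoost can spill to its external memory cache:

    from typing import Dict, List, Optional

    import pandas as pd
    import xgboost as xgb
    from xgboost.core import DataIter


    class _RayShardIter(DataIter):
        """Hypothetical iterator handing the already-partitioned shards to XGBoost."""

        def __init__(self, data: List[pd.DataFrame], label: List[pd.Series],
                     cache_prefix: Optional[str] = None):
            self._data = data
            self._label = label
            self._it = 0
            # With a cache_prefix set, xgb.DMatrix(it) builds the external
            # memory version instead of materializing everything in RAM.
            super().__init__(cache_prefix=cache_prefix)

        def next(self, input_data) -> int:
            if self._it == len(self._data):
                return 0  # end of iteration
            input_data(data=self._data[self._it], label=self._label[self._it])
            self._it += 1
            return 1

        def reset(self) -> None:
            self._it = 0


    def _get_dmatrix_from_shards(param: Dict) -> xgb.DMatrix:
        """Hypothetical replacement for the concatenating else-branch above."""
        data = param["data"] if isinstance(param["data"], list) else [param["data"]]
        label = param["label"] if isinstance(param["label"], list) else [param["label"]]
        it = _RayShardIter(data, label, cache_prefix="dmatrix-cache")
        return xgb.DMatrix(it)

Weights, qid, base_margin and the other fields could be passed through input_data in the same way. The upside would be that peak memory per actor stays around one shard plus the on-disk cache, rather than the whole concatenated partition.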