|
def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
    """Build the XGBoost matrix that corresponds to the type of ``data``.

    Exactly one of three construction paths is taken:

    * ``data`` is a ``RayQuantileDMatrix`` and ``QUANTILE_AVAILABLE`` is
      set: an ``xgb.QuantileDMatrix`` is created from ``param``.
    * ``data`` is a ``RayDeviceQuantileDMatrix`` and ``LEGACY_MATRIX`` is
      not set: the shards are fed through a ``RayDataIter`` into an
      ``xgb.DeviceQuantileDMatrix``.
    * otherwise: a plain ``xgb.DMatrix`` is created.

    Args:
        data: Ray matrix wrapper. Its ``enable_categorical`` attribute (when
            not ``None``) is forwarded to the XGBoost constructor; the
            device-quantile path additionally forwards ``feature_names``,
            ``feature_types`` and ``missing``.
        param: Keyword arguments for the matrix constructor. Must contain
            ``"data"``; the device-quantile path also requires ``"label"``,
            ``"weight"``, ``"feature_weights"``, ``"qid"`` and
            ``"base_margin"``. NOTE: this dict is modified in place.

    Returns:
        The constructed matrix, after ``data.update_matrix_properties`` has
        been called on it.

    Raises:
        KeyError: If a required key is missing from ``param``.
    """
    if QUANTILE_AVAILABLE and isinstance(data, RayQuantileDMatrix):
        # Several shards arrive as a list; let the helper turn them into
        # constructor-ready parameters.
        if isinstance(param["data"], list):
            param.update(_prepare_dmatrix_params(param))

        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical

        matrix = xgb.QuantileDMatrix(**param)
    # ``elif`` (not a second ``if``): otherwise the ``else`` branch below
    # would also run for quantile matrices and replace the QuantileDMatrix
    # built above with a plain DMatrix.
    elif not LEGACY_MATRIX and isinstance(data, RayDeviceQuantileDMatrix):
        # If we only got a single data shard, wrap every per-shard field in
        # a list so the iterator can index into it. Each field is checked
        # on its own (``base_margin`` included).
        per_shard_keys = (
            "data",
            "label",
            "weight",
            "feature_weights",
            "qid",
            "base_margin",
        )
        for key in per_shard_keys:
            if not isinstance(param[key], list):
                param[key] = [param[key]]

        param["label_lower_bound"] = [None]
        param["label_upper_bound"] = [None]

        dm_param = {
            "feature_names": data.feature_names,
            "feature_types": data.feature_types,
            "missing": data.missing,
        }

        if data.enable_categorical is not None:
            dm_param["enable_categorical"] = data.enable_categorical

        # The iterator receives the shard lists plus the matrix-level
        # settings; the matrix constructor only gets the latter.
        param.update(dm_param)
        it = RayDataIter(**param)
        matrix = xgb.DeviceQuantileDMatrix(it, **dm_param)
    else:
        if isinstance(param["data"], list):
            param.update(_prepare_dmatrix_params(param))

        # These fields are not passed to the constructor; they are applied
        # through ``set_info`` after the matrix exists (non-legacy only).
        ll = param.pop("label_lower_bound", None)
        lu = param.pop("label_upper_bound", None)
        fw = param.pop("feature_weights", None)

        if LEGACY_MATRIX:
            param.pop("base_margin", None)

        # Drop ``qid`` when the installed ``xgb.DMatrix`` does not accept it.
        if "qid" not in inspect.signature(xgb.DMatrix).parameters:
            param.pop("qid", None)

        if data.enable_categorical is not None:
            param["enable_categorical"] = data.enable_categorical

        matrix = xgb.DMatrix(**param)

        if not LEGACY_MATRIX:
            matrix.set_info(
                label_lower_bound=ll, label_upper_bound=lu, feature_weights=fw
            )

    data.update_matrix_properties(matrix)
    return matrix
I want to load larger-than-memory data, i.e. data whose size exceeds the total memory of the entire cluster.
Specifically, I want to take advantage of the features described in "Using XGBoost External Memory Version" and "Experimental support for external memory" (both in the xgboost 2.1.0-dev documentation).
I found `RayDataIter`, but it seems to be used only when a legacy XGBoost version is detected (< 1.5.0, I think, i.e. without `DataIter`).
xgboost_ray/xgboost_ray/matrix.py
Lines 43 to 49 in 9081780
xgboost_ray/xgboost_ray/main.py
Lines 365 to 431 in 9081780
It might be better to construct the XGBoost DMatrix from a custom `DataIter` instead of concatenating all the data at once.
xgboost_ray/xgboost_ray/main.py
Line 423 in 9081780
xgboost_ray/xgboost_ray/main.py
Lines 351 to 362 in 9081780