From 92b52c4b3aba65cff3288607a09e0f27946629b5 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 2 Oct 2025 15:40:04 -0500 Subject: [PATCH 01/24] Update Lrnr_ranger.R --- R/Lrnr_ranger.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/Lrnr_ranger.R b/R/Lrnr_ranger.R index b7ba0d82..5550487c 100644 --- a/R/Lrnr_ranger.R +++ b/R/Lrnr_ranger.R @@ -118,7 +118,9 @@ Lrnr_ranger <- R6Class( if (task$has_node("weights")) { args$case.weights <- task$weights } - data_in <- cbind(task$Y, task$X) + Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) + data_in <- cbind(task$Y, Xdf) + # data_in <- cbind(task$Y, task$X) colnames(data_in)[1] <- task$nodes$outcome args$data <- data_in args$dependent.variable.name <- task$nodes$outcome @@ -127,10 +129,12 @@ Lrnr_ranger <- R6Class( return(fit_object) }, .predict = function(task) { + Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) + # extract numeric predictions from custom class ranger.prediction predictions <- stats::predict( private$.fit_object, - data = task$X, + data = Xdf, type = "response", num.threads = self$params$num.threads ) From 98cc84f5eea193d6f12e83928d35534cd827e4f1 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 2 Oct 2025 15:43:53 -0500 Subject: [PATCH 02/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 87beacc2..d04a5731 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -95,8 +95,11 @@ Lrnr_xgboost <- R6Class( Y <- as.numeric(Y) - 1 } + # Preserve raw data frame + Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) + # set up predictor data - Xmat <- as.matrix(task$X) + Xmat <- as.matrix(Xdf) if (is.integer(Xmat)) { Xmat[, 1] <- as.numeric(Xmat[, 1]) } @@ -151,8 +154,11 @@ Lrnr_xgboost <- R6Class( .predict = function(task = NULL) { fit_object <- private$.fit_object + # Preserve raw data frame + Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) + # set up test data for prediction - Xmat <- as.matrix(task$X) + Xmat <- as.matrix(Xdf) if (is.integer(Xmat)) { Xmat[, 1] <- as.numeric(Xmat[, 1]) } From 99e443013ecd3f577a36f4d0557860957967d2c7 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 2 Oct 2025 15:44:49 -0500 Subject: [PATCH 03/24] Update Lrnr_ranger.R --- R/Lrnr_ranger.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/Lrnr_ranger.R b/R/Lrnr_ranger.R index 5550487c..9ed0399c 100644 --- a/R/Lrnr_ranger.R +++ b/R/Lrnr_ranger.R @@ -118,9 +118,9 @@ Lrnr_ranger <- R6Class( if (task$has_node("weights")) { args$case.weights <- task$weights } + # Preserve raw data frame Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) data_in <- cbind(task$Y, Xdf) - # data_in <- cbind(task$Y, task$X) colnames(data_in)[1] <- task$nodes$outcome args$data <- data_in args$dependent.variable.name <- task$nodes$outcome @@ -129,6 +129,8 @@ Lrnr_ranger <- R6Class( return(fit_object) }, .predict = function(task) { + + # Preserve raw data frame Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) # extract numeric predictions from custom class ranger.prediction From 5380db47420288970b94696815b728254b1f3dfc Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 2 Oct 2025 16:22:34 -0500 Subject: [PATCH 04/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index d04a5731..1b57e751 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -106,7 +106,7 @@ Lrnr_xgboost <- R6Class( if (nrow(Xmat) != nrow(task$X) & ncol(Xmat) == nrow(task$X)) { Xmat <- t(Xmat) } - args$data <- try(xgboost::xgb.DMatrix(Xmat, label = Y), silent = TRUE) + args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE) # specify weights if (task$has_node("weights")) { @@ -169,7 +169,7 @@ Lrnr_xgboost <- R6Class( } stopifnot(nrow(Xmat_ord) == nrow(Xmat)) # convert to xgb.DMatrix - xgb_data <- try(xgboost::xgb.DMatrix(Xmat_ord), silent = TRUE) + xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE) # incorporate offset, if it wasspecified in training if (self$fit_object$training_offset) { From 9959dfa975c517e6bcb4006b3f1c399b96d1cbfb Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 2 Oct 2025 16:48:08 -0500 Subject: [PATCH 05/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 1b57e751..69a1a52d 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -98,16 +98,10 @@ Lrnr_xgboost <- R6Class( # Preserve raw data frame Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - # set up predictor data - Xmat <- as.matrix(Xdf) - if (is.integer(Xmat)) { - Xmat[, 1] <- as.numeric(Xmat[, 1]) - } - if (nrow(Xmat) != nrow(task$X) & ncol(Xmat) == nrow(task$X)) { - Xmat <- t(Xmat) - } args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE) + factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) + # specify weights if (task$has_node("weights")) { try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE) @@ -148,6 +142,7 @@ Lrnr_xgboost <- R6Class( ) fit_object$training_offset <- task$has_node("offset") fit_object$link_fun <- link_fun + fit_object$sl3_factor_levels <- factor_levels return(fit_object) }, @@ -157,17 +152,12 @@ Lrnr_xgboost <- R6Class( # Preserve raw data frame Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - # set up test data for prediction - Xmat <- as.matrix(Xdf) - if (is.integer(Xmat)) { - Xmat[, 1] <- as.numeric(Xmat[, 1]) + # relevel factors to training levels (keep order identical to train) + for (nm in names(Xdf)) { + tr_lvls <- fit_object$sl3_factor_levels[[nm]] + Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) } - # order of columns has to be the same in xgboost training and test data - Xmat_ord <- as.matrix(Xmat[, match(fit_object$feature_names, colnames(Xmat))]) - if ((nrow(Xmat_ord) != nrow(Xmat)) & (ncol(Xmat_ord) == nrow(Xmat))) { - Xmat_ord <- t(Xmat_ord) - } - stopifnot(nrow(Xmat_ord) == nrow(Xmat)) + # convert to xgb.DMatrix xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE) From 39dac1547e02d9ecb556738e151fe1d41e4d8aba Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 2 Oct 2025 18:24:39 -0500 Subject: [PATCH 06/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 69a1a52d..0ca35d39 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -98,7 +98,7 @@ Lrnr_xgboost <- R6Class( # Preserve raw data frame Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE) + args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y)) # , silent = TRUE factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) @@ -116,7 +116,7 @@ Lrnr_xgboost <- R6Class( family <- outcome_type$glm_family(return_object = TRUE) link_fun <- args$family$linkfun offset <- task$offset_transformed(link_fun) - try(xgboost::setinfo(args$data, "base_margin", offset), silent = TRUE) + try(xgboost::setinfo(args$data, "base_margin", offset)) #, silent = TRUE } else { link_fun <- NULL } From eb69493826da682b41735bb05236c46decf85ce2 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 2 Oct 2025 18:41:35 -0500 Subject: [PATCH 07/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 206 ++++++++++++++++++++++++++++------------------- 1 file changed, 124 insertions(+), 82 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 0ca35d39..4daa1a26 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -53,9 +53,8 @@ #' #' # get feature importance from fitted model #' xgb_varimp <- xgb_fit$importance() -Lrnr_xgboost <- R6Class( +Lrnr_xgboost <- R6::R6Class( classname = "Lrnr_xgboost", inherit = Lrnr_base, - portable = TRUE, class = TRUE, public = list( initialize = function(nrounds = 20, nthread = 1, ...) { params <- args_to_list() @@ -63,136 +62,179 @@ Lrnr_xgboost <- R6Class( }, importance = function(...) { self$assert_trained() - - # initiate argument list for xgboost::xgb.importance args <- list(...) args$model <- self$fit_object - - # calculate importance metrics, already sorted by decreasing importance - importance_result <- call_with_args(xgboost::xgb.importance, args) - rownames(importance_result) <- importance_result[["Feature"]] - return(importance_result) + imp <- call_with_args(xgboost::xgb.importance, args) + rownames(imp) <- imp[["Feature"]] + imp } ), private = list( - .properties = c( - "continuous", "binomial", "categorical", "weights", - "offset", "importance" - ), - .train = function(task) { - args <- self$params + .properties = c("continuous", "binomial", "categorical", "weights", "offset", "importance"), + + .audit_df = function(df, head_levels = 4L) { + cls <- vapply(df, function(z) paste(class(z), collapse = ","), character(1)) + is_char <- vapply(df, is.character, logical(1)) + is_fac <- vapply(df, is.factor, logical(1)) + lvl_n <- vapply(df, function(z) if (is.factor(z)) length(levels(z)) else NA_integer_, integer(1)) + lvl_ex <- vapply(df, function(z) { + if (is.factor(z)) paste(utils::head(levels(z), head_levels), collapse = "|") else "" + }, character(1)) + data.frame( + column = names(df), + class = cls, + is_character = is_char, + is_factor = is_fac, + n_levels = lvl_n, + ex_levels = lvl_ex, + row.names = NULL, + check.names = FALSE + ) + }, - verbose <- args$verbose - if (is.null(verbose)) { - verbose <- getOption("sl3.verbose") + .stop_with_context = function(msg, Xdf, err = NULL) { + audit <- private$.audit_df(Xdf) + bad_chars <- audit$column[audit$is_character] + cat("=== XGBoost DMatrix construction failed ===\n") + cat("Reason:", msg, "\n") + if (!is.null(err)) cat("xgboost says:", conditionMessage(err), "\n") + cat("Data summary: ", nrow(Xdf), " rows x ", ncol(Xdf), " cols\n", sep = "") + if (length(bad_chars)) { + cat("Character columns (must convert to factor or numeric):\n - ", + paste(bad_chars, collapse = ", "), "\n", sep = "") + } + fac_rows <- audit[audit$is_factor, ] + if (nrow(fac_rows)) { + cat("Factor columns and levels (first few):\n") + utils::capture.output(print(fac_rows[, c("column","n_levels","ex_levels")], row.names = FALSE)) |> + paste(collapse = "\n") |> cat("\n") } + stop(msg, call. = FALSE) + }, + + .train = function(task) { + args <- self$params + verbose <- if (is.null(args$verbose)) getOption("sl3.verbose") else args$verbose args$verbose <- as.integer(verbose) - # set up outcome + # outcome outcome_type <- self$get_outcome_type(task) Y <- outcome_type$format(task$Y) - if (outcome_type$type == "categorical") { - Y <- as.numeric(Y) - 1 - } + if (outcome_type$type == "categorical") Y <- as.numeric(Y) - 1L - # Preserve raw data frame + # covariates as raw data.frame (no factor expansion) Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - - args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y)) # , silent = TRUE + stopifnot(is.data.frame(Xdf)) - factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) + # convert any characters to factor (xgboost supports factor directly via xgb.DMatrix) + if (any(vapply(Xdf, is.character, logical(1)))) { + Xdf[] <- lapply(Xdf, function(z) if (is.character(z)) factor(z) else z) + } - # specify weights + # Build DMatrix with hard check + diagnostics + dm <- try(xgboost::xgb.DMatrix(data = Xdf, label = Y, + feature_names = colnames(Xdf)), silent = TRUE) + if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) { + private$.stop_with_context("xgb.DMatrix() did not return an xgb.DMatrix", Xdf, err = attr(dm, "condition")) + } + + # weights if (task$has_node("weights")) { - try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE) + try(xgboost::setinfo(dm, "weight", task$weights), silent = TRUE) } - # specify offset + # offset / base_margin if (task$has_node("offset")) { if (outcome_type$type == "categorical") { - # TODO: fix stop("offsets not yet supported for outcome_type='categorical'") } - family <- outcome_type$glm_family(return_object = TRUE) + family <- outcome_type$glm_family(return_object = TRUE) link_fun <- args$family$linkfun - offset <- task$offset_transformed(link_fun) - try(xgboost::setinfo(args$data, "base_margin", offset)) #, silent = TRUE + offset <- task$offset_transformed(link_fun) + try(xgboost::setinfo(dm, "base_margin", offset), silent = TRUE) } else { link_fun <- NULL } - # specify objective if it's NULL to avoid xgb warnings + # reasonable default objective if (is.null(args$objective)) { if (outcome_type$type == "binomial") { - args$objective <- "binary:logistic" - args$eval_metric <- "logloss" + args$objective <- "binary:logistic"; args$eval_metric <- "logloss" } else if (outcome_type$type == "quasibinomial") { args$objective <- "reg:logistic" } else if (outcome_type$type == "categorical") { - args$objective <- "multi:softprob" - args$eval_metric <- "mlogloss" + args$objective <- "multi:softprob"; args$eval_metric <- "mlogloss" args$num_class <- as.integer(length(outcome_type$levels)) + } else { + # continuous + if (is.null(args$eval_metric)) args$eval_metric <- "rmse" } } - args$watchlist <- list(train = args$data) - fit_object <- call_with_args(xgboost::xgb.train, args, - keep_all = TRUE, - ignore = "formula" - ) - fit_object$training_offset <- task$has_node("offset") - fit_object$link_fun <- link_fun - fit_object$sl3_factor_levels <- factor_levels + args$data <- dm + args$watchlist <- list(train = dm) - return(fit_object) + fit_object <- call_with_args(xgboost::xgb.train, args, keep_all = TRUE, ignore = "formula") + fit_object$training_offset <- task$has_node("offset") + fit_object$link_fun <- link_fun + fit_object$sl3_feature_names <- colnames(Xdf) + fit_object$sl3_factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) + fit_object }, + .predict = function(task = NULL) { - fit_object <- private$.fit_object + fit <- private$.fit_object - # Preserve raw data frame + # raw df Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - - # relevel factors to training levels (keep order identical to train) + stopifnot(is.data.frame(Xdf)) + + # relevel factors to training levels; also coerce stray characters + tr_lvls_list <- fit$sl3_factor_levels for (nm in names(Xdf)) { - tr_lvls <- fit_object$sl3_factor_levels[[nm]] - Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) + if (is.character(Xdf[[nm]])) Xdf[[nm]] <- factor(Xdf[[nm]]) + tr_lvls <- tr_lvls_list[[nm]] + if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) { + Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) + } } - - # convert to xgb.DMatrix - xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE) - - # incorporate offset, if it wasspecified in training - if (self$fit_object$training_offset) { - offset <- task$offset_transformed( - self$fit_object$link_fun, - for_prediction = TRUE - ) - try(xgboost::setinfo(xgb_data, "base_margin", offset), silent = TRUE) + + # reorder columns to training order (critical) + tr_names <- fit$sl3_feature_names + ord <- match(tr_names, colnames(Xdf)) + if (anyNA(ord)) { + missing_cols <- tr_names[is.na(ord)] + msg <- paste0("Prediction data is missing ", length(missing_cols), + " training column(s): ", paste(missing_cols, collapse = ", ")) + private$.stop_with_context(msg, Xdf, err = NULL) } + Xdf <- Xdf[, ord, drop = FALSE] - # incorporate ntreelimit, if training model was not a gblinear-based fit - ntreelimit <- 0 - if (!is.null(fit_object[["best_ntreelimit"]]) & - !("gblinear" %in% fit_object[["params"]][["booster"]])) { - ntreelimit <- fit_object[["best_ntreelimit"]] + # DMatrix (with check) + dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE) + if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) { + private$.stop_with_context("xgb.DMatrix() failed during predict()", Xdf, err = attr(dm, "condition")) } - predictions <- rep.int(list(numeric()), 1) - if (nrow(Xmat) > 0) { - # will generally return vector, needs to be put into data.table column - predictions <- stats::predict( - fit_object, - newdata = xgb_data, ntreelimit = ntreelimit, reshape = TRUE - ) - - if (private$.training_outcome_type$type == "categorical") { - # pack predictions in a single column - predictions <- pack_predictions(predictions) - } + # optional offset + if (fit$training_offset) { + offset <- task$offset_transformed(fit$link_fun, for_prediction = TRUE) + try(xgboost::setinfo(dm, "base_margin", offset), silent = TRUE) + } + + # best_ntreelimit if present + ntreelimit <- 0 + if (!is.null(fit[["best_ntreelimit"]]) && !("gblinear" %in% fit[["params"]][["booster"]])) { + ntreelimit <- fit[["best_ntreelimit"]] } - return(predictions) + if (nrow(Xdf) == 0) return(numeric(0)) + + preds <- stats::predict(fit, newdata = dm, ntreelimit = ntreelimit, reshape = TRUE) + if (private$.training_outcome_type$type == "categorical") preds <- pack_predictions(preds) + preds }, + .required_packages = c("xgboost") ) ) From e953d7ad1baa372104e40f3e0f6b037a13c65662 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 2 Oct 2025 21:46:37 -0500 Subject: [PATCH 08/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 4daa1a26..3c9b8026 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -132,8 +132,7 @@ Lrnr_xgboost <- R6::R6Class( } # Build DMatrix with hard check + diagnostics - dm <- try(xgboost::xgb.DMatrix(data = Xdf, label = Y, - feature_names = colnames(Xdf)), silent = TRUE) + dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE) if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) { private$.stop_with_context("xgb.DMatrix() did not return an xgb.DMatrix", Xdf, err = attr(dm, "condition")) } From 454d2dff99b05b3d72d2c0bf8586e06115d43d9e Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Fri, 3 Oct 2025 00:44:34 -0500 Subject: [PATCH 09/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 3c9b8026..881e9190 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -131,6 +131,8 @@ Lrnr_xgboost <- R6::R6Class( Xdf[] <- lapply(Xdf, function(z) if (is.character(z)) factor(z) else z) } + print(is.data.frame(Xdf)) + # Build DMatrix with hard check + diagnostics dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE) if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) { From 71769fb072af7888e6ddc7357a469637a1031e86 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Fri, 3 Oct 2025 11:23:16 -0500 Subject: [PATCH 10/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 207 +++++++++++++++++++---------------------------- 1 file changed, 82 insertions(+), 125 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 881e9190..69a1a52d 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -53,8 +53,9 @@ #' #' # get feature importance from fitted model #' xgb_varimp <- xgb_fit$importance() -Lrnr_xgboost <- R6::R6Class( +Lrnr_xgboost <- R6Class( classname = "Lrnr_xgboost", inherit = Lrnr_base, + portable = TRUE, class = TRUE, public = list( initialize = function(nrounds = 20, nthread = 1, ...) { params <- args_to_list() @@ -62,180 +63,136 @@ Lrnr_xgboost <- R6::R6Class( }, importance = function(...) { self$assert_trained() + + # initiate argument list for xgboost::xgb.importance args <- list(...) args$model <- self$fit_object - imp <- call_with_args(xgboost::xgb.importance, args) - rownames(imp) <- imp[["Feature"]] - imp + + # calculate importance metrics, already sorted by decreasing importance + importance_result <- call_with_args(xgboost::xgb.importance, args) + rownames(importance_result) <- importance_result[["Feature"]] + return(importance_result) } ), private = list( - .properties = c("continuous", "binomial", "categorical", "weights", "offset", "importance"), - - .audit_df = function(df, head_levels = 4L) { - cls <- vapply(df, function(z) paste(class(z), collapse = ","), character(1)) - is_char <- vapply(df, is.character, logical(1)) - is_fac <- vapply(df, is.factor, logical(1)) - lvl_n <- vapply(df, function(z) if (is.factor(z)) length(levels(z)) else NA_integer_, integer(1)) - lvl_ex <- vapply(df, function(z) { - if (is.factor(z)) paste(utils::head(levels(z), head_levels), collapse = "|") else "" - }, character(1)) - data.frame( - column = names(df), - class = cls, - is_character = is_char, - is_factor = is_fac, - n_levels = lvl_n, - ex_levels = lvl_ex, - row.names = NULL, - check.names = FALSE - ) - }, - - .stop_with_context = function(msg, Xdf, err = NULL) { - audit <- private$.audit_df(Xdf) - bad_chars <- audit$column[audit$is_character] - cat("=== XGBoost DMatrix construction failed ===\n") - cat("Reason:", msg, "\n") - if (!is.null(err)) cat("xgboost says:", conditionMessage(err), "\n") - cat("Data summary: ", nrow(Xdf), " rows x ", ncol(Xdf), " cols\n", sep = "") - if (length(bad_chars)) { - cat("Character columns (must convert to factor or numeric):\n - ", - paste(bad_chars, collapse = ", "), "\n", sep = "") - } - fac_rows <- audit[audit$is_factor, ] - if (nrow(fac_rows)) { - cat("Factor columns and levels (first few):\n") - utils::capture.output(print(fac_rows[, c("column","n_levels","ex_levels")], row.names = FALSE)) |> - paste(collapse = "\n") |> cat("\n") - } - stop(msg, call. = FALSE) - }, - + .properties = c( + "continuous", "binomial", "categorical", "weights", + "offset", "importance" + ), .train = function(task) { args <- self$params - verbose <- if (is.null(args$verbose)) getOption("sl3.verbose") else args$verbose + + verbose <- args$verbose + if (is.null(verbose)) { + verbose <- getOption("sl3.verbose") + } args$verbose <- as.integer(verbose) - # outcome + # set up outcome outcome_type <- self$get_outcome_type(task) Y <- outcome_type$format(task$Y) - if (outcome_type$type == "categorical") Y <- as.numeric(Y) - 1L - - # covariates as raw data.frame (no factor expansion) - Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - stopifnot(is.data.frame(Xdf)) - - # convert any characters to factor (xgboost supports factor directly via xgb.DMatrix) - if (any(vapply(Xdf, is.character, logical(1)))) { - Xdf[] <- lapply(Xdf, function(z) if (is.character(z)) factor(z) else z) + if (outcome_type$type == "categorical") { + Y <- as.numeric(Y) - 1 } - print(is.data.frame(Xdf)) + # Preserve raw data frame + Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) + + args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE) - # Build DMatrix with hard check + diagnostics - dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE) - if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) { - private$.stop_with_context("xgb.DMatrix() did not return an xgb.DMatrix", Xdf, err = attr(dm, "condition")) - } + factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) - # weights + # specify weights if (task$has_node("weights")) { - try(xgboost::setinfo(dm, "weight", task$weights), silent = TRUE) + try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE) } - # offset / base_margin + # specify offset if (task$has_node("offset")) { if (outcome_type$type == "categorical") { + # TODO: fix stop("offsets not yet supported for outcome_type='categorical'") } - family <- outcome_type$glm_family(return_object = TRUE) + family <- outcome_type$glm_family(return_object = TRUE) link_fun <- args$family$linkfun - offset <- task$offset_transformed(link_fun) - try(xgboost::setinfo(dm, "base_margin", offset), silent = TRUE) + offset <- task$offset_transformed(link_fun) + try(xgboost::setinfo(args$data, "base_margin", offset), silent = TRUE) } else { link_fun <- NULL } - # reasonable default objective + # specify objective if it's NULL to avoid xgb warnings if (is.null(args$objective)) { if (outcome_type$type == "binomial") { - args$objective <- "binary:logistic"; args$eval_metric <- "logloss" + args$objective <- "binary:logistic" + args$eval_metric <- "logloss" } else if (outcome_type$type == "quasibinomial") { args$objective <- "reg:logistic" } else if (outcome_type$type == "categorical") { - args$objective <- "multi:softprob"; args$eval_metric <- "mlogloss" + args$objective <- "multi:softprob" + args$eval_metric <- "mlogloss" args$num_class <- as.integer(length(outcome_type$levels)) - } else { - # continuous - if (is.null(args$eval_metric)) args$eval_metric <- "rmse" } } - args$data <- dm - args$watchlist <- list(train = dm) + args$watchlist <- list(train = args$data) + fit_object <- call_with_args(xgboost::xgb.train, args, + keep_all = TRUE, + ignore = "formula" + ) + fit_object$training_offset <- task$has_node("offset") + fit_object$link_fun <- link_fun + fit_object$sl3_factor_levels <- factor_levels - fit_object <- call_with_args(xgboost::xgb.train, args, keep_all = TRUE, ignore = "formula") - fit_object$training_offset <- task$has_node("offset") - fit_object$link_fun <- link_fun - fit_object$sl3_feature_names <- colnames(Xdf) - fit_object$sl3_factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) - fit_object + return(fit_object) }, - .predict = function(task = NULL) { - fit <- private$.fit_object + fit_object <- private$.fit_object - # raw df + # Preserve raw data frame Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - stopifnot(is.data.frame(Xdf)) - - # relevel factors to training levels; also coerce stray characters - tr_lvls_list <- fit$sl3_factor_levels + + # relevel factors to training levels (keep order identical to train) for (nm in names(Xdf)) { - if (is.character(Xdf[[nm]])) Xdf[[nm]] <- factor(Xdf[[nm]]) - tr_lvls <- tr_lvls_list[[nm]] - if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) { - Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) - } - } - - # reorder columns to training order (critical) - tr_names <- fit$sl3_feature_names - ord <- match(tr_names, colnames(Xdf)) - if (anyNA(ord)) { - missing_cols <- tr_names[is.na(ord)] - msg <- paste0("Prediction data is missing ", length(missing_cols), - " training column(s): ", paste(missing_cols, collapse = ", ")) - private$.stop_with_context(msg, Xdf, err = NULL) - } - Xdf <- Xdf[, ord, drop = FALSE] - - # DMatrix (with check) - dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE) - if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) { - private$.stop_with_context("xgb.DMatrix() failed during predict()", Xdf, err = attr(dm, "condition")) + tr_lvls <- fit_object$sl3_factor_levels[[nm]] + Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) } - - # optional offset - if (fit$training_offset) { - offset <- task$offset_transformed(fit$link_fun, for_prediction = TRUE) - try(xgboost::setinfo(dm, "base_margin", offset), silent = TRUE) + + # convert to xgb.DMatrix + xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE) + + # incorporate offset, if it wasspecified in training + if (self$fit_object$training_offset) { + offset <- task$offset_transformed( + self$fit_object$link_fun, + for_prediction = TRUE + ) + try(xgboost::setinfo(xgb_data, "base_margin", offset), silent = TRUE) } - # best_ntreelimit if present + # incorporate ntreelimit, if training model was not a gblinear-based fit ntreelimit <- 0 - if (!is.null(fit[["best_ntreelimit"]]) && !("gblinear" %in% fit[["params"]][["booster"]])) { - ntreelimit <- fit[["best_ntreelimit"]] + if (!is.null(fit_object[["best_ntreelimit"]]) & + !("gblinear" %in% fit_object[["params"]][["booster"]])) { + ntreelimit <- fit_object[["best_ntreelimit"]] } - if (nrow(Xdf) == 0) return(numeric(0)) + predictions <- rep.int(list(numeric()), 1) + if (nrow(Xmat) > 0) { + # will generally return vector, needs to be put into data.table column + predictions <- stats::predict( + fit_object, + newdata = xgb_data, ntreelimit = ntreelimit, reshape = TRUE + ) + + if (private$.training_outcome_type$type == "categorical") { + # pack predictions in a single column + predictions <- pack_predictions(predictions) + } + } - preds <- stats::predict(fit, newdata = dm, ntreelimit = ntreelimit, reshape = TRUE) - if (private$.training_outcome_type$type == "categorical") preds <- pack_predictions(preds) - preds + return(predictions) }, - .required_packages = c("xgboost") ) ) From 56c72a9bc80ee300f0f5b5f162f22246b6b2d53a Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:36:30 -0500 Subject: [PATCH 11/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 190 +++++++++++++++++++++++++++-------------------- 1 file changed, 111 insertions(+), 79 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 69a1a52d..2ae812a6 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -80,118 +80,150 @@ Lrnr_xgboost <- R6Class( "offset", "importance" ), .train = function(task) { + # Safe helper for %||% + `%||%` <- function(a, b) if (!is.null(a)) a else b + args <- self$params - + + # verbosity verbose <- args$verbose - if (is.null(verbose)) { - verbose <- getOption("sl3.verbose") - } + if (is.null(verbose)) verbose <- getOption("sl3.verbose") args$verbose <- as.integer(verbose) - - # set up outcome + + # outcome outcome_type <- self$get_outcome_type(task) Y <- outcome_type$format(task$Y) - if (outcome_type$type == "categorical") { - Y <- as.numeric(Y) - 1 - } - - # Preserve raw data frame + if (outcome_type$type == "categorical") Y <- as.numeric(Y) - 1L + + # raw covariates, keep factors intact Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - - args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE) - - factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) - - # specify weights + + # (optional but recommended) explicit feature types + feat_types <- vapply(Xdf, function(z) { + if (is.factor(z)) "c" else if (is.integer(z)) "int" + else if (is.logical(z)) "i" else "float" + }, character(1)) + + # DMatrix + dtrain <- try(xgboost::xgb.DMatrix( + data = Xdf, label = Y, + feature_names = colnames(Xdf), + feature_types = feat_types + ), silent = TRUE) + + if (!inherits(dtrain, "xgb.DMatrix")) { + cls <- vapply(Xdf, function(z) paste(class(z), collapse=","), character(1)) + stop("xgb.DMatrix construction failed. Column classes: ", + paste(sprintf("%s:[%s]", names(cls), cls), collapse="; ")) + } + + # weights if (task$has_node("weights")) { - try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE) + xgboost::setinfo(dtrain, "weight", task$weights) } - - # specify offset + + # offset (base_margin) + link_fun <- NULL if (task$has_node("offset")) { if (outcome_type$type == "categorical") { - # TODO: fix stop("offsets not yet supported for outcome_type='categorical'") } family <- outcome_type$glm_family(return_object = TRUE) - link_fun <- args$family$linkfun + link_fun <- family$linkfun offset <- task$offset_transformed(link_fun) - try(xgboost::setinfo(args$data, "base_margin", offset), silent = TRUE) - } else { - link_fun <- NULL + xgboost::setinfo(dtrain, "base_margin", offset) } - - # specify objective if it's NULL to avoid xgb warnings - if (is.null(args$objective)) { - if (outcome_type$type == "binomial") { - args$objective <- "binary:logistic" - args$eval_metric <- "logloss" + + # ----- xgboost arguments: use params + evals ----- + nrounds <- if (!is.null(args$nrounds)) args$nrounds else 20L + params <- if (!is.null(args$params)) args$params else list() + + # set objective/metric if not provided + if (is.null(params$objective)) { + if (outcome_type$type %in% c("binomial")) { + params$objective <- "binary:logistic" + params$eval_metric <- params$eval_metric %||% "logloss" } else if (outcome_type$type == "quasibinomial") { - args$objective <- "reg:logistic" + params$objective <- "reg:logistic" } else if (outcome_type$type == "categorical") { - args$objective <- "multi:softprob" - args$eval_metric <- "mlogloss" - args$num_class <- as.integer(length(outcome_type$levels)) + params$objective <- "multi:softprob" + params$eval_metric <- params$eval_metric %||% "mlogloss" + params$num_class <- as.integer(length(outcome_type$levels)) + } else { + params$objective <- params$objective %||% "reg:squarederror" } } - - args$watchlist <- list(train = args$data) - fit_object <- call_with_args(xgboost::xgb.train, args, - keep_all = TRUE, - ignore = "formula" + + fit_booster <- xgboost::xgb.train( + data = dtrain, + nrounds = nrounds, + params = params, + evals = list(train = dtrain), + verbose = args$verbose ) - fit_object$training_offset <- task$has_node("offset") - fit_object$link_fun <- link_fun - fit_object$sl3_factor_levels <- factor_levels - + + # DO NOT mutate the booster; wrap it instead + factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) + fit_object <- list( + booster = fit_booster, + meta = list( + training_offset = task$has_node("offset"), + link_fun = link_fun, + sl3_factor_levels = factor_levels + ) + ) + class(fit_object) <- c("sl3_xgb_fit", "list") + return(fit_object) }, .predict = function(task = NULL) { fit_object <- private$.fit_object - - # Preserve raw data frame + booster <- fit_object$booster + meta <- fit_object$meta + + # raw covariates; relevel to training levels Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - - # relevel factors to training levels (keep order identical to train) + for (nm in names(Xdf)) { - tr_lvls <- fit_object$sl3_factor_levels[[nm]] - Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) + tr_lvls <- meta$sl3_factor_levels[[nm]] + if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) { + # (optional) track NAs due to unseen levels + before_na <- sum(is.na(Xdf[[nm]])) + Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) + after_na <- sum(is.na(Xdf[[nm]])) + if (after_na > before_na) { + message("xgboost predict: introduced ", after_na - before_na, + " NA(s) in '", nm, "' due to unseen levels") + } + } } - - # convert to xgb.DMatrix + xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE) - - # incorporate offset, if it wasspecified in training - if (self$fit_object$training_offset) { - offset <- task$offset_transformed( - self$fit_object$link_fun, - for_prediction = TRUE - ) - try(xgboost::setinfo(xgb_data, "base_margin", offset), silent = TRUE) + if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.") + + # base_margin if used in training + if (isTRUE(meta$training_offset)) { + offset <- task$offset_transformed(meta$link_fun, for_prediction = TRUE) + xgboost::setinfo(xgb_data, "base_margin", offset) } - - # incorporate ntreelimit, if training model was not a gblinear-based fit - ntreelimit <- 0 - if (!is.null(fit_object[["best_ntreelimit"]]) & - !("gblinear" %in% fit_object[["params"]][["booster"]])) { - ntreelimit <- fit_object[["best_ntreelimit"]] + + # ntreelimit logic + ntreelimit <- 0L + if (!is.null(booster[["best_ntreelimit"]]) && + !("gblinear" %in% booster[["params"]][["booster"]])) { + ntreelimit <- booster[["best_ntreelimit"]] } - - predictions <- rep.int(list(numeric()), 1) - if (nrow(Xmat) > 0) { - # will generally return vector, needs to be put into data.table column - predictions <- stats::predict( - fit_object, - newdata = xgb_data, ntreelimit = ntreelimit, reshape = TRUE - ) - + + preds <- numeric(0) + if (nrow(Xdf) > 0) { + preds <- stats::predict(booster, newdata = xgb_data, + ntreelimit = ntreelimit, reshape = TRUE) + if (private$.training_outcome_type$type == "categorical") { - # pack predictions in a single column - predictions <- pack_predictions(predictions) + preds <- pack_predictions(preds) } } - - return(predictions) + return(preds) }, .required_packages = c("xgboost") ) From f5935b9f336ed18588fc4a7391ab3d0e40c72e5c Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Tue, 7 Oct 2025 11:26:42 -0500 Subject: [PATCH 12/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 2ae812a6..628687ad 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -187,10 +187,9 @@ Lrnr_xgboost <- R6Class( for (nm in names(Xdf)) { tr_lvls <- meta$sl3_factor_levels[[nm]] if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) { - # (optional) track NAs due to unseen levels before_na <- sum(is.na(Xdf[[nm]])) Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) - after_na <- sum(is.na(Xdf[[nm]])) + after_na <- sum(is.na(Xdf[[nm]])) if (after_na > before_na) { message("xgboost predict: introduced ", after_na - before_na, " NA(s) in '", nm, "' due to unseen levels") @@ -198,6 +197,9 @@ Lrnr_xgboost <- R6Class( } } + # empty guard + if (nrow(Xdf) == 0L) return(numeric(0)) + xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE) if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.") @@ -207,24 +209,24 @@ Lrnr_xgboost <- R6Class( xgboost::setinfo(xgb_data, "base_margin", offset) } - # ntreelimit logic - ntreelimit <- 0L - if (!is.null(booster[["best_ntreelimit"]]) && - !("gblinear" %in% booster[["params"]][["booster"]])) { - ntreelimit <- booster[["best_ntreelimit"]] + # best iteration -> iterationrange (future-proof, no warnings) + best_iter <- try(xgboost::xgb.attr(booster, "best_iteration"), silent = TRUE) + if (!inherits(best_iter, "try-error") && !is.null(best_iter)) { + ir <- as.integer(c(0L, as.integer(best_iter) + 1L)) + preds <- stats::predict(booster, newdata = xgb_data, iterationrange = ir) + } else { + message("Can't find best_iteration") + preds <- stats::predict(booster, newdata = xgb_data) } - preds <- numeric(0) - if (nrow(Xdf) > 0) { - preds <- stats::predict(booster, newdata = xgb_data, - ntreelimit = ntreelimit, reshape = TRUE) - - if (private$.training_outcome_type$type == "categorical") { - preds <- pack_predictions(preds) - } + # reshape for multiclass + if (private$.training_outcome_type$type == "categorical") { + k <- length(private$.training_outcome_type$levels) + preds <- matrix(preds, ncol = k, byrow = TRUE) } + return(preds) - }, + } .required_packages = c("xgboost") ) ) From 3df1c230e5b914152e9c4209ce080d36d20e5805 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Tue, 7 Oct 2025 11:38:45 -0500 Subject: [PATCH 13/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 628687ad..1d541705 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -226,7 +226,7 @@ Lrnr_xgboost <- R6Class( } return(preds) - } + }, .required_packages = c("xgboost") ) ) From 0dcf61319cca1fa9d321b1a07a034f15c82ebd1f Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Tue, 7 Oct 2025 12:50:10 -0500 Subject: [PATCH 14/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 1d541705..b7047aa4 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -187,9 +187,10 @@ Lrnr_xgboost <- R6Class( for (nm in names(Xdf)) { tr_lvls <- meta$sl3_factor_levels[[nm]] if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) { + # (optional) track NAs due to unseen levels before_na <- sum(is.na(Xdf[[nm]])) Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) - after_na <- sum(is.na(Xdf[[nm]])) + after_na <- sum(is.na(Xdf[[nm]])) if (after_na > before_na) { message("xgboost predict: introduced ", after_na - before_na, " NA(s) in '", nm, "' due to unseen levels") @@ -197,9 +198,6 @@ Lrnr_xgboost <- R6Class( } } - # empty guard - if (nrow(Xdf) == 0L) return(numeric(0)) - xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE) if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.") @@ -209,23 +207,25 @@ Lrnr_xgboost <- R6Class( xgboost::setinfo(xgb_data, "base_margin", offset) } - # best iteration -> iterationrange (future-proof, no warnings) - best_iter <- try(xgboost::xgb.attr(booster, "best_iteration"), silent = TRUE) - if (!inherits(best_iter, "try-error") && !is.null(best_iter)) { - ir <- as.integer(c(0L, as.integer(best_iter) + 1L)) - preds <- stats::predict(booster, newdata = xgb_data, iterationrange = ir) + # --- ntreelimit logic --- + ntreelimit <- 0L + booster <- fit_object$booster + + # try best_ntreelimit first (sometimes stored as character) + bn <- booster[["best_ntreelimit"]] + if (!is.null(bn)) { + ntreelimit <- as.integer(bn) } else { - message("Can't find best_iteration") - preds <- stats::predict(booster, newdata = xgb_data) - } - - # reshape for multiclass - if (private$.training_outcome_type$type == "categorical") { - k <- length(private$.training_outcome_type$levels) - preds <- matrix(preds, ncol = k, byrow = TRUE) + # fall back to best_iteration attribute if present + bi <- try(xgboost::xgb.attr(booster, "best_iteration"), silent = TRUE) + if (!inherits(bi, "try-error") && !is.null(bi)) { + ntreelimit <- as.integer(bi) # attribute is character; coerce + } } - - return(preds) + + # call predict (you can keep reshape=TRUE for now if you prefer) + preds <- stats::predict(booster, newdata = xgb_data, + ntreelimit = ntreelimit, reshape = TRUE) }, .required_packages = c("xgboost") ) From 97a1a7f8f6f816a010e136821f89cf8bb7096730 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Tue, 7 Oct 2025 14:03:03 -0500 Subject: [PATCH 15/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index b7047aa4..a5be039b 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -207,25 +207,17 @@ Lrnr_xgboost <- R6Class( xgboost::setinfo(xgb_data, "base_margin", offset) } - # --- ntreelimit logic --- - ntreelimit <- 0L - booster <- fit_object$booster - - # try best_ntreelimit first (sometimes stored as character) - bn <- booster[["best_ntreelimit"]] - if (!is.null(bn)) { - ntreelimit <- as.integer(bn) - } else { - # fall back to best_iteration attribute if present - bi <- try(xgboost::xgb.attr(booster, "best_iteration"), silent = TRUE) - if (!inherits(bi, "try-error") && !is.null(bi)) { - ntreelimit <- as.integer(bi) # attribute is character; coerce - } + # 4) Predict with NO limit (uses all trees; if model had ES and you use xgboost::xgboost + # class, its high-level predict would default to the best iteration. For xgb.Booster, + # just omit limits unless you really need a subset.) + preds <- stats::predict(booster, newdata = xgb_data) + + # 5) Reshape for multiclass + if (private$.training_outcome_type$type == "categorical") { + k <- length(private$.training_outcome_type$levels) + preds <- matrix(preds, ncol = k, byrow = TRUE) } - - # call predict (you can keep reshape=TRUE for now if you prefer) - preds <- stats::predict(booster, newdata = xgb_data, - ntreelimit = ntreelimit, reshape = TRUE) + return(preds) }, .required_packages = c("xgboost") ) From 29b3364fad8d3bd10a03b9c2a82e513473535c67 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:01:12 -0500 Subject: [PATCH 16/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index a5be039b..8fdd7116 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -207,15 +207,15 @@ Lrnr_xgboost <- R6Class( xgboost::setinfo(xgb_data, "base_margin", offset) } - # 4) Predict with NO limit (uses all trees; if model had ES and you use xgboost::xgboost - # class, its high-level predict would default to the best iteration. For xgb.Booster, - # just omit limits unless you really need a subset.) - preds <- stats::predict(booster, newdata = xgb_data) + predictions <- stats::predict(booster, newdata = xgb_data) - # 5) Reshape for multiclass if (private$.training_outcome_type$type == "categorical") { k <- length(private$.training_outcome_type$levels) - preds <- matrix(preds, ncol = k, byrow = TRUE) + predictions <- matrix(predictions, ncol = k, byrow = TRUE) + colnames(predictions) <- private$.training_outcome_type$levels + + # pack predictions in a single column + predictions <- pack_predictions(predictions) } return(preds) }, From e7b29fea8292dc102e5ee9a4af967e7787f21ec5 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:10:30 -0500 Subject: [PATCH 17/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 8fdd7116..2881df67 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -212,7 +212,7 @@ Lrnr_xgboost <- R6Class( if (private$.training_outcome_type$type == "categorical") { k <- length(private$.training_outcome_type$levels) predictions <- matrix(predictions, ncol = k, byrow = TRUE) - colnames(predictions) <- private$.training_outcome_type$levels + # colnames(predictions) <- private$.training_outcome_type$levels # pack predictions in a single column predictions <- pack_predictions(predictions) From e89db12ebd26f2b0a2382ed06421d9bd6ac71526 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:11:34 -0500 Subject: [PATCH 18/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 2881df67..6a47c4b0 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -210,14 +210,14 @@ Lrnr_xgboost <- R6Class( predictions <- stats::predict(booster, newdata = xgb_data) if (private$.training_outcome_type$type == "categorical") { - k <- length(private$.training_outcome_type$levels) - predictions <- matrix(predictions, ncol = k, byrow = TRUE) + # k <- length(private$.training_outcome_type$levels) + # predictions <- matrix(predictions, ncol = k, byrow = TRUE) # colnames(predictions) <- private$.training_outcome_type$levels # pack predictions in a single column predictions <- pack_predictions(predictions) } - return(preds) + return(predictions) }, .required_packages = c("xgboost") ) From 4deae51267008df3ae43e3fdfb197d4c815455ee Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Wed, 8 Oct 2025 12:55:01 -0500 Subject: [PATCH 19/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 6a47c4b0..2a1c2616 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -210,8 +210,8 @@ Lrnr_xgboost <- R6Class( predictions <- stats::predict(booster, newdata = xgb_data) if (private$.training_outcome_type$type == "categorical") { - # k <- length(private$.training_outcome_type$levels) - # predictions <- matrix(predictions, ncol = k, byrow = TRUE) + k <- length(private$.training_outcome_type$levels) + predictions <- matrix(predictions, ncol = k, byrow = TRUE) # colnames(predictions) <- private$.training_outcome_type$levels # pack predictions in a single column From 97e76c78a8dc9fa4753ee732a30f1f53b9c9c4f0 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Wed, 8 Oct 2025 17:42:42 -0500 Subject: [PATCH 20/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 2a1c2616..0bd985f9 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -212,7 +212,7 @@ Lrnr_xgboost <- R6Class( if (private$.training_outcome_type$type == "categorical") { k <- length(private$.training_outcome_type$levels) predictions <- matrix(predictions, ncol = k, byrow = TRUE) - # colnames(predictions) <- private$.training_outcome_type$levels + colnames(predictions) <- private$.training_outcome_type$levels # pack predictions in a single column predictions <- pack_predictions(predictions) From 74c53efe0af4f7b8fd213f7bcf4caf9b05b30329 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Wed, 8 Oct 2025 20:39:55 -0500 Subject: [PATCH 21/24] Update Lrnr_xgboost.R --- R/Lrnr_xgboost.R | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R index 0bd985f9..f89ca21f 100644 --- a/R/Lrnr_xgboost.R +++ b/R/Lrnr_xgboost.R @@ -163,13 +163,11 @@ Lrnr_xgboost <- R6Class( ) # DO NOT mutate the booster; wrap it instead - factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL) fit_object <- list( booster = fit_booster, meta = list( training_offset = task$has_node("offset"), - link_fun = link_fun, - sl3_factor_levels = factor_levels + link_fun = link_fun ) ) class(fit_object) <- c("sl3_xgb_fit", "list") @@ -184,20 +182,6 @@ Lrnr_xgboost <- R6Class( # raw covariates; relevel to training levels Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - for (nm in names(Xdf)) { - tr_lvls <- meta$sl3_factor_levels[[nm]] - if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) { - # (optional) track NAs due to unseen levels - before_na <- sum(is.na(Xdf[[nm]])) - Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls) - after_na <- sum(is.na(Xdf[[nm]])) - if (after_na > before_na) { - message("xgboost predict: introduced ", after_na - before_na, - " NA(s) in '", nm, "' due to unseen levels") - } - } - } - xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE) if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.") @@ -207,13 +191,9 @@ Lrnr_xgboost <- R6Class( xgboost::setinfo(xgb_data, "base_margin", offset) } - predictions <- stats::predict(booster, newdata = xgb_data) + predictions <- stats::predict(booster, newdata = xgb_data, strict_shape=TRUE) if (private$.training_outcome_type$type == "categorical") { - k <- length(private$.training_outcome_type$levels) - predictions <- matrix(predictions, ncol = k, byrow = TRUE) - colnames(predictions) <- private$.training_outcome_type$levels - # pack predictions in a single column predictions <- pack_predictions(predictions) } From 1f7073b6e560800c5174b8cec09a5ab3c48772bd Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 9 Oct 2025 21:52:31 -0500 Subject: [PATCH 22/24] Update sl3_Task.R --- R/sl3_Task.R | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/R/sl3_Task.R b/R/sl3_Task.R index 616140ca..b6d22c80 100644 --- a/R/sl3_Task.R +++ b/R/sl3_Task.R @@ -205,8 +205,23 @@ sl3_Task <- R6Class( } else { # match interaction terms to X Xmatch <- lapply(int, function(i) { - grep(i, colnames(self$X), value = TRUE) - }) + cols <- colnames(self$X) + + # detect if 'i' is represented by factor dummies in the design + has_factor_dummies <- any(startsWith(cols, paste0(i, "."))) + + if (has_factor_dummies) { + # prefix match for factor dummy columns, anchored + grep(paste0("^", i, "\\."), colnames(self$X), value = TRUE) + } else if (i %in% cols) { + # exact match for a single numeric (or already-numeric) column + i + } else { + # nothing found; better to fail loud than silently drop an interaction + warning("No matching columns in design matrix for interaction term '", i, "'.") + character(0) + } + }) Xint <- as.list(data.table::as.data.table(t(expand.grid(Xmatch)))) d_Xint <- lapply(Xint, function(Xint) { From 6c85ecb49c22c6517bd5bff2d5dc786f331b2991 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Thu, 5 Mar 2026 18:57:31 -0600 Subject: [PATCH 23/24] Refactor probability and prediction handling in Lrnr_ranger Updated probability handling for binomial and categorical tasks in ranger learner. Enhanced prediction extraction for different outcome types. --- R/Lrnr_ranger.R | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/R/Lrnr_ranger.R b/R/Lrnr_ranger.R index 9ed0399c..416efea2 100644 --- a/R/Lrnr_ranger.R +++ b/R/Lrnr_ranger.R @@ -124,7 +124,10 @@ Lrnr_ranger <- R6Class( colnames(data_in)[1] <- task$nodes$outcome args$data <- data_in args$dependent.variable.name <- task$nodes$outcome - args$probability <- task$outcome_type$type == "categorical" + # For binomial and categorical tasks, downstream sl3 expects probabilities. + if (is.null(args$probability)) { + args$probability <- task$outcome_type$type %in% c("binomial", "categorical") + } fit_object <- call_with_args(ranger::ranger, args) return(fit_object) }, @@ -134,18 +137,47 @@ Lrnr_ranger <- R6Class( Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) # extract numeric predictions from custom class ranger.prediction - predictions <- stats::predict( + pred_obj <- stats::predict( private$.fit_object, data = Xdf, type = "response", num.threads = self$params$num.threads ) + pred_raw <- pred_obj$predictions - predictions <- predictions[[1]] + outcome_type <- private$.training_outcome_type$type + if (outcome_type == "categorical") { + # pack K-class probabilities in a single column + predictions <- pack_predictions(pred_raw) + } else if (outcome_type == "binomial") { + # Return P(Y = max level), consistent with Variable_Type$format + # and other binomial learners in sl3. + if (is.matrix(pred_raw)) { + levs <- private$.training_outcome_type$levels + target <- if (!is.null(levs) && length(levs) > 0) { + as.character(levs[[length(levs)]]) + } else { + colnames(pred_raw)[ncol(pred_raw)] + } - if (private$.training_outcome_type$type == "categorical") { - # pack predictions in a single column - predictions <- pack_predictions(predictions) + if (!is.null(colnames(pred_raw)) && target %in% colnames(pred_raw)) { + predictions <- as.numeric(pred_raw[, target]) + } else { + predictions <- as.numeric(pred_raw[, ncol(pred_raw)]) + } + } else if (is.factor(pred_raw) || is.character(pred_raw)) { + levs <- private$.training_outcome_type$levels + target <- if (!is.null(levs) && length(levs) > 0) { + as.character(levs[[length(levs)]]) + } else { + as.character(sort(unique(pred_raw))[length(unique(pred_raw))]) + } + predictions <- as.numeric(as.character(pred_raw) == target) + } else { + predictions <- as.numeric(pred_raw) + } + } else { + predictions <- as.numeric(pred_raw) } return(predictions) }, From 4ec0b12b878d4db44ff99841e442254181725a50 Mon Sep 17 00:00:00 2001 From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com> Date: Mon, 16 Mar 2026 02:26:48 -0500 Subject: [PATCH 24/24] Refactor ranger importance and prediction handling --- R/Lrnr_ranger.R | 49 +++++++++++++++++++++++-------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/R/Lrnr_ranger.R b/R/Lrnr_ranger.R index 416efea2..48c2fb56 100644 --- a/R/Lrnr_ranger.R +++ b/R/Lrnr_ranger.R @@ -95,16 +95,13 @@ Lrnr_ranger <- R6Class( } self$assert_trained() - # initiate argument list for ranger::importance args <- list(...) args$x <- self$fit_object - # calculate importance metrics importance_result <- call_with_args(ranger::importance, args, keep_all = TRUE ) - # sort according to decreasing importance return(importance_result[order(importance_result, decreasing = TRUE)]) } ), @@ -118,25 +115,23 @@ Lrnr_ranger <- R6Class( if (task$has_node("weights")) { args$case.weights <- task$weights } - # Preserve raw data frame + Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) data_in <- cbind(task$Y, Xdf) colnames(data_in)[1] <- task$nodes$outcome args$data <- data_in args$dependent.variable.name <- task$nodes$outcome - # For binomial and categorical tasks, downstream sl3 expects probabilities. + if (is.null(args$probability)) { args$probability <- task$outcome_type$type %in% c("binomial", "categorical") } + fit_object <- call_with_args(ranger::ranger, args) return(fit_object) }, .predict = function(task) { - - # Preserve raw data frame Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE) - - # extract numeric predictions from custom class ranger.prediction + pred_obj <- stats::predict( private$.fit_object, data = Xdf, @@ -147,30 +142,32 @@ Lrnr_ranger <- R6Class( outcome_type <- private$.training_outcome_type$type if (outcome_type == "categorical") { - # pack K-class probabilities in a single column predictions <- pack_predictions(pred_raw) } else if (outcome_type == "binomial") { - # Return P(Y = max level), consistent with Variable_Type$format - # and other binomial learners in sl3. - if (is.matrix(pred_raw)) { - levs <- private$.training_outcome_type$levels - target <- if (!is.null(levs) && length(levs) > 0) { - as.character(levs[[length(levs)]]) - } else { - colnames(pred_raw)[ncol(pred_raw)] - } + levs <- private$.training_outcome_type$levels + target <- if (!is.null(levs) && length(levs) > 0L) { + as.character(max(levs)) + } else { + NULL + } - if (!is.null(colnames(pred_raw)) && target %in% colnames(pred_raw)) { + if (is.matrix(pred_raw)) { + if (!is.null(colnames(pred_raw)) && !is.null(target) && target %in% colnames(pred_raw)) { predictions <- as.numeric(pred_raw[, target]) + } else if (!is.null(target) && !is.null(levs)) { + target_idx <- match(target, as.character(levs)) + if (is.na(target_idx) || target_idx > ncol(pred_raw)) { + stop("Could not align ranger binomial prediction matrix to the sl3 event level.") + } + predictions <- as.numeric(pred_raw[, target_idx]) + } else if (ncol(pred_raw) == 1L) { + predictions <- as.numeric(pred_raw[, 1]) } else { - predictions <- as.numeric(pred_raw[, ncol(pred_raw)]) + stop("Could not identify the binomial probability column from ranger predictions.") } } else if (is.factor(pred_raw) || is.character(pred_raw)) { - levs <- private$.training_outcome_type$levels - target <- if (!is.null(levs) && length(levs) > 0) { - as.character(levs[[length(levs)]]) - } else { - as.character(sort(unique(pred_raw))[length(unique(pred_raw))]) + if (is.null(target)) { + stop("Could not identify the sl3 event level for ranger binomial predictions.") } predictions <- as.numeric(as.character(pred_raw) == target) } else {