From 92b52c4b3aba65cff3288607a09e0f27946629b5 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 2 Oct 2025 15:40:04 -0500
Subject: [PATCH 01/24] Update Lrnr_ranger.R

---
 R/Lrnr_ranger.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/Lrnr_ranger.R b/R/Lrnr_ranger.R
index b7ba0d82..5550487c 100644
--- a/R/Lrnr_ranger.R
+++ b/R/Lrnr_ranger.R
@@ -118,7 +118,9 @@ Lrnr_ranger <- R6Class(
       if (task$has_node("weights")) {
         args$case.weights <- task$weights
       }
-      data_in <- cbind(task$Y, task$X)
+      Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
+      data_in <- cbind(task$Y, Xdf)
+      # data_in <- cbind(task$Y, task$X)
       colnames(data_in)[1] <- task$nodes$outcome
       args$data <- data_in
       args$dependent.variable.name <- task$nodes$outcome
@@ -127,10 +129,12 @@ Lrnr_ranger <- R6Class(
       return(fit_object)
     },
     .predict = function(task) {
+      Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
+      
       # extract numeric predictions from custom class ranger.prediction
       predictions <- stats::predict(
         private$.fit_object,
-        data = task$X,
+        data = Xdf,
         type = "response",
         num.threads = self$params$num.threads
       )

From 98cc84f5eea193d6f12e83928d35534cd827e4f1 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 2 Oct 2025 15:43:53 -0500
Subject: [PATCH 02/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 87beacc2..d04a5731 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -95,8 +95,11 @@ Lrnr_xgboost <- R6Class(
         Y <- as.numeric(Y) - 1
       }
 
+      # Preserve raw data frame
+      Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
+      
       # set up predictor data
-      Xmat <- as.matrix(task$X)
+      Xmat <- as.matrix(Xdf)
       if (is.integer(Xmat)) {
         Xmat[, 1] <- as.numeric(Xmat[, 1])
       }
@@ -151,8 +154,11 @@ Lrnr_xgboost <- R6Class(
     .predict = function(task = NULL) {
       fit_object <- private$.fit_object
 
+      # Preserve raw data frame
+      Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
+      
       # set up test data for prediction
-      Xmat <- as.matrix(task$X)
+      Xmat <- as.matrix(Xdf)
       if (is.integer(Xmat)) {
         Xmat[, 1] <- as.numeric(Xmat[, 1])
       }

From 99e443013ecd3f577a36f4d0557860957967d2c7 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 2 Oct 2025 15:44:49 -0500
Subject: [PATCH 03/24] Update Lrnr_ranger.R

---
 R/Lrnr_ranger.R | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/R/Lrnr_ranger.R b/R/Lrnr_ranger.R
index 5550487c..9ed0399c 100644
--- a/R/Lrnr_ranger.R
+++ b/R/Lrnr_ranger.R
@@ -118,9 +118,9 @@ Lrnr_ranger <- R6Class(
       if (task$has_node("weights")) {
         args$case.weights <- task$weights
       }
+      # Preserve raw data frame
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
       data_in <- cbind(task$Y, Xdf)
-      # data_in <- cbind(task$Y, task$X)
       colnames(data_in)[1] <- task$nodes$outcome
       args$data <- data_in
       args$dependent.variable.name <- task$nodes$outcome
@@ -129,6 +129,8 @@ Lrnr_ranger <- R6Class(
       return(fit_object)
     },
     .predict = function(task) {
+
+      # Preserve raw data frame
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
       
       # extract numeric predictions from custom class ranger.prediction

From 5380db47420288970b94696815b728254b1f3dfc Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 2 Oct 2025 16:22:34 -0500
Subject: [PATCH 04/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index d04a5731..1b57e751 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -106,7 +106,7 @@ Lrnr_xgboost <- R6Class(
       if (nrow(Xmat) != nrow(task$X) & ncol(Xmat) == nrow(task$X)) {
         Xmat <- t(Xmat)
       }
-      args$data <- try(xgboost::xgb.DMatrix(Xmat, label = Y), silent = TRUE)
+      args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE)
 
       # specify weights
       if (task$has_node("weights")) {
@@ -169,7 +169,7 @@ Lrnr_xgboost <- R6Class(
       }
       stopifnot(nrow(Xmat_ord) == nrow(Xmat))
       # convert to xgb.DMatrix
-      xgb_data <- try(xgboost::xgb.DMatrix(Xmat_ord), silent = TRUE)
+      xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
 
       # incorporate offset, if it wasspecified in training
       if (self$fit_object$training_offset) {

From 9959dfa975c517e6bcb4006b3f1c399b96d1cbfb Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 2 Oct 2025 16:48:08 -0500
Subject: [PATCH 05/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 1b57e751..69a1a52d 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -98,16 +98,10 @@ Lrnr_xgboost <- R6Class(
       # Preserve raw data frame
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
       
-      # set up predictor data
-      Xmat <- as.matrix(Xdf)
-      if (is.integer(Xmat)) {
-        Xmat[, 1] <- as.numeric(Xmat[, 1])
-      }
-      if (nrow(Xmat) != nrow(task$X) & ncol(Xmat) == nrow(task$X)) {
-        Xmat <- t(Xmat)
-      }
       args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE)
 
+      factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
+
       # specify weights
       if (task$has_node("weights")) {
         try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE)
@@ -148,6 +142,7 @@ Lrnr_xgboost <- R6Class(
       )
       fit_object$training_offset <- task$has_node("offset")
       fit_object$link_fun <- link_fun
+      fit_object$sl3_factor_levels <- factor_levels 
 
       return(fit_object)
     },
@@ -157,17 +152,12 @@ Lrnr_xgboost <- R6Class(
       # Preserve raw data frame
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
       
-      # set up test data for prediction
-      Xmat <- as.matrix(Xdf)
-      if (is.integer(Xmat)) {
-        Xmat[, 1] <- as.numeric(Xmat[, 1])
+      # relevel factors to training levels (keep order identical to train)
+      for (nm in names(Xdf)) {
+        tr_lvls <- fit_object$sl3_factor_levels[[nm]]
+        Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
       }
-      # order of columns has to be the same in xgboost training and test data
-      Xmat_ord <- as.matrix(Xmat[, match(fit_object$feature_names, colnames(Xmat))])
-      if ((nrow(Xmat_ord) != nrow(Xmat)) & (ncol(Xmat_ord) == nrow(Xmat))) {
-        Xmat_ord <- t(Xmat_ord)
-      }
-      stopifnot(nrow(Xmat_ord) == nrow(Xmat))
+      
       # convert to xgb.DMatrix
       xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
 

From 39dac1547e02d9ecb556738e151fe1d41e4d8aba Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 2 Oct 2025 18:24:39 -0500
Subject: [PATCH 06/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 69a1a52d..0ca35d39 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -98,7 +98,7 @@ Lrnr_xgboost <- R6Class(
       # Preserve raw data frame
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
       
-      args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE)
+      args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y)) # , silent = TRUE
 
       factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
 
@@ -116,7 +116,7 @@ Lrnr_xgboost <- R6Class(
         family <- outcome_type$glm_family(return_object = TRUE)
         link_fun <- args$family$linkfun
         offset <- task$offset_transformed(link_fun)
-        try(xgboost::setinfo(args$data, "base_margin", offset), silent = TRUE)
+        try(xgboost::setinfo(args$data, "base_margin", offset)) #, silent = TRUE
       } else {
         link_fun <- NULL
       }

From eb69493826da682b41735bb05236c46decf85ce2 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 2 Oct 2025 18:41:35 -0500
Subject: [PATCH 07/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 206 ++++++++++++++++++++++++++++-------------------
 1 file changed, 124 insertions(+), 82 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 0ca35d39..4daa1a26 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -53,9 +53,8 @@
 #'
 #' # get feature importance from fitted model
 #' xgb_varimp <- xgb_fit$importance()
-Lrnr_xgboost <- R6Class(
+Lrnr_xgboost <- R6::R6Class(
   classname = "Lrnr_xgboost", inherit = Lrnr_base,
-  portable = TRUE, class = TRUE,
   public = list(
     initialize = function(nrounds = 20, nthread = 1, ...) {
       params <- args_to_list()
@@ -63,136 +62,179 @@ Lrnr_xgboost <- R6Class(
     },
     importance = function(...) {
       self$assert_trained()
-
-      # initiate argument list for xgboost::xgb.importance
       args <- list(...)
       args$model <- self$fit_object
-
-      # calculate importance metrics, already sorted by decreasing importance
-      importance_result <- call_with_args(xgboost::xgb.importance, args)
-      rownames(importance_result) <- importance_result[["Feature"]]
-      return(importance_result)
+      imp <- call_with_args(xgboost::xgb.importance, args)
+      rownames(imp) <- imp[["Feature"]]
+      imp
     }
   ),
   private = list(
-    .properties = c(
-      "continuous", "binomial", "categorical", "weights",
-      "offset", "importance"
-    ),
-    .train = function(task) {
-      args <- self$params
+    .properties = c("continuous", "binomial", "categorical", "weights", "offset", "importance"),
+
+    .audit_df = function(df, head_levels = 4L) {
+      cls <- vapply(df, function(z) paste(class(z), collapse = ","), character(1))
+      is_char <- vapply(df, is.character, logical(1))
+      is_fac  <- vapply(df, is.factor,   logical(1))
+      lvl_n   <- vapply(df, function(z) if (is.factor(z)) length(levels(z)) else NA_integer_, integer(1))
+      lvl_ex  <- vapply(df, function(z) {
+        if (is.factor(z)) paste(utils::head(levels(z), head_levels), collapse = "|") else ""
+      }, character(1))
+      data.frame(
+        column = names(df),
+        class  = cls,
+        is_character = is_char,
+        is_factor    = is_fac,
+        n_levels     = lvl_n,
+        ex_levels    = lvl_ex,
+        row.names = NULL,
+        check.names = FALSE
+      )
+    },
 
-      verbose <- args$verbose
-      if (is.null(verbose)) {
-        verbose <- getOption("sl3.verbose")
+    .stop_with_context = function(msg, Xdf, err = NULL) {
+      audit <- private$.audit_df(Xdf)
+      bad_chars <- audit$column[audit$is_character]
+      cat("=== XGBoost DMatrix construction failed ===\n")
+      cat("Reason:", msg, "\n")
+      if (!is.null(err)) cat("xgboost says:", conditionMessage(err), "\n")
+      cat("Data summary: ", nrow(Xdf), " rows x ", ncol(Xdf), " cols\n", sep = "")
+      if (length(bad_chars)) {
+        cat("Character columns (must convert to factor or numeric):\n  - ",
+            paste(bad_chars, collapse = ", "), "\n", sep = "")
+      }
+      fac_rows <- audit[audit$is_factor, ]
+      if (nrow(fac_rows)) {
+        cat("Factor columns and levels (first few):\n")
+        utils::capture.output(print(fac_rows[, c("column","n_levels","ex_levels")], row.names = FALSE)) |>
+          paste(collapse = "\n") |> cat("\n")
       }
+      stop(msg, call. = FALSE)
+    },
+
+    .train = function(task) {
+      args <- self$params
+      verbose <- if (is.null(args$verbose)) getOption("sl3.verbose") else args$verbose
       args$verbose <- as.integer(verbose)
 
-      # set up outcome
+      # outcome
       outcome_type <- self$get_outcome_type(task)
       Y <- outcome_type$format(task$Y)
-      if (outcome_type$type == "categorical") {
-        Y <- as.numeric(Y) - 1
-      }
+      if (outcome_type$type == "categorical") Y <- as.numeric(Y) - 1L
 
-      # Preserve raw data frame
+      # covariates as raw data.frame (no factor expansion)
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
-      
-      args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y)) # , silent = TRUE
+      stopifnot(is.data.frame(Xdf))
 
-      factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
+      # convert any characters to factor (xgboost supports factor directly via xgb.DMatrix)
+      if (any(vapply(Xdf, is.character, logical(1)))) {
+        Xdf[] <- lapply(Xdf, function(z) if (is.character(z)) factor(z) else z)
+      }
 
-      # specify weights
+      # Build DMatrix with hard check + diagnostics
+      dm <- try(xgboost::xgb.DMatrix(data = Xdf, label = Y,
+                                     feature_names = colnames(Xdf)), silent = TRUE)
+      if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) {
+        private$.stop_with_context("xgb.DMatrix() did not return an xgb.DMatrix", Xdf, err = attr(dm, "condition"))
+      }
+
+      # weights
       if (task$has_node("weights")) {
-        try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE)
+        try(xgboost::setinfo(dm, "weight", task$weights), silent = TRUE)
       }
 
-      # specify offset
+      # offset / base_margin
       if (task$has_node("offset")) {
         if (outcome_type$type == "categorical") {
-          # TODO: fix
           stop("offsets not yet supported for outcome_type='categorical'")
         }
-        family <- outcome_type$glm_family(return_object = TRUE)
+        family   <- outcome_type$glm_family(return_object = TRUE)
         link_fun <- args$family$linkfun
-        offset <- task$offset_transformed(link_fun)
-        try(xgboost::setinfo(args$data, "base_margin", offset)) #, silent = TRUE
+        offset   <- task$offset_transformed(link_fun)
+        try(xgboost::setinfo(dm, "base_margin", offset), silent = TRUE)
       } else {
         link_fun <- NULL
       }
 
-      # specify objective if it's NULL to avoid xgb warnings
+      # reasonable default objective
       if (is.null(args$objective)) {
         if (outcome_type$type == "binomial") {
-          args$objective <- "binary:logistic"
-          args$eval_metric <- "logloss"
+          args$objective <- "binary:logistic"; args$eval_metric <- "logloss"
         } else if (outcome_type$type == "quasibinomial") {
           args$objective <- "reg:logistic"
         } else if (outcome_type$type == "categorical") {
-          args$objective <- "multi:softprob"
-          args$eval_metric <- "mlogloss"
+          args$objective <- "multi:softprob"; args$eval_metric <- "mlogloss"
           args$num_class <- as.integer(length(outcome_type$levels))
+        } else {
+          # continuous
+          if (is.null(args$eval_metric)) args$eval_metric <- "rmse"
         }
       }
 
-      args$watchlist <- list(train = args$data)
-      fit_object <- call_with_args(xgboost::xgb.train, args,
-        keep_all = TRUE,
-        ignore = "formula"
-      )
-      fit_object$training_offset <- task$has_node("offset")
-      fit_object$link_fun <- link_fun
-      fit_object$sl3_factor_levels <- factor_levels 
+      args$data <- dm
+      args$watchlist <- list(train = dm)
 
-      return(fit_object)
+      fit_object <- call_with_args(xgboost::xgb.train, args, keep_all = TRUE, ignore = "formula")
+      fit_object$training_offset   <- task$has_node("offset")
+      fit_object$link_fun          <- link_fun
+      fit_object$sl3_feature_names <- colnames(Xdf)
+      fit_object$sl3_factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
+      fit_object
     },
+
     .predict = function(task = NULL) {
-      fit_object <- private$.fit_object
+      fit <- private$.fit_object
 
-      # Preserve raw data frame
+      # raw df
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
-      
-      # relevel factors to training levels (keep order identical to train)
+      stopifnot(is.data.frame(Xdf))
+
+      # relevel factors to training levels; also coerce stray characters
+      tr_lvls_list <- fit$sl3_factor_levels
       for (nm in names(Xdf)) {
-        tr_lvls <- fit_object$sl3_factor_levels[[nm]]
-        Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
+        if (is.character(Xdf[[nm]])) Xdf[[nm]] <- factor(Xdf[[nm]])
+        tr_lvls <- tr_lvls_list[[nm]]
+        if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) {
+          Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
+        }
       }
-      
-      # convert to xgb.DMatrix
-      xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
-
-      # incorporate offset, if it wasspecified in training
-      if (self$fit_object$training_offset) {
-        offset <- task$offset_transformed(
-          self$fit_object$link_fun,
-          for_prediction = TRUE
-        )
-        try(xgboost::setinfo(xgb_data, "base_margin", offset), silent = TRUE)
+
+      # reorder columns to training order (critical)
+      tr_names <- fit$sl3_feature_names
+      ord <- match(tr_names, colnames(Xdf))
+      if (anyNA(ord)) {
+        missing_cols <- tr_names[is.na(ord)]
+        msg <- paste0("Prediction data is missing ", length(missing_cols),
+                      " training column(s): ", paste(missing_cols, collapse = ", "))
+        private$.stop_with_context(msg, Xdf, err = NULL)
       }
+      Xdf <- Xdf[, ord, drop = FALSE]
 
-      # incorporate ntreelimit, if training model was not a gblinear-based fit
-      ntreelimit <- 0
-      if (!is.null(fit_object[["best_ntreelimit"]]) &
-        !("gblinear" %in% fit_object[["params"]][["booster"]])) {
-        ntreelimit <- fit_object[["best_ntreelimit"]]
+      # DMatrix (with check)
+      dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE)
+      if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) {
+        private$.stop_with_context("xgb.DMatrix() failed during predict()", Xdf, err = attr(dm, "condition"))
       }
 
-      predictions <- rep.int(list(numeric()), 1)
-      if (nrow(Xmat) > 0) {
-        # will generally return vector, needs to be put into data.table column
-        predictions <- stats::predict(
-          fit_object,
-          newdata = xgb_data, ntreelimit = ntreelimit, reshape = TRUE
-        )
-
-        if (private$.training_outcome_type$type == "categorical") {
-          # pack predictions in a single column
-          predictions <- pack_predictions(predictions)
-        }
+      # optional offset
+      if (fit$training_offset) {
+        offset <- task$offset_transformed(fit$link_fun, for_prediction = TRUE)
+        try(xgboost::setinfo(dm, "base_margin", offset), silent = TRUE)
+      }
+
+      # best_ntreelimit if present
+      ntreelimit <- 0
+      if (!is.null(fit[["best_ntreelimit"]]) && !("gblinear" %in% fit[["params"]][["booster"]])) {
+        ntreelimit <- fit[["best_ntreelimit"]]
       }
 
-      return(predictions)
+      if (nrow(Xdf) == 0) return(numeric(0))
+
+      preds <- stats::predict(fit, newdata = dm, ntreelimit = ntreelimit, reshape = TRUE)
+      if (private$.training_outcome_type$type == "categorical") preds <- pack_predictions(preds)
+      preds
     },
+
     .required_packages = c("xgboost")
   )
 )

From e953d7ad1baa372104e40f3e0f6b037a13c65662 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 2 Oct 2025 21:46:37 -0500
Subject: [PATCH 08/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 4daa1a26..3c9b8026 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -132,8 +132,7 @@ Lrnr_xgboost <- R6::R6Class(
       }
 
       # Build DMatrix with hard check + diagnostics
-      dm <- try(xgboost::xgb.DMatrix(data = Xdf, label = Y,
-                                     feature_names = colnames(Xdf)), silent = TRUE)
+      dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE)
       if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) {
         private$.stop_with_context("xgb.DMatrix() did not return an xgb.DMatrix", Xdf, err = attr(dm, "condition"))
       }

From 454d2dff99b05b3d72d2c0bf8586e06115d43d9e Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Fri, 3 Oct 2025 00:44:34 -0500
Subject: [PATCH 09/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 3c9b8026..881e9190 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -131,6 +131,8 @@ Lrnr_xgboost <- R6::R6Class(
         Xdf[] <- lapply(Xdf, function(z) if (is.character(z)) factor(z) else z)
       }
 
+      print(is.data.frame(Xdf))
+
       # Build DMatrix with hard check + diagnostics
       dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE)
       if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) {

From 71769fb072af7888e6ddc7357a469637a1031e86 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Fri, 3 Oct 2025 11:23:16 -0500
Subject: [PATCH 10/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 207 +++++++++++++++++++----------------------------
 1 file changed, 82 insertions(+), 125 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 881e9190..69a1a52d 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -53,8 +53,9 @@
 #'
 #' # get feature importance from fitted model
 #' xgb_varimp <- xgb_fit$importance()
-Lrnr_xgboost <- R6::R6Class(
+Lrnr_xgboost <- R6Class(
   classname = "Lrnr_xgboost", inherit = Lrnr_base,
+  portable = TRUE, class = TRUE,
   public = list(
     initialize = function(nrounds = 20, nthread = 1, ...) {
       params <- args_to_list()
@@ -62,180 +63,136 @@ Lrnr_xgboost <- R6::R6Class(
     },
     importance = function(...) {
       self$assert_trained()
+
+      # initiate argument list for xgboost::xgb.importance
       args <- list(...)
       args$model <- self$fit_object
-      imp <- call_with_args(xgboost::xgb.importance, args)
-      rownames(imp) <- imp[["Feature"]]
-      imp
+
+      # calculate importance metrics, already sorted by decreasing importance
+      importance_result <- call_with_args(xgboost::xgb.importance, args)
+      rownames(importance_result) <- importance_result[["Feature"]]
+      return(importance_result)
     }
   ),
   private = list(
-    .properties = c("continuous", "binomial", "categorical", "weights", "offset", "importance"),
-
-    .audit_df = function(df, head_levels = 4L) {
-      cls <- vapply(df, function(z) paste(class(z), collapse = ","), character(1))
-      is_char <- vapply(df, is.character, logical(1))
-      is_fac  <- vapply(df, is.factor,   logical(1))
-      lvl_n   <- vapply(df, function(z) if (is.factor(z)) length(levels(z)) else NA_integer_, integer(1))
-      lvl_ex  <- vapply(df, function(z) {
-        if (is.factor(z)) paste(utils::head(levels(z), head_levels), collapse = "|") else ""
-      }, character(1))
-      data.frame(
-        column = names(df),
-        class  = cls,
-        is_character = is_char,
-        is_factor    = is_fac,
-        n_levels     = lvl_n,
-        ex_levels    = lvl_ex,
-        row.names = NULL,
-        check.names = FALSE
-      )
-    },
-
-    .stop_with_context = function(msg, Xdf, err = NULL) {
-      audit <- private$.audit_df(Xdf)
-      bad_chars <- audit$column[audit$is_character]
-      cat("=== XGBoost DMatrix construction failed ===\n")
-      cat("Reason:", msg, "\n")
-      if (!is.null(err)) cat("xgboost says:", conditionMessage(err), "\n")
-      cat("Data summary: ", nrow(Xdf), " rows x ", ncol(Xdf), " cols\n", sep = "")
-      if (length(bad_chars)) {
-        cat("Character columns (must convert to factor or numeric):\n  - ",
-            paste(bad_chars, collapse = ", "), "\n", sep = "")
-      }
-      fac_rows <- audit[audit$is_factor, ]
-      if (nrow(fac_rows)) {
-        cat("Factor columns and levels (first few):\n")
-        utils::capture.output(print(fac_rows[, c("column","n_levels","ex_levels")], row.names = FALSE)) |>
-          paste(collapse = "\n") |> cat("\n")
-      }
-      stop(msg, call. = FALSE)
-    },
-
+    .properties = c(
+      "continuous", "binomial", "categorical", "weights",
+      "offset", "importance"
+    ),
     .train = function(task) {
       args <- self$params
-      verbose <- if (is.null(args$verbose)) getOption("sl3.verbose") else args$verbose
+
+      verbose <- args$verbose
+      if (is.null(verbose)) {
+        verbose <- getOption("sl3.verbose")
+      }
       args$verbose <- as.integer(verbose)
 
-      # outcome
+      # set up outcome
       outcome_type <- self$get_outcome_type(task)
       Y <- outcome_type$format(task$Y)
-      if (outcome_type$type == "categorical") Y <- as.numeric(Y) - 1L
-
-      # covariates as raw data.frame (no factor expansion)
-      Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
-      stopifnot(is.data.frame(Xdf))
-
-      # convert any characters to factor (xgboost supports factor directly via xgb.DMatrix)
-      if (any(vapply(Xdf, is.character, logical(1)))) {
-        Xdf[] <- lapply(Xdf, function(z) if (is.character(z)) factor(z) else z)
+      if (outcome_type$type == "categorical") {
+        Y <- as.numeric(Y) - 1
       }
 
-      print(is.data.frame(Xdf))
+      # Preserve raw data frame
+      Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
+      
+      args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE)
 
-      # Build DMatrix with hard check + diagnostics
-      dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE)
-      if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) {
-        private$.stop_with_context("xgb.DMatrix() did not return an xgb.DMatrix", Xdf, err = attr(dm, "condition"))
-      }
+      factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
 
-      # weights
+      # specify weights
       if (task$has_node("weights")) {
-        try(xgboost::setinfo(dm, "weight", task$weights), silent = TRUE)
+        try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE)
       }
 
-      # offset / base_margin
+      # specify offset
       if (task$has_node("offset")) {
         if (outcome_type$type == "categorical") {
+          # TODO: fix
           stop("offsets not yet supported for outcome_type='categorical'")
         }
-        family   <- outcome_type$glm_family(return_object = TRUE)
+        family <- outcome_type$glm_family(return_object = TRUE)
         link_fun <- args$family$linkfun
-        offset   <- task$offset_transformed(link_fun)
-        try(xgboost::setinfo(dm, "base_margin", offset), silent = TRUE)
+        offset <- task$offset_transformed(link_fun)
+        try(xgboost::setinfo(args$data, "base_margin", offset), silent = TRUE)
       } else {
         link_fun <- NULL
       }
 
-      # reasonable default objective
+      # specify objective if it's NULL to avoid xgb warnings
       if (is.null(args$objective)) {
         if (outcome_type$type == "binomial") {
-          args$objective <- "binary:logistic"; args$eval_metric <- "logloss"
+          args$objective <- "binary:logistic"
+          args$eval_metric <- "logloss"
         } else if (outcome_type$type == "quasibinomial") {
           args$objective <- "reg:logistic"
         } else if (outcome_type$type == "categorical") {
-          args$objective <- "multi:softprob"; args$eval_metric <- "mlogloss"
+          args$objective <- "multi:softprob"
+          args$eval_metric <- "mlogloss"
           args$num_class <- as.integer(length(outcome_type$levels))
-        } else {
-          # continuous
-          if (is.null(args$eval_metric)) args$eval_metric <- "rmse"
         }
       }
 
-      args$data <- dm
-      args$watchlist <- list(train = dm)
+      args$watchlist <- list(train = args$data)
+      fit_object <- call_with_args(xgboost::xgb.train, args,
+        keep_all = TRUE,
+        ignore = "formula"
+      )
+      fit_object$training_offset <- task$has_node("offset")
+      fit_object$link_fun <- link_fun
+      fit_object$sl3_factor_levels <- factor_levels 
 
-      fit_object <- call_with_args(xgboost::xgb.train, args, keep_all = TRUE, ignore = "formula")
-      fit_object$training_offset   <- task$has_node("offset")
-      fit_object$link_fun          <- link_fun
-      fit_object$sl3_feature_names <- colnames(Xdf)
-      fit_object$sl3_factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
-      fit_object
+      return(fit_object)
     },
-
     .predict = function(task = NULL) {
-      fit <- private$.fit_object
+      fit_object <- private$.fit_object
 
-      # raw df
+      # Preserve raw data frame
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
-      stopifnot(is.data.frame(Xdf))
-
-      # relevel factors to training levels; also coerce stray characters
-      tr_lvls_list <- fit$sl3_factor_levels
+      
+      # relevel factors to training levels (keep order identical to train)
       for (nm in names(Xdf)) {
-        if (is.character(Xdf[[nm]])) Xdf[[nm]] <- factor(Xdf[[nm]])
-        tr_lvls <- tr_lvls_list[[nm]]
-        if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) {
-          Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
-        }
-      }
-
-      # reorder columns to training order (critical)
-      tr_names <- fit$sl3_feature_names
-      ord <- match(tr_names, colnames(Xdf))
-      if (anyNA(ord)) {
-        missing_cols <- tr_names[is.na(ord)]
-        msg <- paste0("Prediction data is missing ", length(missing_cols),
-                      " training column(s): ", paste(missing_cols, collapse = ", "))
-        private$.stop_with_context(msg, Xdf, err = NULL)
-      }
-      Xdf <- Xdf[, ord, drop = FALSE]
-
-      # DMatrix (with check)
-      dm <- try(xgboost::xgb.DMatrix(data = Xdf, feature_names = colnames(Xdf)), silent = TRUE)
-      if (inherits(dm, "try-error") || !inherits(dm, "xgb.DMatrix")) {
-        private$.stop_with_context("xgb.DMatrix() failed during predict()", Xdf, err = attr(dm, "condition"))
+        tr_lvls <- fit_object$sl3_factor_levels[[nm]]
+        Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
       }
-
-      # optional offset
-      if (fit$training_offset) {
-        offset <- task$offset_transformed(fit$link_fun, for_prediction = TRUE)
-        try(xgboost::setinfo(dm, "base_margin", offset), silent = TRUE)
+      
+      # convert to xgb.DMatrix
+      xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
+
+      # incorporate offset, if it wasspecified in training
+      if (self$fit_object$training_offset) {
+        offset <- task$offset_transformed(
+          self$fit_object$link_fun,
+          for_prediction = TRUE
+        )
+        try(xgboost::setinfo(xgb_data, "base_margin", offset), silent = TRUE)
       }
 
-      # best_ntreelimit if present
+      # incorporate ntreelimit, if training model was not a gblinear-based fit
       ntreelimit <- 0
-      if (!is.null(fit[["best_ntreelimit"]]) && !("gblinear" %in% fit[["params"]][["booster"]])) {
-        ntreelimit <- fit[["best_ntreelimit"]]
+      if (!is.null(fit_object[["best_ntreelimit"]]) &
+        !("gblinear" %in% fit_object[["params"]][["booster"]])) {
+        ntreelimit <- fit_object[["best_ntreelimit"]]
       }
 
-      if (nrow(Xdf) == 0) return(numeric(0))
+      predictions <- rep.int(list(numeric()), 1)
+      if (nrow(Xmat) > 0) {
+        # will generally return vector, needs to be put into data.table column
+        predictions <- stats::predict(
+          fit_object,
+          newdata = xgb_data, ntreelimit = ntreelimit, reshape = TRUE
+        )
+
+        if (private$.training_outcome_type$type == "categorical") {
+          # pack predictions in a single column
+          predictions <- pack_predictions(predictions)
+        }
+      }
 
-      preds <- stats::predict(fit, newdata = dm, ntreelimit = ntreelimit, reshape = TRUE)
-      if (private$.training_outcome_type$type == "categorical") preds <- pack_predictions(preds)
-      preds
+      return(predictions)
     },
-
     .required_packages = c("xgboost")
   )
 )

From 56c72a9bc80ee300f0f5b5f162f22246b6b2d53a Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Fri, 3 Oct 2025 13:36:30 -0500
Subject: [PATCH 11/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 190 +++++++++++++++++++++++++++--------------------
 1 file changed, 111 insertions(+), 79 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 69a1a52d..2ae812a6 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -80,118 +80,150 @@ Lrnr_xgboost <- R6Class(
       "offset", "importance"
     ),
     .train = function(task) {
+      # Safe helper for %||%
+      `%||%` <- function(a, b) if (!is.null(a)) a else b
+        
       args <- self$params
-
+    
+      # verbosity
       verbose <- args$verbose
-      if (is.null(verbose)) {
-        verbose <- getOption("sl3.verbose")
-      }
+      if (is.null(verbose)) verbose <- getOption("sl3.verbose")
       args$verbose <- as.integer(verbose)
-
-      # set up outcome
+    
+      # outcome
       outcome_type <- self$get_outcome_type(task)
       Y <- outcome_type$format(task$Y)
-      if (outcome_type$type == "categorical") {
-        Y <- as.numeric(Y) - 1
-      }
-
-      # Preserve raw data frame
+      if (outcome_type$type == "categorical") Y <- as.numeric(Y) - 1L
+    
+      # raw covariates, keep factors intact
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
-      
-      args$data <- try(xgboost::xgb.DMatrix(Xdf, label = Y), silent = TRUE)
-
-      factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
-
-      # specify weights
+    
+      # (optional but recommended) explicit feature types
+      feat_types <- vapply(Xdf, function(z) {
+        if (is.factor(z)) "c" else if (is.integer(z)) "int"
+        else if (is.logical(z)) "i" else "float"
+      }, character(1))
+    
+      # DMatrix
+      dtrain <- try(xgboost::xgb.DMatrix(
+        data = Xdf, label = Y,
+        feature_names = colnames(Xdf),
+        feature_types = feat_types
+      ), silent = TRUE)
+    
+      if (!inherits(dtrain, "xgb.DMatrix")) {
+        cls <- vapply(Xdf, function(z) paste(class(z), collapse=","), character(1))
+        stop("xgb.DMatrix construction failed. Column classes: ",
+             paste(sprintf("%s:[%s]", names(cls), cls), collapse="; "))
+      }
+    
+      # weights
       if (task$has_node("weights")) {
-        try(xgboost::setinfo(args$data, "weight", task$weights), silent = TRUE)
+        xgboost::setinfo(dtrain, "weight", task$weights)
       }
-
-      # specify offset
+    
+      # offset (base_margin)
+      link_fun <- NULL
       if (task$has_node("offset")) {
         if (outcome_type$type == "categorical") {
-          # TODO: fix
           stop("offsets not yet supported for outcome_type='categorical'")
         }
         family <- outcome_type$glm_family(return_object = TRUE)
-        link_fun <- args$family$linkfun
+        link_fun <- family$linkfun
         offset <- task$offset_transformed(link_fun)
-        try(xgboost::setinfo(args$data, "base_margin", offset), silent = TRUE)
-      } else {
-        link_fun <- NULL
+        xgboost::setinfo(dtrain, "base_margin", offset)
       }
-
-      # specify objective if it's NULL to avoid xgb warnings
-      if (is.null(args$objective)) {
-        if (outcome_type$type == "binomial") {
-          args$objective <- "binary:logistic"
-          args$eval_metric <- "logloss"
+    
+      # ----- xgboost arguments: use params + evals -----
+      nrounds <- if (!is.null(args$nrounds)) args$nrounds else 20L
+      params  <- if (!is.null(args$params)) args$params else list()
+    
+      # set objective/metric if not provided
+      if (is.null(params$objective)) {
+        if (outcome_type$type %in% c("binomial")) {
+          params$objective  <- "binary:logistic"
+          params$eval_metric <- params$eval_metric %||% "logloss"
         } else if (outcome_type$type == "quasibinomial") {
-          args$objective <- "reg:logistic"
+          params$objective <- "reg:logistic"
         } else if (outcome_type$type == "categorical") {
-          args$objective <- "multi:softprob"
-          args$eval_metric <- "mlogloss"
-          args$num_class <- as.integer(length(outcome_type$levels))
+          params$objective  <- "multi:softprob"
+          params$eval_metric <- params$eval_metric %||% "mlogloss"
+          params$num_class <- as.integer(length(outcome_type$levels))
+        } else {
+          params$objective <- params$objective %||% "reg:squarederror"
         }
       }
-
-      args$watchlist <- list(train = args$data)
-      fit_object <- call_with_args(xgboost::xgb.train, args,
-        keep_all = TRUE,
-        ignore = "formula"
+    
+      fit_booster <- xgboost::xgb.train(
+        data   = dtrain,
+        nrounds = nrounds,
+        params  = params,
+        evals   = list(train = dtrain),
+        verbose = args$verbose
       )
-      fit_object$training_offset <- task$has_node("offset")
-      fit_object$link_fun <- link_fun
-      fit_object$sl3_factor_levels <- factor_levels 
-
+    
+      # DO NOT mutate the booster; wrap it instead
+      factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
+      fit_object <- list(
+        booster = fit_booster,
+        meta = list(
+          training_offset   = task$has_node("offset"),
+          link_fun          = link_fun,
+          sl3_factor_levels = factor_levels
+        )
+      )
+      class(fit_object) <- c("sl3_xgb_fit", "list")
+    
       return(fit_object)
     },
     .predict = function(task = NULL) {
       fit_object <- private$.fit_object
-
-      # Preserve raw data frame
+      booster    <- fit_object$booster
+      meta       <- fit_object$meta
+    
+      # raw covariates; relevel to training levels
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
-      
-      # relevel factors to training levels (keep order identical to train)
+    
       for (nm in names(Xdf)) {
-        tr_lvls <- fit_object$sl3_factor_levels[[nm]]
-        Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
+        tr_lvls <- meta$sl3_factor_levels[[nm]]
+        if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) {
+          # (optional) track NAs due to unseen levels
+          before_na <- sum(is.na(Xdf[[nm]]))
+          Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
+          after_na <- sum(is.na(Xdf[[nm]]))
+          if (after_na > before_na) {
+            message("xgboost predict: introduced ", after_na - before_na,
+                    " NA(s) in '", nm, "' due to unseen levels")
+          }
+        }
       }
-      
-      # convert to xgb.DMatrix
+    
       xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
-
-      # incorporate offset, if it wasspecified in training
-      if (self$fit_object$training_offset) {
-        offset <- task$offset_transformed(
-          self$fit_object$link_fun,
-          for_prediction = TRUE
-        )
-        try(xgboost::setinfo(xgb_data, "base_margin", offset), silent = TRUE)
+      if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.")
+    
+      # base_margin if used in training
+      if (isTRUE(meta$training_offset)) {
+        offset <- task$offset_transformed(meta$link_fun, for_prediction = TRUE)
+        xgboost::setinfo(xgb_data, "base_margin", offset)
       }
-
-      # incorporate ntreelimit, if training model was not a gblinear-based fit
-      ntreelimit <- 0
-      if (!is.null(fit_object[["best_ntreelimit"]]) &
-        !("gblinear" %in% fit_object[["params"]][["booster"]])) {
-        ntreelimit <- fit_object[["best_ntreelimit"]]
+    
+      # ntreelimit logic
+      ntreelimit <- 0L
+      if (!is.null(booster[["best_ntreelimit"]]) &&
+          !("gblinear" %in% booster[["params"]][["booster"]])) {
+        ntreelimit <- booster[["best_ntreelimit"]]
       }
-
-      predictions <- rep.int(list(numeric()), 1)
-      if (nrow(Xmat) > 0) {
-        # will generally return vector, needs to be put into data.table column
-        predictions <- stats::predict(
-          fit_object,
-          newdata = xgb_data, ntreelimit = ntreelimit, reshape = TRUE
-        )
-
+    
+      preds <- numeric(0)
+      if (nrow(Xdf) > 0) {
+        preds <- stats::predict(booster, newdata = xgb_data,
+                                ntreelimit = ntreelimit, reshape = TRUE)
+    
         if (private$.training_outcome_type$type == "categorical") {
-          # pack predictions in a single column
-          predictions <- pack_predictions(predictions)
+          preds <- pack_predictions(preds)
         }
       }
-
-      return(predictions)
+      return(preds)
     },
     .required_packages = c("xgboost")
   )

From f5935b9f336ed18588fc4a7391ab3d0e40c72e5c Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Tue, 7 Oct 2025 11:26:42 -0500
Subject: [PATCH 12/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 2ae812a6..628687ad 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -187,10 +187,9 @@ Lrnr_xgboost <- R6Class(
       for (nm in names(Xdf)) {
         tr_lvls <- meta$sl3_factor_levels[[nm]]
         if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) {
-          # (optional) track NAs due to unseen levels
           before_na <- sum(is.na(Xdf[[nm]]))
           Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
-          after_na <- sum(is.na(Xdf[[nm]]))
+          after_na  <- sum(is.na(Xdf[[nm]]))
           if (after_na > before_na) {
             message("xgboost predict: introduced ", after_na - before_na,
                     " NA(s) in '", nm, "' due to unseen levels")
@@ -198,6 +197,9 @@ Lrnr_xgboost <- R6Class(
         }
       }
     
+      # empty guard
+      if (nrow(Xdf) == 0L) return(numeric(0))
+    
       xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
       if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.")
     
@@ -207,24 +209,24 @@ Lrnr_xgboost <- R6Class(
         xgboost::setinfo(xgb_data, "base_margin", offset)
       }
     
-      # ntreelimit logic
-      ntreelimit <- 0L
-      if (!is.null(booster[["best_ntreelimit"]]) &&
-          !("gblinear" %in% booster[["params"]][["booster"]])) {
-        ntreelimit <- booster[["best_ntreelimit"]]
+      # best iteration -> iterationrange (future-proof, no warnings)
+      best_iter <- try(xgboost::xgb.attr(booster, "best_iteration"), silent = TRUE)
+      if (!inherits(best_iter, "try-error") && !is.null(best_iter)) {
+        ir <- as.integer(c(0L, as.integer(best_iter) + 1L))
+        preds <- stats::predict(booster, newdata = xgb_data, iterationrange = ir)
+      } else {
+        message("Can't find best_iteration")
+        preds <- stats::predict(booster, newdata = xgb_data)
       }
     
-      preds <- numeric(0)
-      if (nrow(Xdf) > 0) {
-        preds <- stats::predict(booster, newdata = xgb_data,
-                                ntreelimit = ntreelimit, reshape = TRUE)
-    
-        if (private$.training_outcome_type$type == "categorical") {
-          preds <- pack_predictions(preds)
-        }
+      # reshape for multiclass
+      if (private$.training_outcome_type$type == "categorical") {
+        k <- length(private$.training_outcome_type$levels)
+        preds <- matrix(preds, ncol = k, byrow = TRUE)
       }
+    
       return(preds)
-    },
+    }
     .required_packages = c("xgboost")
   )
 )

From 3df1c230e5b914152e9c4209ce080d36d20e5805 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Tue, 7 Oct 2025 11:38:45 -0500
Subject: [PATCH 13/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 628687ad..1d541705 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -226,7 +226,7 @@ Lrnr_xgboost <- R6Class(
       }
     
       return(preds)
-    }
+    },
     .required_packages = c("xgboost")
   )
 )

From 0dcf61319cca1fa9d321b1a07a034f15c82ebd1f Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Tue, 7 Oct 2025 12:50:10 -0500
Subject: [PATCH 14/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 1d541705..b7047aa4 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -187,9 +187,10 @@ Lrnr_xgboost <- R6Class(
       for (nm in names(Xdf)) {
         tr_lvls <- meta$sl3_factor_levels[[nm]]
         if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) {
+          # (optional) track NAs due to unseen levels
           before_na <- sum(is.na(Xdf[[nm]]))
           Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
-          after_na  <- sum(is.na(Xdf[[nm]]))
+          after_na <- sum(is.na(Xdf[[nm]]))
           if (after_na > before_na) {
             message("xgboost predict: introduced ", after_na - before_na,
                     " NA(s) in '", nm, "' due to unseen levels")
@@ -197,9 +198,6 @@ Lrnr_xgboost <- R6Class(
         }
       }
     
-      # empty guard
-      if (nrow(Xdf) == 0L) return(numeric(0))
-    
       xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
       if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.")
     
@@ -209,23 +207,25 @@ Lrnr_xgboost <- R6Class(
         xgboost::setinfo(xgb_data, "base_margin", offset)
       }
     
-      # best iteration -> iterationrange (future-proof, no warnings)
-      best_iter <- try(xgboost::xgb.attr(booster, "best_iteration"), silent = TRUE)
-      if (!inherits(best_iter, "try-error") && !is.null(best_iter)) {
-        ir <- as.integer(c(0L, as.integer(best_iter) + 1L))
-        preds <- stats::predict(booster, newdata = xgb_data, iterationrange = ir)
+      # --- ntreelimit logic  ---
+      ntreelimit <- 0L
+      booster <- fit_object$booster
+      
+      # try best_ntreelimit first (sometimes stored as character)
+      bn <- booster[["best_ntreelimit"]]
+      if (!is.null(bn)) {
+        ntreelimit <- as.integer(bn)
       } else {
-        message("Can't find best_iteration")
-        preds <- stats::predict(booster, newdata = xgb_data)
-      }
-    
-      # reshape for multiclass
-      if (private$.training_outcome_type$type == "categorical") {
-        k <- length(private$.training_outcome_type$levels)
-        preds <- matrix(preds, ncol = k, byrow = TRUE)
+        # fall back to best_iteration attribute if present
+        bi <- try(xgboost::xgb.attr(booster, "best_iteration"), silent = TRUE)
+        if (!inherits(bi, "try-error") && !is.null(bi)) {
+          ntreelimit <- as.integer(bi)  # attribute is character; coerce
+        }
       }
-    
-      return(preds)
+      
+      # call predict (you can keep reshape=TRUE for now if you prefer)
+      preds <- stats::predict(booster, newdata = xgb_data,
+                              ntreelimit = ntreelimit, reshape = TRUE)
     },
     .required_packages = c("xgboost")
   )

From 97a1a7f8f6f816a010e136821f89cf8bb7096730 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Tue, 7 Oct 2025 14:03:03 -0500
Subject: [PATCH 15/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index b7047aa4..a5be039b 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -207,25 +207,17 @@ Lrnr_xgboost <- R6Class(
         xgboost::setinfo(xgb_data, "base_margin", offset)
       }
     
-      # --- ntreelimit logic  ---
-      ntreelimit <- 0L
-      booster <- fit_object$booster
-      
-      # try best_ntreelimit first (sometimes stored as character)
-      bn <- booster[["best_ntreelimit"]]
-      if (!is.null(bn)) {
-        ntreelimit <- as.integer(bn)
-      } else {
-        # fall back to best_iteration attribute if present
-        bi <- try(xgboost::xgb.attr(booster, "best_iteration"), silent = TRUE)
-        if (!inherits(bi, "try-error") && !is.null(bi)) {
-          ntreelimit <- as.integer(bi)  # attribute is character; coerce
-        }
+      # 4) Predict with NO limit (uses all trees; if model had ES and you use xgboost::xgboost
+      #    class, its high-level predict would default to the best iteration. For xgb.Booster,
+      #    just omit limits unless you really need a subset.)
+      preds <- stats::predict(booster, newdata = xgb_data)
+    
+      # 5) Reshape for multiclass
+      if (private$.training_outcome_type$type == "categorical") {
+        k <- length(private$.training_outcome_type$levels)
+        preds <- matrix(preds, ncol = k, byrow = TRUE)
       }
-      
-      # call predict (you can keep reshape=TRUE for now if you prefer)
-      preds <- stats::predict(booster, newdata = xgb_data,
-                              ntreelimit = ntreelimit, reshape = TRUE)
+      return(preds)
     },
     .required_packages = c("xgboost")
   )

From 29b3364fad8d3bd10a03b9c2a82e513473535c67 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Wed, 8 Oct 2025 12:01:12 -0500
Subject: [PATCH 16/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index a5be039b..8fdd7116 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -207,15 +207,15 @@ Lrnr_xgboost <- R6Class(
         xgboost::setinfo(xgb_data, "base_margin", offset)
       }
     
-      # 4) Predict with NO limit (uses all trees; if model had ES and you use xgboost::xgboost
-      #    class, its high-level predict would default to the best iteration. For xgb.Booster,
-      #    just omit limits unless you really need a subset.)
-      preds <- stats::predict(booster, newdata = xgb_data)
+      predictions <- stats::predict(booster, newdata = xgb_data)
     
-      # 5) Reshape for multiclass
       if (private$.training_outcome_type$type == "categorical") {
         k <- length(private$.training_outcome_type$levels)
-        preds <- matrix(preds, ncol = k, byrow = TRUE)
+        predictions <- matrix(predictions, ncol = k, byrow = TRUE)
+        colnames(predictions) <- private$.training_outcome_type$levels
+        
+        # pack predictions in a single column
+        predictions <- pack_predictions(predictions)
       }
       return(preds)
     },

From e7b29fea8292dc102e5ee9a4af967e7787f21ec5 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Wed, 8 Oct 2025 12:10:30 -0500
Subject: [PATCH 17/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 8fdd7116..2881df67 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -212,7 +212,7 @@ Lrnr_xgboost <- R6Class(
       if (private$.training_outcome_type$type == "categorical") {
         k <- length(private$.training_outcome_type$levels)
         predictions <- matrix(predictions, ncol = k, byrow = TRUE)
-        colnames(predictions) <- private$.training_outcome_type$levels
+        # colnames(predictions) <- private$.training_outcome_type$levels
         
         # pack predictions in a single column
         predictions <- pack_predictions(predictions)

From e89db12ebd26f2b0a2382ed06421d9bd6ac71526 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Wed, 8 Oct 2025 12:11:34 -0500
Subject: [PATCH 18/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 2881df67..6a47c4b0 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -210,14 +210,14 @@ Lrnr_xgboost <- R6Class(
       predictions <- stats::predict(booster, newdata = xgb_data)
     
       if (private$.training_outcome_type$type == "categorical") {
-        k <- length(private$.training_outcome_type$levels)
-        predictions <- matrix(predictions, ncol = k, byrow = TRUE)
+        # k <- length(private$.training_outcome_type$levels)
+        # predictions <- matrix(predictions, ncol = k, byrow = TRUE)
         # colnames(predictions) <- private$.training_outcome_type$levels
         
         # pack predictions in a single column
         predictions <- pack_predictions(predictions)
       }
-      return(preds)
+      return(predictions)
     },
     .required_packages = c("xgboost")
   )

From 4deae51267008df3ae43e3fdfb197d4c815455ee Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Wed, 8 Oct 2025 12:55:01 -0500
Subject: [PATCH 19/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 6a47c4b0..2a1c2616 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -210,8 +210,8 @@ Lrnr_xgboost <- R6Class(
       predictions <- stats::predict(booster, newdata = xgb_data)
     
       if (private$.training_outcome_type$type == "categorical") {
-        # k <- length(private$.training_outcome_type$levels)
-        # predictions <- matrix(predictions, ncol = k, byrow = TRUE)
+        k <- length(private$.training_outcome_type$levels)
+        predictions <- matrix(predictions, ncol = k, byrow = TRUE)
         # colnames(predictions) <- private$.training_outcome_type$levels
         
         # pack predictions in a single column

From 97e76c78a8dc9fa4753ee732a30f1f53b9c9c4f0 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Wed, 8 Oct 2025 17:42:42 -0500
Subject: [PATCH 20/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 2a1c2616..0bd985f9 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -212,7 +212,7 @@ Lrnr_xgboost <- R6Class(
       if (private$.training_outcome_type$type == "categorical") {
         k <- length(private$.training_outcome_type$levels)
         predictions <- matrix(predictions, ncol = k, byrow = TRUE)
-        # colnames(predictions) <- private$.training_outcome_type$levels
+        colnames(predictions) <- private$.training_outcome_type$levels
         
         # pack predictions in a single column
         predictions <- pack_predictions(predictions)

From 74c53efe0af4f7b8fd213f7bcf4caf9b05b30329 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Wed, 8 Oct 2025 20:39:55 -0500
Subject: [PATCH 21/24] Update Lrnr_xgboost.R

---
 R/Lrnr_xgboost.R | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/R/Lrnr_xgboost.R b/R/Lrnr_xgboost.R
index 0bd985f9..f89ca21f 100644
--- a/R/Lrnr_xgboost.R
+++ b/R/Lrnr_xgboost.R
@@ -163,13 +163,11 @@ Lrnr_xgboost <- R6Class(
       )
     
       # DO NOT mutate the booster; wrap it instead
-      factor_levels <- lapply(Xdf, function(z) if (is.factor(z)) levels(z) else NULL)
       fit_object <- list(
         booster = fit_booster,
         meta = list(
           training_offset   = task$has_node("offset"),
-          link_fun          = link_fun,
-          sl3_factor_levels = factor_levels
+          link_fun          = link_fun
         )
       )
       class(fit_object) <- c("sl3_xgb_fit", "list")
@@ -184,20 +182,6 @@ Lrnr_xgboost <- R6Class(
       # raw covariates; relevel to training levels
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
     
-      for (nm in names(Xdf)) {
-        tr_lvls <- meta$sl3_factor_levels[[nm]]
-        if (!is.null(tr_lvls) && is.factor(Xdf[[nm]])) {
-          # (optional) track NAs due to unseen levels
-          before_na <- sum(is.na(Xdf[[nm]]))
-          Xdf[[nm]] <- factor(Xdf[[nm]], levels = tr_lvls)
-          after_na <- sum(is.na(Xdf[[nm]]))
-          if (after_na > before_na) {
-            message("xgboost predict: introduced ", after_na - before_na,
-                    " NA(s) in '", nm, "' due to unseen levels")
-          }
-        }
-      }
-    
       xgb_data <- try(xgboost::xgb.DMatrix(Xdf), silent = TRUE)
       if (!inherits(xgb_data, "xgb.DMatrix")) stop("Failed to build DMatrix for prediction.")
     
@@ -207,13 +191,9 @@ Lrnr_xgboost <- R6Class(
         xgboost::setinfo(xgb_data, "base_margin", offset)
       }
     
-      predictions <- stats::predict(booster, newdata = xgb_data)
+      predictions <- stats::predict(booster, newdata = xgb_data, strict_shape=TRUE)
     
       if (private$.training_outcome_type$type == "categorical") {
-        k <- length(private$.training_outcome_type$levels)
-        predictions <- matrix(predictions, ncol = k, byrow = TRUE)
-        colnames(predictions) <- private$.training_outcome_type$levels
-        
         # pack predictions in a single column
         predictions <- pack_predictions(predictions)
       }

From 1f7073b6e560800c5174b8cec09a5ab3c48772bd Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 9 Oct 2025 21:52:31 -0500
Subject: [PATCH 22/24] Update sl3_Task.R

---
 R/sl3_Task.R | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/R/sl3_Task.R b/R/sl3_Task.R
index 616140ca..b6d22c80 100644
--- a/R/sl3_Task.R
+++ b/R/sl3_Task.R
@@ -205,8 +205,23 @@ sl3_Task <- R6Class(
         } else {
           # match interaction terms to X
           Xmatch <- lapply(int, function(i) {
-            grep(i, colnames(self$X), value = TRUE)
-          })
+            cols <- colnames(self$X)
+          
+            # detect if 'i' is represented by factor dummies in the design
+            has_factor_dummies <- any(startsWith(cols, paste0(i, ".")))
+          
+            if (has_factor_dummies) {
+              # prefix match for factor dummy columns, anchored
+              grep(paste0("^", i, "\\."), colnames(self$X), value = TRUE)
+            } else if (i %in% cols) {
+              # exact match for a single numeric (or already-numeric) column
+              i
+            } else {
+              # nothing found; better to fail loud than silently drop an interaction
+              warning("No matching columns in design matrix for interaction term '", i, "'.")
+              character(0)
+            }
+          })    
           Xint <- as.list(data.table::as.data.table(t(expand.grid(Xmatch))))
 
           d_Xint <- lapply(Xint, function(Xint) {

From 6c85ecb49c22c6517bd5bff2d5dc786f331b2991 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Thu, 5 Mar 2026 18:57:31 -0600
Subject: [PATCH 23/24] Refactor probability and prediction handling in
 Lrnr_ranger

Updated probability handling for binomial and categorical tasks in ranger learner. Enhanced prediction extraction for different outcome types.
---
 R/Lrnr_ranger.R | 44 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/R/Lrnr_ranger.R b/R/Lrnr_ranger.R
index 9ed0399c..416efea2 100644
--- a/R/Lrnr_ranger.R
+++ b/R/Lrnr_ranger.R
@@ -124,7 +124,10 @@ Lrnr_ranger <- R6Class(
       colnames(data_in)[1] <- task$nodes$outcome
       args$data <- data_in
       args$dependent.variable.name <- task$nodes$outcome
-      args$probability <- task$outcome_type$type == "categorical"
+      # For binomial and categorical tasks, downstream sl3 expects probabilities.
+      if (is.null(args$probability)) {
+        args$probability <- task$outcome_type$type %in% c("binomial", "categorical")
+      }
       fit_object <- call_with_args(ranger::ranger, args)
       return(fit_object)
     },
@@ -134,18 +137,47 @@ Lrnr_ranger <- R6Class(
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
       
       # extract numeric predictions from custom class ranger.prediction
-      predictions <- stats::predict(
+      pred_obj <- stats::predict(
         private$.fit_object,
         data = Xdf,
         type = "response",
         num.threads = self$params$num.threads
       )
+      pred_raw <- pred_obj$predictions
 
-      predictions <- predictions[[1]]
+      outcome_type <- private$.training_outcome_type$type
+      if (outcome_type == "categorical") {
+        # pack K-class probabilities in a single column
+        predictions <- pack_predictions(pred_raw)
+      } else if (outcome_type == "binomial") {
+        # Return P(Y = max level), consistent with Variable_Type$format
+        # and other binomial learners in sl3.
+        if (is.matrix(pred_raw)) {
+          levs <- private$.training_outcome_type$levels
+          target <- if (!is.null(levs) && length(levs) > 0) {
+            as.character(levs[[length(levs)]])
+          } else {
+            colnames(pred_raw)[ncol(pred_raw)]
+          }
 
-      if (private$.training_outcome_type$type == "categorical") {
-        # pack predictions in a single column
-        predictions <- pack_predictions(predictions)
+          if (!is.null(colnames(pred_raw)) && target %in% colnames(pred_raw)) {
+            predictions <- as.numeric(pred_raw[, target])
+          } else {
+            predictions <- as.numeric(pred_raw[, ncol(pred_raw)])
+          }
+        } else if (is.factor(pred_raw) || is.character(pred_raw)) {
+          levs <- private$.training_outcome_type$levels
+          target <- if (!is.null(levs) && length(levs) > 0) {
+            as.character(levs[[length(levs)]])
+          } else {
+            as.character(sort(unique(pred_raw))[length(unique(pred_raw))])
+          }
+          predictions <- as.numeric(as.character(pred_raw) == target)
+        } else {
+          predictions <- as.numeric(pred_raw)
+        }
+      } else {
+        predictions <- as.numeric(pred_raw)
       }
       return(predictions)
     },

From 4ec0b12b878d4db44ff99841e442254181725a50 Mon Sep 17 00:00:00 2001
From: Jesse Zhou <69114052+JesseZhou-1@users.noreply.github.com>
Date: Mon, 16 Mar 2026 02:26:48 -0500
Subject: [PATCH 24/24] Refactor ranger importance and prediction handling

---
 R/Lrnr_ranger.R | 49 +++++++++++++++++++++++--------------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/R/Lrnr_ranger.R b/R/Lrnr_ranger.R
index 416efea2..48c2fb56 100644
--- a/R/Lrnr_ranger.R
+++ b/R/Lrnr_ranger.R
@@ -95,16 +95,13 @@ Lrnr_ranger <- R6Class(
       }
       self$assert_trained()
 
-      # initiate argument list for ranger::importance
       args <- list(...)
       args$x <- self$fit_object
 
-      # calculate importance metrics
       importance_result <- call_with_args(ranger::importance, args,
         keep_all = TRUE
       )
 
-      # sort according to decreasing importance
       return(importance_result[order(importance_result, decreasing = TRUE)])
     }
   ),
@@ -118,25 +115,23 @@ Lrnr_ranger <- R6Class(
       if (task$has_node("weights")) {
         args$case.weights <- task$weights
       }
-      # Preserve raw data frame
+
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
       data_in <- cbind(task$Y, Xdf)
       colnames(data_in)[1] <- task$nodes$outcome
       args$data <- data_in
       args$dependent.variable.name <- task$nodes$outcome
-      # For binomial and categorical tasks, downstream sl3 expects probabilities.
+
       if (is.null(args$probability)) {
         args$probability <- task$outcome_type$type %in% c("binomial", "categorical")
       }
+
       fit_object <- call_with_args(ranger::ranger, args)
       return(fit_object)
     },
     .predict = function(task) {
-
-      # Preserve raw data frame
       Xdf <- task$get_data(columns = task$nodes$covariates, expand_factors = FALSE)
-      
-      # extract numeric predictions from custom class ranger.prediction
+
       pred_obj <- stats::predict(
         private$.fit_object,
         data = Xdf,
@@ -147,30 +142,32 @@ Lrnr_ranger <- R6Class(
 
       outcome_type <- private$.training_outcome_type$type
       if (outcome_type == "categorical") {
-        # pack K-class probabilities in a single column
         predictions <- pack_predictions(pred_raw)
       } else if (outcome_type == "binomial") {
-        # Return P(Y = max level), consistent with Variable_Type$format
-        # and other binomial learners in sl3.
-        if (is.matrix(pred_raw)) {
-          levs <- private$.training_outcome_type$levels
-          target <- if (!is.null(levs) && length(levs) > 0) {
-            as.character(levs[[length(levs)]])
-          } else {
-            colnames(pred_raw)[ncol(pred_raw)]
-          }
+        levs <- private$.training_outcome_type$levels
+        target <- if (!is.null(levs) && length(levs) > 0L) {
+          as.character(max(levs))
+        } else {
+          NULL
+        }
 
-          if (!is.null(colnames(pred_raw)) && target %in% colnames(pred_raw)) {
+        if (is.matrix(pred_raw)) {
+          if (!is.null(colnames(pred_raw)) && !is.null(target) && target %in% colnames(pred_raw)) {
             predictions <- as.numeric(pred_raw[, target])
+          } else if (!is.null(target) && !is.null(levs)) {
+            target_idx <- match(target, as.character(levs))
+            if (is.na(target_idx) || target_idx > ncol(pred_raw)) {
+              stop("Could not align ranger binomial prediction matrix to the sl3 event level.")
+            }
+            predictions <- as.numeric(pred_raw[, target_idx])
+          } else if (ncol(pred_raw) == 1L) {
+            predictions <- as.numeric(pred_raw[, 1])
           } else {
-            predictions <- as.numeric(pred_raw[, ncol(pred_raw)])
+            stop("Could not identify the binomial probability column from ranger predictions.")
           }
         } else if (is.factor(pred_raw) || is.character(pred_raw)) {
-          levs <- private$.training_outcome_type$levels
-          target <- if (!is.null(levs) && length(levs) > 0) {
-            as.character(levs[[length(levs)]])
-          } else {
-            as.character(sort(unique(pred_raw))[length(unique(pred_raw))])
+          if (is.null(target)) {
+            stop("Could not identify the sl3 event level for ranger binomial predictions.")
           }
           predictions <- as.numeric(as.character(pred_raw) == target)
         } else {