|
| 1 | +#' @title Safe Evaluation Utilities for Biomarker Validation |
| 2 | +#' |
| 3 | +#' @description |
| 4 | +#' Functions that enforce correct ROC computation and non-inferiority testing. |
| 5 | +#' These utilities prevent the most common evaluation bugs: |
| 6 | +#' hardcoded ROC direction (inverting AUC), wrong non-inferiority references, |
| 7 | +#' and unstable percentile estimates from insufficient random draws. |
| 8 | +#' |
| 9 | +#' @name safe-evaluation |
| 10 | +NULL |
| 11 | + |
| 12 | + |
#' @title Safe ROC Computation (Direction-Guarded)
#'
#' @description
#' Wrapper around \code{pROC::roc()} that enforces \code{direction = "auto"}
#' by default and warns loudly when direction is hardcoded.
#'
#' **Common bug this prevents**: Hardcoding \code{direction = "<"} when the
#' predictor's score direction depends on class weights, calibration, or
#' model type. With extreme class weighting (e.g., 17x), the positive class
#' can have LOWER predicted probabilities, inverting the AUC to 1 - true_AUC.
#'
#' @param response Binary outcome vector (0/1 or factor)
#' @param predictor Numeric prediction scores
#' @param direction ROC direction: one of \code{"auto"} (default, strongly
#'   recommended), \code{"<"}, or \code{">"}. Any other value is an error;
#'   a hardcoded \code{"<"} or \code{">"} triggers a warning.
#' @param levels Two-element vector specifying the levels of the response.
#'   Default \code{c(0, 1)}.
#' @param quiet Logical; suppress pROC messages. Default TRUE.
#' @param ... Additional arguments passed to \code{pROC::roc()}
#'
#' @return A pROC roc object
#'
#' @examples
#' \dontrun{
#' roc_obj <- safe_roc(y_true, y_pred)
#' auc_val <- as.numeric(pROC::auc(roc_obj))
#' }
#'
#' @export
safe_roc <- function(response, predictor, direction = "auto",
                     levels = c(0, 1), quiet = TRUE, ...) {

  if (!requireNamespace("pROC", quietly = TRUE)) {
    stop("Package 'pROC' is required. Install with: install.packages('pROC')", call. = FALSE)
  }

  # Validate up front: an unrecognized direction (e.g. "less") would
  # previously skip the warning below and be handed straight to pROC.
  direction <- match.arg(direction, c("auto", "<", ">"))

  if (direction != "auto") {
    warning(
      sprintf(
        "Hardcoded direction='%s' in ROC computation. ",
        direction
      ),
      "This can INVERT the AUC when class weights or calibration change the ",
      "score direction. Use direction='auto' unless you have verified the ",
      "predictor's score polarity. See: OmicSelector methodology audit, ",
      "pROC direction bug (miRPOC 2026-04-02).",
      call. = FALSE
    )
  }

  pROC::roc(
    response = response,
    predictor = predictor,
    direction = direction,
    levels = levels,
    quiet = quiet,
    ...
  )
}
| 72 | + |
| 73 | + |
#' @title Safe AUC with Confidence Interval
#'
#' @description
#' Computes AUC with DeLong confidence interval using \code{safe_roc()}.
#' Returns a named list with AUC, CI, and the ROC object.
#'
#' @param response Binary outcome vector (0/1)
#' @param predictor Numeric prediction scores
#' @param ci_method CI method: \code{"delong"} (default) or
#'   \code{"bootstrap"}. Any other value is an error.
#' @param ... Additional arguments passed to \code{safe_roc()}
#'
#' @return A list with:
#' \item{auc}{Numeric AUC value}
#' \item{ci_lower}{Lower bound of 95\% CI}
#' \item{ci_upper}{Upper bound of 95\% CI}
#' \item{roc}{The pROC roc object}
#' \item{direction}{Direction used by pROC}
#'
#' @export
safe_auc <- function(response, predictor, ci_method = "delong", ...) {

  # Reject typos like "deLong"/"boot" early rather than relying on
  # pROC's downstream handling of an unknown method.
  ci_method <- match.arg(ci_method, c("delong", "bootstrap"))

  roc_obj <- safe_roc(response, predictor, ...)
  auc_val <- as.numeric(pROC::auc(roc_obj))

  # ci.auc returns c(lower, median/point, upper); we keep the bounds only.
  ci_obj <- pROC::ci.auc(roc_obj, method = ci_method)

  list(
    auc = auc_val,
    ci_lower = as.numeric(ci_obj[1]),
    ci_upper = as.numeric(ci_obj[3]),
    roc = roc_obj,
    direction = roc_obj$direction
  )
}
| 107 | + |
| 108 | + |
#' @title Paired DeLong Non-Inferiority Test
#'
#' @description
#' Tests whether a candidate model's AUC is non-inferior to a reference AUC,
#' using a paired DeLong test on the same predictions.
#'
#' **Common bugs this prevents**:
#' \enumerate{
#'   \item Using the wrong reference AUC (e.g., superseded Phase 2.1 instead of
#'     current Phase 2.1b)
#'   \item Using an unpaired CI check instead of a formal paired DeLong test
#'   \item Comparing AUCs from different sample denominators
#' }
#'
#' @param response Binary outcome vector (0/1), same for both models
#' @param predictor_candidate Candidate model predictions
#' @param predictor_reference Reference model predictions. Must be on the
#'   SAME samples as predictor_candidate.
#' @param delta Non-inferiority margin (positive value, e.g., 0.03).
#'   Candidate is non-inferior if its AUC is within delta of the reference.
#' @param alpha Significance level (default 0.05 for one-sided test)
#'
#' @return A list with:
#' \item{non_inferior}{Logical: TRUE if candidate is non-inferior}
#' \item{auc_candidate}{Candidate AUC}
#' \item{auc_reference}{Reference AUC}
#' \item{delta_auc}{Candidate - Reference AUC}
#' \item{delta}{Non-inferiority margin used}
#' \item{se_diff}{Standard error of AUC difference (paired DeLong)}
#' \item{z_stat}{Z-statistic for non-inferiority}
#' \item{p_value}{One-sided p-value}
#' \item{ci_lower_diff}{One-sided lower (1 - alpha) confidence bound for the
#'   AUC difference; non-inferiority holds when this bound exceeds -delta}
#'
#' @examples
#' \dontrun{
#' result <- test_noninferiority(
#'   response = y_test,
#'   predictor_candidate = pred_reduced_panel,
#'   predictor_reference = pred_full_panel,
#'   delta = 0.03
#' )
#' cat("Non-inferior:", result$non_inferior, "\n")
#' }
#'
#' @export
test_noninferiority <- function(response, predictor_candidate, predictor_reference,
                                delta = 0.03, alpha = 0.05) {

  if (!requireNamespace("pROC", quietly = TRUE)) {
    stop("Package 'pROC' is required.", call. = FALSE)
  }

  if (length(response) != length(predictor_candidate) ||
      length(response) != length(predictor_reference)) {
    stop("response, predictor_candidate, and predictor_reference must have equal length",
         call. = FALSE)
  }

  if (delta <= 0) {
    stop("delta must be positive (e.g., 0.03)", call. = FALSE)
  }

  # Compute ROC objects (direction = auto for both)
  roc_cand <- safe_roc(response, predictor_candidate)
  roc_ref <- safe_roc(response, predictor_reference)

  auc_cand <- as.numeric(pROC::auc(roc_cand))
  auc_ref <- as.numeric(pROC::auc(roc_ref))
  delta_auc <- auc_cand - auc_ref

  # Paired DeLong SE of the AUC difference, computed directly from the
  # DeLong variance/covariance estimators:
  #   Var(AUC_c - AUC_r) = Var(AUC_c) + Var(AUC_r) - 2 Cov(AUC_c, AUC_r)
  # This is robust even when delta_auc == 0, where the previous approach of
  # back-solving SE from the roc.test statistic (|diff / statistic|)
  # degenerated to 0/0.
  var_diff <- tryCatch(
    pROC::var(roc_cand, method = "delong") +
      pROC::var(roc_ref, method = "delong") -
      2 * pROC::cov(roc_cand, roc_ref, method = "delong"),
    error = function(e) NA_real_
  )
  se_diff <- if (is.finite(var_diff) && var_diff > 0) sqrt(var_diff) else NA_real_

  # Fallback: recover SE from the paired DeLong test statistic
  # (statistic = (AUC1 - AUC2) / SE, so SE = |diff| / |statistic|).
  if (!is.finite(se_diff) || se_diff == 0) {
    delong_test <- pROC::roc.test(roc_ref, roc_cand, method = "delong", paired = TRUE)
    se_diff <- abs(delta_auc / as.numeric(delong_test$statistic))
    if (!is.finite(se_diff) || se_diff == 0) se_diff <- NA_real_
  }

  # Non-inferiority test: H0: AUC_cand - AUC_ref <= -delta
  #                       H1: AUC_cand - AUC_ref >  -delta
  # Z = (delta_auc + delta) / SE; reject H0 for large Z (one-sided).
  if (is.finite(se_diff) && se_diff > 0) {
    z_stat <- (delta_auc + delta) / se_diff
    p_value <- pnorm(z_stat, lower.tail = FALSE)  # one-sided P(Z > z)
    # One-sided lower confidence bound for the AUC difference; compare
    # against -delta for an equivalent non-inferiority decision.
    ci_lower_diff <- delta_auc - qnorm(1 - alpha) * se_diff
  } else {
    z_stat <- NA_real_
    p_value <- NA_real_
    ci_lower_diff <- NA_real_
  }

  non_inferior <- !is.na(p_value) && p_value < alpha

  list(
    non_inferior = non_inferior,
    auc_candidate = auc_cand,
    auc_reference = auc_ref,
    delta_auc = delta_auc,
    delta = delta,
    se_diff = as.numeric(se_diff),
    z_stat = as.numeric(z_stat),
    p_value = as.numeric(p_value),
    ci_lower_diff = as.numeric(ci_lower_diff)
  )
}
| 223 | + |
| 224 | + |
#' @title Validate Random-Panel Null Benchmark
#'
#' @description
#' Checks whether the number of random draws is sufficient for a stable
#' percentile estimate. The minimum recommended count is
#' \code{ceiling(20 / (1 - percentile))}, i.e. roughly 20 expected draws in
#' the tail beyond the target percentile (400 draws for the 95th percentile).
#' A warning is issued when \code{n_draws} falls below this threshold.
#'
#' @param n_draws Number of random panel draws performed (single
#'   non-negative number)
#' @param percentile The percentile being estimated, strictly between 0 and 1
#'   (e.g., 0.95)
#'
#' @return Invisible TRUE if adequate, FALSE with warning otherwise
#'
#' @export
check_null_benchmark_draws <- function(n_draws, percentile = 0.95) {
  # Guard the inputs: percentile == 1 would divide by zero below, and
  # values outside (0, 1) produce a meaningless threshold.
  stopifnot(
    is.numeric(n_draws), length(n_draws) == 1L, is.finite(n_draws),
    n_draws >= 0,
    is.numeric(percentile), length(percentile) == 1L,
    percentile > 0, percentile < 1
  )

  # Order statistic SE approximation: require ~20 expected draws beyond
  # the target percentile (ceiling(20 / 0.05) = 400 for the 95th pctile).
  min_recommended <- ceiling(20 / (1 - percentile))

  if (n_draws < min_recommended) {
    warning(
      sprintf(
        "Only %d draws for %.0fth percentile estimate (recommend >= %d). ",
        n_draws, percentile * 100, min_recommended
      ),
      "The Monte Carlo error on this order statistic is non-trivial. ",
      "Consider rerunning with more draws or reporting a bootstrap CI ",
      "on the percentile.",
      call. = FALSE
    )
    return(invisible(FALSE))
  }
  invisible(TRUE)
}
0 commit comments