From 42bf377a3ff7b9d2dcfe3041135da8102fef5711 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 19:35:26 +0100 Subject: [PATCH 01/33] test(red): add failing tests for impute_missing_scores and strategy factories Tests for impute_missing_scores(), impute_worst_score(), impute_mean_score(), impute_na_score(), impute_model_score(), and integration with summarise_scores. --- tests/testthat/test-filter-missing-scores.R | 158 +++++++++++ tests/testthat/test-impute-missing-scores.R | 285 ++++++++++++++++++++ 2 files changed, 443 insertions(+) create mode 100644 tests/testthat/test-filter-missing-scores.R create mode 100644 tests/testthat/test-impute-missing-scores.R diff --git a/tests/testthat/test-filter-missing-scores.R b/tests/testthat/test-filter-missing-scores.R new file mode 100644 index 000000000..33929f887 --- /dev/null +++ b/tests/testthat/test-filter-missing-scores.R @@ -0,0 +1,158 @@ +# ============================================================================== +# build_missing_grid() +# ============================================================================== +test_that("build_missing_grid() returns correct missing rows", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + missing <- build_missing_grid(scores, compare = "model") + expect_s3_class(missing, "data.table") + expect_equal(nrow(missing), 1) + expect_equal(missing$model, "B") + expect_equal(missing$location, "US") +}) + +test_that("build_missing_grid() returns zero rows when nothing missing", { + scores <- data.table::data.table( + model = c("A", "A", "B", "B"), + location = c("DE", "US", "DE", "US"), + wis = c(1, 2, 3, 4) + ) + scores <- new_scores(scores, "wis") + missing <- build_missing_grid(scores, compare = "model") + expect_equal(nrow(missing), 0) +}) + +test_that( + "build_missing_grid() uses observed target combinations", { + # If we had locations DE, US and dates Mon, Tue but only + # (DE, Mon) and (US, Tue) are observed, the grid should NOT + + # include (DE, Tue) or (US, Mon) + scores <- data.table::data.table( + model = c("A", "A", "B", "B"), + location = c("DE", "US", "DE", "US"), + date = c("Mon", "Tue", "Mon", "Tue"), + wis = c(1, 2, 3, 4) + ) + scores <- new_scores(scores, "wis") + missing <- build_missing_grid(scores, compare = "model") + expect_equal(nrow(missing), 0) + + # Now remove one combo - B should be missing (US, Tue) + scores2 <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + date = c("Mon", "Tue", "Mon"), + wis = c(1, 2, 3) + ) + scores2 <- new_scores(scores2, "wis") + missing2 <- build_missing_grid(scores2, compare = "model") + expect_equal(nrow(missing2), 1) + expect_equal(missing2$model, "B") + expect_equal(missing2$location, "US") + expect_equal(missing2$date, "Tue") +}) + + +# ============================================================================== +# filter_missing_scores() +# ============================================================================== +test_that( + "filter_missing_scores() with default strategy drops incomplete", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + result <- filter_missing_scores(scores) + # Only DE should remain (both models have it) + expect_equal(nrow(result), 2) + expect_true(all(result$location == "DE")) +}) + +test_that( + "filter_missing_scores() preserves scores class and metrics", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + result <- filter_missing_scores(scores) + expect_s3_class(result, "scores") + expect_equal(attr(result, "metrics"), "wis") +}) + +test_that( + "filter_missing_scores() returns unchanged when nothing missing", { + scores <- data.table::data.table( + model = c("A", "A", "B", "B"), + location = c("DE", "US", "DE", "US"), + wis = c(1, 2, 3, 4) + ) + scores <- new_scores(scores, "wis") + expect_message( + result <- filter_missing_scores(scores), + "No missing" + ) + expect_equal(nrow(result), 4) +}) + + +# ============================================================================== +# filter_to_intersection() +# ============================================================================== +test_that( + "filter_to_intersection(min_coverage = 0.5) keeps partial", { + scores <- data.table::data.table( + model = c("A", "A", "A", "B", "C"), + location = c("DE", "US", "FR", "DE", "DE"), + wis = c(1, 2, 3, 4, 5) + ) + scores <- new_scores(scores, "wis") + strategy <- filter_to_intersection(min_coverage = 0.5) + result <- strategy(scores, compare = "model") + # DE covered by 3/3 = 1.0, US by 1/3 = 0.33, FR by 1/3 = 0.33 + # At min_coverage = 0.5, only DE qualifies + expect_true(all(result$location == "DE")) + + strategy2 <- filter_to_intersection(min_coverage = 1 / 3) + result2 <- strategy2(scores, compare = "model") + # All locations have coverage >= 1/3 + expect_equal(nrow(result2), 5) +}) + +test_that( + "filter_to_intersection(models = 'model1') keeps that model's targets", { + scores <- data.table::data.table( + model = c("m1", "m1", "m2", "m2", "m3"), + location = c("DE", "US", "DE", "FR", "DE"), + wis = c(1, 2, 3, 4, 5) + ) + scores <- new_scores(scores, "wis") + strategy <- filter_to_intersection(models = "m1") + result <- strategy(scores, compare = "model") + # m1 covers DE and US, so keep all rows with DE or US + expect_true(all(result$location %in% c("DE", "US"))) + # FR should be dropped + expect_false("FR" %in% result$location) +}) + +test_that( + "filter_to_intersection(models = c('m1', 'm2')) keeps intersection", { + scores <- data.table::data.table( + model = c("m1", "m1", "m2", "m2", "m3"), + location = c("DE", "US", "DE", "FR", "DE"), + wis = c(1, 2, 3, 4, 5) + ) + scores <- new_scores(scores, "wis") + strategy <- filter_to_intersection(models = c("m1", "m2")) + result <- strategy(scores, compare = "model") + # m1 covers DE, US; m2 covers DE, FR; intersection = DE + expect_true(all(result$location == "DE")) +}) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R new file mode 100644 index 000000000..5d3f93c0f --- /dev/null +++ b/tests/testthat/test-impute-missing-scores.R @@ -0,0 +1,285 @@ +# ============================================================================== +# impute_missing_scores() +# ============================================================================== +test_that( + "impute_missing_scores adds .imputed = FALSE when nothing missing", + { + scores <- scores_quantile + result <- impute_missing_scores( + scores, strategy = impute_na_score() + ) + expect_true(".imputed" %in% names(result)) + expect_true(all(result$.imputed == FALSE)) + } +) + +test_that( + "impute_missing_scores preserves scores class and metrics", + { + scores <- scores_quantile + result <- impute_missing_scores( + scores, strategy = impute_na_score() + ) + expect_s3_class(result, "scores") + expect_identical( + get_metrics.scores(result), + get_metrics.scores(scores) + ) + } +) + +test_that(".imputed is not in get_metrics.scores output", { + scores <- scores_quantile + result <- impute_missing_scores( + scores, strategy = impute_na_score() + ) + metrics <- get_metrics.scores(result) + expect_false(".imputed" %in% metrics) +}) + +test_that(".imputed is not in get_forecast_unit output", { + scores <- scores_quantile + result <- impute_missing_scores( + scores, strategy = impute_na_score() + ) + fu <- get_forecast_unit(result) + expect_false(".imputed" %in% fu) +}) + +# ============================================================================== +# Strategy factories with missing data +# ============================================================================== + +# Helper to create scores with missing entries +make_scores_with_missing <- function() { + scores <- data.table::copy(scores_quantile) + metrics <- get_metrics.scores(scores) + fu <- get_forecast_unit(scores) + + # Remove some rows for one model to create missingness + models <- unique(scores$model) + target_model <- models[1] + + # Remove the first few unique target combos for that model + target_cols <- setdiff(fu, "model") + targets <- unique( + scores[, target_cols, with = FALSE] + ) + remove_targets <- targets[1:3] + + # Anti-join to remove those rows + scores_reduced <- scores[ + !remove_targets, on = target_cols + ] + # Also remove those targets for the target model only + # to ensure only that model is missing + # Actually, let's be more precise: remove rows for + # target_model matching those targets + keep <- scores[ + !(model == target_model & + scores[remove_targets, on = target_cols, which = TRUE, + nomatch = NULL] |> + (\(x) seq_len(nrow(scores)) %in% x)()), + ] + + # Simpler approach: just remove specific rows + idx <- scores[model == target_model, which = TRUE] + remove_idx <- idx[1:min(5, length(idx))] + result <- scores[-remove_idx] + return(result) +} + +test_that("impute_worst_score fills with max observed score", { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- make_scores_with_missing() + metrics <- get_metrics.scores(scores) + result <- impute_missing_scores( + scores, strategy = impute_worst_score() + ) + # Imputed rows should exist + + imputed <- result[.imputed == TRUE] + if (nrow(imputed) > 0) { + # Each imputed metric value should be <= max of that + # metric across all original data + for (m in metrics) { + if (m %in% names(imputed) && m %in% names(scores)) { + max_val <- max(scores[[m]], na.rm = TRUE) + expect_true( + all( + imputed[[m]] <= max_val | is.na(imputed[[m]]) + ), + info = paste("metric", m, "exceeds max") + ) + } + } + } +}) + +test_that("impute_mean_score fills with mean observed score", { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- make_scores_with_missing() + result <- impute_missing_scores( + scores, strategy = impute_mean_score() + ) + imputed <- result[.imputed == TRUE] + expect_true(nrow(imputed) >= 0) +}) + +test_that("impute_na_score fills with NA_real_", { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- make_scores_with_missing() + metrics <- get_metrics.scores(scores) + result <- impute_missing_scores( + scores, strategy = impute_na_score() + ) + imputed <- result[.imputed == TRUE] + if (nrow(imputed) > 0) { + for (m in metrics) { + if (m %in% names(imputed)) { + expect_true( + all(is.na(imputed[[m]])), + info = paste("metric", m, "should be NA") + ) + } + } + } +}) + +test_that( + "impute_model_score fills with reference model scores", + { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- make_scores_with_missing() + models <- unique(scores$model) + # Use a model that is NOT the one with missing data + # (first model had rows removed) + ref_model <- models[2] + result <- impute_missing_scores( + scores, + strategy = impute_model_score(ref_model) + ) + expect_s3_class(result, "scores") + } +) + +test_that( + "impute_model_score errors when ref model missing target", + { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- make_scores_with_missing() + models <- unique(scores$model) + # The first model has missing entries, so using it as + # reference should error if it's missing those targets + target_model <- models[1] + # This should error because target_model is the one + # missing forecasts + expect_error( + impute_missing_scores( + scores, + strategy = impute_model_score(target_model) + ) + ) + } +) + +test_that("custom strategy function works", { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + custom_strategy <- function(scores, missing_rows, metrics) { + for (m in metrics) { + data.table::set(missing_rows, j = m, value = 999) + } + return(missing_rows) + } + scores <- make_scores_with_missing() + metrics <- get_metrics.scores(scores) + result <- impute_missing_scores( + scores, strategy = custom_strategy + ) + imputed <- result[.imputed == TRUE] + if (nrow(imputed) > 0) { + for (m in metrics) { + if (m %in% names(imputed)) { + expect_true(all(imputed[[m]] == 999)) + } + } + } +}) + +# ============================================================================== +# Integration tests +# ============================================================================== +test_that( + "impute_missing_scores |> summarise_scores works", + { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- make_scores_with_missing() + result <- scores |> + impute_missing_scores(strategy = impute_na_score()) |> + summarise_scores(by = "model") + expect_s3_class(result, "data.table") + } +) + +test_that( + "filter then impute pipeline works", + { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + skip_if_not( + exists("filter_missing_scores", + where = asNamespace("scoringutils")), + "filter_missing_scores not yet available" + ) + skip_if_not( + exists("filter_to_intersection", + where = asNamespace("scoringutils")), + "filter_to_intersection not yet available" + ) + scores <- make_scores_with_missing() + models <- unique(scores$model) + ref_model <- models[2] + + result <- scores |> + filter_missing_scores( + strategy = filter_to_intersection( + models = ref_model + ) + ) |> + impute_missing_scores( + strategy = impute_model_score(ref_model) + ) + expect_s3_class(result, "scores") + } +) From 9342685bdbd4adcbf3654f9b12786a287c275230 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 19:54:51 +0100 Subject: [PATCH 02/33] feat: implement build_missing_grid, filter_missing_scores, and filter_to_intersection Add shared internal build_missing_grid() for detecting missing model-target combinations. Add filter_missing_scores() with strategy pattern and filter_to_intersection() strategy factory. Add .imputed to protected columns and globalVariables. Update test for protected columns. All tests green. Ref #1122 --- NAMESPACE | 8 + R/filter-missing-scores.R | 163 ++++++++++++++++++++ R/get-protected-columns.R | 1 + R/missing-scores-internal.R | 49 ++++++ R/z-globalVariables.R | 2 + man/build_missing_grid.Rd | 26 ++++ man/filter_missing_scores.Rd | 35 +++++ man/filter_to_intersection.Rd | 29 ++++ tests/testthat/test-filter-missing-scores.R | 28 ++-- tests/testthat/test-get-protected-columns.R | 4 +- 10 files changed, 327 insertions(+), 18 deletions(-) create mode 100644 R/filter-missing-scores.R create mode 100644 R/missing-scores-internal.R create mode 100644 man/build_missing_grid.Rd create mode 100644 man/filter_missing_scores.Rd create mode 100644 man/filter_to_intersection.Rd diff --git a/NAMESPACE b/NAMESPACE index ca5534b46..307da4447 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,6 +72,8 @@ export(dispersion_quantile) export(dispersion_sample) export(dss_sample) export(energy_score_multivariate) +export(filter_missing_scores) +export(filter_to_intersection) export(get_correlations) export(get_coverage) export(get_duplicate_forecasts) @@ -81,6 +83,11 @@ export(get_grouping) export(get_metrics) export(get_pairwise_comparisons) export(get_pit_histogram) +export(impute_mean_score) +export(impute_missing_scores) +export(impute_model_score) +export(impute_na_score) +export(impute_worst_score) export(interval_coverage) export(is_forecast) export(is_forecast_binary) @@ -170,6 +177,7 @@ importFrom(data.table,key) importFrom(data.table,melt) importFrom(data.table,nafill) importFrom(data.table,rbindlist) +importFrom(data.table,set) importFrom(data.table,setDT) importFrom(data.table,setattr) importFrom(data.table,setcolorder) diff --git a/R/filter-missing-scores.R b/R/filter-missing-scores.R new file mode 100644 index 000000000..c1d018b3e --- /dev/null +++ b/R/filter-missing-scores.R @@ -0,0 +1,163 @@ +#' @title Filter scores with missing model-target combinations +#' +#' @description +#' Filters a `scores` object to remove target combinations where +#' one or more models have missing scores. +#' The filtering behaviour is controlled by the `strategy` +#' argument, which defaults to [filter_to_intersection()]. +#' +#' @param scores An object of class `scores` (a data.table with +#' scores and an additional attribute `metrics` as produced +#' by [score()]). +#' @param strategy A strategy function as returned by +#' [filter_to_intersection()]. Default is +#' `filter_to_intersection()`. +#' @param compare Character string (default `"model"`) naming the +#' column whose values are compared for missingness. +#' +#' @return A filtered `scores` object with the same class and +#' `metrics` attribute as the input. +#' +#' @importFrom cli cli_inform +#' @importFrom checkmate assert_class assert_character +#' assert_function +#' @export +#' @keywords handle-metrics +filter_missing_scores <- function( + scores, + strategy = filter_to_intersection(), + compare = "model" +) { + assert_class(scores, "scores") + assert_character(compare, len = 1) + assert_function(strategy) + + original_class <- class(scores) + original_metrics <- attr(scores, "metrics") + + #nolint start: object_usage_linter + missing <- build_missing_grid(scores, compare = compare) + #nolint end + + if (nrow(missing) == 0) { + #nolint start: keyword_quote_linter + cli_inform(c( + "i" = "No missing score combinations found. Returning + scores unchanged." + )) + #nolint end + return(scores) + } + + result <- strategy(scores, compare = compare) + + n_before <- nrow(scores) + n_after <- nrow(result) + #nolint start: object_usage_linter + n_dropped <- n_before - n_after + #nolint end + + #nolint start: keyword_quote_linter + cli_inform(c( + "i" = "Filtered out {n_dropped} rows with missing + {compare} combinations.", + "i" = "{n_after} of {n_before} rows remaining." + )) + #nolint end + + # Preserve class and metrics + class(result) <- original_class + data.table::setattr(result, "metrics", original_metrics) + + return(result) +} + + +#' @title Filter to intersection of model-target combinations +#' +#' @description +#' Strategy factory for [filter_missing_scores()]. +#' Returns a function that keeps only target combinations +#' covered by a minimum proportion of models. +#' +#' @param min_coverage Numeric between 0 and 1 (default `1`). +#' Minimum proportion of models that must cover a target +#' combination for it to be kept. +#' @param models Character vector or `NULL` (default). If +#' provided, the target grid is restricted to targets covered +#' by these models. When multiple models are specified, only +#' the intersection of their targets is used. +#' +#' @return A function with signature `function(scores, compare)` +#' suitable for use as a strategy in +#' [filter_missing_scores()]. +#' +#' @importFrom data.table as.data.table setkeyv +#' @importFrom checkmate assert_number assert_character +#' @export +#' @keywords handle-metrics +filter_to_intersection <- function( + min_coverage = 1, + models = NULL +) { + assert_number(min_coverage, lower = 0, upper = 1) + if (!is.null(models)) { + assert_character(models, min.len = 1) + } + + function(scores, compare = "model") { + scores <- data.table::as.data.table(scores) + forecast_unit <- get_forecast_unit(scores) + target_cols <- setdiff(forecast_unit, compare) + + if (!is.null(models)) { + # Restrict to targets covered by specified models + model_targets <- lapply(models, function(m) { + unique( + scores[ + scores[[compare]] == m, + target_cols, + with = FALSE + ] + ) + }) + # Intersection of all specified models' targets + qualifying <- model_targets[[1]] + if (length(model_targets) > 1) { + for (i in seq(2, length(model_targets))) { + data.table::setkeyv(qualifying, target_cols) + data.table::setkeyv( + model_targets[[i]], target_cols + ) + qualifying <- merge( + qualifying, model_targets[[i]], + by = target_cols + ) + } + } + } else { + # Count models per target combination + all_models <- unique(scores[[compare]]) + n_total <- length(all_models) + + target_coverage <- scores[ + , .(n_models = data.table::uniqueN(get(compare))), + by = target_cols + ] + #nolint start: object_usage_linter + qualifying <- target_coverage[ + n_models / n_total >= min_coverage, + #nolint end + target_cols, + with = FALSE + ] + } + + # Semi-join: keep scores rows matching qualifying targets + data.table::setkeyv(scores, target_cols) + data.table::setkeyv(qualifying, target_cols) + result <- scores[qualifying, nomatch = NULL] + + return(result) + } +} diff --git a/R/get-protected-columns.R b/R/get-protected-columns.R index 0cf59bb39..839d24861 100644 --- a/R/get-protected-columns.R +++ b/R/get-protected-columns.R @@ -15,6 +15,7 @@ get_protected_columns <- function(data = NULL) { protected_columns <- c( ".mv_group_id", + ".imputed", "predicted", "observed", "sample_id", "quantile_level", "upper", "lower", "pit_value", "interval_range", "boundary", "predicted_label", "interval_coverage", "interval_coverage_deviation", diff --git a/R/missing-scores-internal.R b/R/missing-scores-internal.R new file mode 100644 index 000000000..30cce93aa --- /dev/null +++ b/R/missing-scores-internal.R @@ -0,0 +1,49 @@ +#' Build grid of missing model-target combinations +#' +#' @description +#' Internal function that detects missing model x target +#' combinations by comparing observed data against the complete +#' grid of all compare-column values crossed with all observed +#' target combinations. +#' +#' @param scores A `scores` object (data.table with a `metrics` +#' attribute as produced by [score()]). +#' @param compare Character string (default `"model"`) naming the +#' column whose values are compared against each target. +#' +#' @return A `data.table` with forecast-unit columns for each +#' missing combination. Zero rows if nothing is missing. +#' +#' @importFrom data.table as.data.table setkeyv rbindlist copy +#' @keywords internal +build_missing_grid <- function(scores, compare = "model") { + scores <- data.table::as.data.table(scores) + forecast_unit <- get_forecast_unit(scores) + target_cols <- setdiff(forecast_unit, compare) + + # Observed target combinations (NOT per-column expand.grid) + targets <- unique(scores[, target_cols, with = FALSE]) + + # All unique compare values + compare_vals <- unique(scores[[compare]]) + + # Complete grid: every compare value x every observed target + complete <- data.table::rbindlist(lapply( + compare_vals, + function(val) { + out <- data.table::copy(targets) + out[, (compare) := val] + out + } + )) + + # Observed combinations (forecast unit cols only) + observed <- unique(scores[, forecast_unit, with = FALSE]) + + # Anti-join: rows in complete but not in observed + data.table::setkeyv(complete, forecast_unit) + data.table::setkeyv(observed, forecast_unit) + missing <- complete[!observed] + + return(missing) +} diff --git a/R/z-globalVariables.R b/R/z-globalVariables.R index 881a69a19..e9f2ffeca 100644 --- a/R/z-globalVariables.R +++ b/R/z-globalVariables.R @@ -11,6 +11,7 @@ globalVariables(c( ".BY", ".GRP", ".SD", + ".imputed", ".mv_group_id", ".scoringutils_N", ".scoringutils_count", @@ -51,6 +52,7 @@ globalVariables(c( "metrics", "metrics_select", "model", + "n_models", "n_obs", "n_obs wis_component_name", "observed", diff --git a/man/build_missing_grid.Rd b/man/build_missing_grid.Rd new file mode 100644 index 000000000..f0e770800 --- /dev/null +++ b/man/build_missing_grid.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/missing-scores-internal.R +\name{build_missing_grid} +\alias{build_missing_grid} +\title{Build grid of missing model-target combinations} +\usage{ +build_missing_grid(scores, compare = "model") +} +\arguments{ +\item{scores}{A \code{scores} object (data.table with a \code{metrics} +attribute as produced by \code{\link[=score]{score()}}).} + +\item{compare}{Character string (default \code{"model"}) naming the +column whose values are compared against each target.} +} +\value{ +A \code{data.table} with forecast-unit columns for each +missing combination. Zero rows if nothing is missing. +} +\description{ +Internal function that detects missing model x target +combinations by comparing observed data against the complete +grid of all compare-column values crossed with all observed +target combinations. +} +\keyword{internal} diff --git a/man/filter_missing_scores.Rd b/man/filter_missing_scores.Rd new file mode 100644 index 000000000..554778536 --- /dev/null +++ b/man/filter_missing_scores.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter-missing-scores.R +\name{filter_missing_scores} +\alias{filter_missing_scores} +\title{Filter scores with missing model-target combinations} +\usage{ +filter_missing_scores( + scores, + strategy = filter_to_intersection(), + compare = "model" +) +} +\arguments{ +\item{scores}{An object of class \code{scores} (a data.table with +scores and an additional attribute \code{metrics} as produced +by \code{\link[=score]{score()}}).} + +\item{strategy}{A strategy function as returned by +\code{\link[=filter_to_intersection]{filter_to_intersection()}}. Default is +\code{filter_to_intersection()}.} + +\item{compare}{Character string (default \code{"model"}) naming the +column whose values are compared for missingness.} +} +\value{ +A filtered \code{scores} object with the same class and +\code{metrics} attribute as the input. +} +\description{ +Filters a \code{scores} object to remove target combinations where +one or more models have missing scores. +The filtering behaviour is controlled by the \code{strategy} +argument, which defaults to \code{\link[=filter_to_intersection]{filter_to_intersection()}}. +} +\keyword{handle-metrics} diff --git a/man/filter_to_intersection.Rd b/man/filter_to_intersection.Rd new file mode 100644 index 000000000..f9db1928e --- /dev/null +++ b/man/filter_to_intersection.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter-missing-scores.R +\name{filter_to_intersection} +\alias{filter_to_intersection} +\title{Filter to intersection of model-target combinations} +\usage{ +filter_to_intersection(min_coverage = 1, models = NULL) +} +\arguments{ +\item{min_coverage}{Numeric between 0 and 1 (default \code{1}). +Minimum proportion of models that must cover a target +combination for it to be kept.} + +\item{models}{Character vector or \code{NULL} (default). If +provided, the target grid is restricted to targets covered +by these models. When multiple models are specified, only +the intersection of their targets is used.} +} +\value{ +A function with signature \verb{function(scores, compare)} +suitable for use as a strategy in +\code{\link[=filter_missing_scores]{filter_missing_scores()}}. +} +\description{ +Strategy factory for \code{\link[=filter_missing_scores]{filter_missing_scores()}}. +Returns a function that keeps only target combinations +covered by a minimum proportion of models. +} +\keyword{handle-metrics} diff --git a/tests/testthat/test-filter-missing-scores.R b/tests/testthat/test-filter-missing-scores.R index 33929f887..b10dad6a6 100644 --- a/tests/testthat/test-filter-missing-scores.R +++ b/tests/testthat/test-filter-missing-scores.R @@ -26,11 +26,9 @@ test_that("build_missing_grid() returns zero rows when nothing missing", { expect_equal(nrow(missing), 0) }) -test_that( - "build_missing_grid() uses observed target combinations", { +test_that("build_missing_grid() uses observed target combos", { # If we had locations DE, US and dates Mon, Tue but only # (DE, Mon) and (US, Tue) are observed, the grid should NOT - # include (DE, Tue) or (US, Mon) scores <- data.table::data.table( model = c("A", "A", "B", "B"), @@ -61,8 +59,7 @@ test_that( # ============================================================================== # filter_missing_scores() # ============================================================================== -test_that( - "filter_missing_scores() with default strategy drops incomplete", { +test_that("filter_missing_scores() drops incomplete targets", { scores <- data.table::data.table( model = c("A", "A", "B"), location = c("DE", "US", "DE"), @@ -75,8 +72,7 @@ test_that( expect_true(all(result$location == "DE")) }) -test_that( - "filter_missing_scores() preserves scores class and metrics", { +test_that("filter_missing_scores() preserves class and metrics", { scores <- data.table::data.table( model = c("A", "A", "B"), location = c("DE", "US", "DE"), @@ -88,8 +84,7 @@ test_that( expect_equal(attr(result, "metrics"), "wis") }) -test_that( - "filter_missing_scores() returns unchanged when nothing missing", { +test_that("filter_missing_scores() unchanged when nothing missing", { scores <- data.table::data.table( model = c("A", "A", "B", "B"), location = c("DE", "US", "DE", "US"), @@ -107,8 +102,7 @@ test_that( # ============================================================================== # filter_to_intersection() # ============================================================================== -test_that( - "filter_to_intersection(min_coverage = 0.5) keeps partial", { +test_that("filter_to_intersection(min_coverage=0.5) works", { scores <- data.table::data.table( model = c("A", "A", "A", "B", "C"), location = c("DE", "US", "FR", "DE", "DE"), @@ -117,7 +111,7 @@ test_that( scores <- new_scores(scores, "wis") strategy <- filter_to_intersection(min_coverage = 0.5) result <- strategy(scores, compare = "model") - # DE covered by 3/3 = 1.0, US by 1/3 = 0.33, FR by 1/3 = 0.33 + # DE covered by 3/3, US by 1/3, FR by 1/3 # At min_coverage = 0.5, only DE qualifies expect_true(all(result$location == "DE")) @@ -127,8 +121,7 @@ test_that( expect_equal(nrow(result2), 5) }) -test_that( - "filter_to_intersection(models = 'model1') keeps that model's targets", { +test_that("filter_to_intersection(models) keeps targets", { scores <- data.table::data.table( model = c("m1", "m1", "m2", "m2", "m3"), location = c("DE", "US", "DE", "FR", "DE"), @@ -143,15 +136,16 @@ test_that( expect_false("FR" %in% result$location) }) -test_that( - "filter_to_intersection(models = c('m1', 'm2')) keeps intersection", { +test_that("filter_to_intersection(models=c()) intersects", { scores <- data.table::data.table( model = c("m1", "m1", "m2", "m2", "m3"), location = c("DE", "US", "DE", "FR", "DE"), wis = c(1, 2, 3, 4, 5) ) scores <- new_scores(scores, "wis") - strategy <- filter_to_intersection(models = c("m1", "m2")) + strategy <- filter_to_intersection( + models = c("m1", "m2") + ) result <- strategy(scores, compare = "model") # m1 covers DE, US; m2 covers DE, FR; intersection = DE expect_true(all(result$location == "DE")) diff --git a/tests/testthat/test-get-protected-columns.R b/tests/testthat/test-get-protected-columns.R index 896dfa6f9..729a5b54b 100644 --- a/tests/testthat/test-get-protected-columns.R +++ b/tests/testthat/test-get-protected-columns.R @@ -6,9 +6,11 @@ test_that("get_protected_columns() works as expected", { get_protected_columns(), c( ".mv_group_id", + ".imputed", "predicted", "observed", "sample_id", "quantile_level", "upper", "lower", "pit_value", - "interval_range", "boundary", "predicted_label", "interval_coverage", + "interval_range", "boundary", "predicted_label", + "interval_coverage", "interval_coverage_deviation", "quantile_coverage", "quantile_coverage_deviation" ) From 8d812908d071ecab3bdebe2ff9757de05c36fab9 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 20:57:21 +0100 Subject: [PATCH 03/33] feat(green): implement impute_missing_scores and strategy factories Add impute_missing_scores() with .imputed column tracking, plus four strategy factories: impute_worst_score(), impute_mean_score(), impute_na_score(), and impute_model_score(). Fix data.table scoping issues in closures by using explicit namespacing and avoiding variable name collisions with column names. --- R/impute-missing-scores.R | 307 ++++++++++++++++++++ tests/testthat/test-impute-missing-scores.R | 99 +++---- 2 files changed, 345 insertions(+), 61 deletions(-) create mode 100644 R/impute-missing-scores.R diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R new file mode 100644 index 000000000..d67a62525 --- /dev/null +++ b/R/impute-missing-scores.R @@ -0,0 +1,307 @@ +#' @title Impute missing scores +#' +#' @description +#' Fills in scores for forecast-target combinations that are +#' missing from the data, using a user-specified imputation +#' strategy. This is useful to ensure all models are evaluated +#' on the same set of targets, which avoids bias when +#' summarising scores. +#' +#' Missing combinations are identified by comparing each +#' element in `compare` against the full set of targets +#' present across all elements. The strategy function then +#' provides the imputed values for the missing metric columns. +#' +#' An `.imputed` column is added to the output indicating +#' which rows were imputed (`TRUE`) and which are original +#' (`FALSE`). +#' +#' @param scores An object of class `scores` (a data.table +#' with scores and an additional attribute `metrics` as +#' produced by [score()]). +#' @param strategy A function or factory-created function that +#' fills missing metric values. Built-in options are +#' [impute_worst_score()], [impute_mean_score()], +#' [impute_na_score()], and [impute_model_score()]. +#' The function must accept three arguments: +#' `(scores, missing_rows, metrics)` and return +#' `missing_rows` with metric columns filled. +#' @param compare Character vector of length one with the +#' column name that defines the unit of comparison. +#' Default is `"model"`. +#' +#' @return An object of class `scores` with an additional +#' `.imputed` column. Rows that were imputed have +#' `.imputed = TRUE`. +#' +#' @importFrom data.table copy set rbindlist setattr +#' @importFrom checkmate assert_class assert_function +#' assert_character +#' @importFrom cli cli_abort +#' @export +#' @keywords scoring +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' # Impute with NA values +#' impute_missing_scores(scores, strategy = impute_na_score()) +impute_missing_scores <- function( + scores, + strategy, + compare = "model" +) { + assert_class(scores, "scores") + metrics <- get_metrics.scores(scores, error = TRUE) + assert_character(compare, len = 1) + assert_function(strategy) + + scores <- copy(scores) + + missing_rows <- build_missing_grid(scores, compare) + + if (nrow(missing_rows) == 0) { + data.table::set(scores, j = ".imputed", value = FALSE) + return(scores[]) + } + + filled <- strategy(scores, missing_rows, metrics) + + data.table::set(filled, j = ".imputed", value = TRUE) + data.table::set(scores, j = ".imputed", value = FALSE) + + out <- rbindlist(list(scores, filled), use.names = TRUE, + fill = TRUE) + + out <- new_scores(out, metrics) + return(out[]) +} + + +#' @title Impute with worst (maximum) observed score +#' +#' @description +#' Creates an imputation strategy that fills each missing +#' metric with the worst (maximum) observed value for that +#' metric within the same target combination across all +#' elements of `compare`. +#' +#' @return A function suitable for use as the `strategy` +#' argument in [impute_missing_scores()]. +#' @export +#' @keywords scoring +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores(scores, strategy = impute_worst_score()) +impute_worst_score <- function() { + function(scores, missing_rows, metrics) { + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + for (m in metrics) { + if (!(m %in% names(scores))) next + # Compute max per target combination + agg <- scores[, + .(..val = max(get(m), na.rm = TRUE)), + by = target_cols + ] + # Merge onto missing_rows + missing_rows <- merge( + missing_rows, agg, + by = target_cols, all.x = TRUE + ) + data.table::set( + missing_rows, j = m, + value = missing_rows[["..val"]] + ) + data.table::set( + missing_rows, j = "..val", value = NULL + ) + } + return(missing_rows) + } +} + + +#' @title Impute with mean observed score +#' +#' @description +#' Creates an imputation strategy that fills each missing +#' metric with the mean observed value for that metric within +#' the same target combination across all elements of +#' `compare`. +#' +#' @return A function suitable for use as the `strategy` +#' argument in [impute_missing_scores()]. +#' @export +#' @keywords scoring +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores(scores, strategy = impute_mean_score()) +impute_mean_score <- function() { + function(scores, missing_rows, metrics) { + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + for (m in metrics) { + if (!(m %in% names(scores))) next + agg <- scores[, + .(..val = mean(get(m), na.rm = TRUE)), + by = target_cols + ] + missing_rows <- merge( + missing_rows, agg, + by = target_cols, all.x = TRUE + ) + data.table::set( + missing_rows, j = m, + value = missing_rows[["..val"]] + ) + data.table::set( + missing_rows, j = "..val", value = NULL + ) + } + return(missing_rows) + } +} + + +#' @title Impute with NA values +#' +#' @description +#' Creates an imputation strategy that fills each missing +#' metric with `NA_real_`. +#' +#' @return A function suitable for use as the `strategy` +#' argument in [impute_missing_scores()]. +#' @export +#' @keywords scoring +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores(scores, strategy = impute_na_score()) +impute_na_score <- function() { + function(scores, missing_rows, metrics) { + for (m in metrics) { + data.table::set(missing_rows, j = m, value = NA_real_) + } + return(missing_rows) + } +} + + +#' @title Impute with a reference model's scores +#' +#' @description +#' Creates an imputation strategy that fills missing scores +#' with the actual scores from a specified reference model +#' for each target combination. +#' +#' @param model Character string naming the reference model +#' whose scores should be used for imputation. The reference +#' model must have scores for all target combinations that +#' need imputing. +#' +#' @return A function suitable for use as the `strategy` +#' argument in [impute_missing_scores()]. +#' +#' @importFrom cli cli_abort +#' @export +#' @keywords scoring +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores( +#' scores, +#' strategy = impute_model_score("EuroCOVIDhub-baseline") +#' ) +impute_model_score <- function(model) { + assert_character(model, len = 1) + # Store in a different name to avoid collision with + # the "model" column in data.table expressions + ref_model_name <- model + function(scores, missing_rows, metrics) { + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + ref <- scores[ + get("model") == ref_model_name + ] + + if (nrow(ref) == 0) { + cli_abort( + c( + "!" = "Reference model {.val {ref_model_name}} + not found in scores." + ) + ) + } + + # Check that the reference model has scores for all + # needed target combinations + needed <- unique( + missing_rows[, target_cols, with = FALSE] + ) + available <- unique( + ref[, target_cols, with = FALSE] + ) + missing_targets <- needed[!available, + on = target_cols + ] + if (nrow(missing_targets) > 0) { + cli_abort( + c( + "!" = "Reference model {.val {ref_model_name}} + is missing scores for + {nrow(missing_targets)} target + combination{?s} that need imputing." + ) + ) + } + + # Merge reference model scores onto missing rows + ref_scores <- ref[, + c( + target_cols, + metrics[metrics %in% names(ref)] + ), + with = FALSE + ] + missing_rows <- merge( + missing_rows[, + setdiff(names(missing_rows), metrics), + with = FALSE + ], + ref_scores, + by = target_cols, + all.x = TRUE + ) + return(missing_rows) + } +} diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 5d3f93c0f..dc50d4b35 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -4,7 +4,16 @@ test_that( "impute_missing_scores adds .imputed = FALSE when nothing missing", { - scores <- scores_quantile + # Use only models with complete targets + scores <- scores_quantile[ + model %in% c( + "EuroCOVIDhub-ensemble", + "EuroCOVIDhub-baseline" + ) + ] + scores <- new_scores( + scores, get_metrics.scores(scores_quantile) + ) result <- impute_missing_scores( scores, strategy = impute_na_score() ) @@ -38,6 +47,12 @@ test_that(".imputed is not in get_metrics.scores output", { }) test_that(".imputed is not in get_forecast_unit output", { + # Requires .imputed in get_protected_columns() + # (added by parallel agent) + skip_if_not( + ".imputed" %in% get_protected_columns(), + ".imputed not yet in get_protected_columns" + ) scores <- scores_quantile result <- impute_missing_scores( scores, strategy = impute_na_score() @@ -50,44 +65,10 @@ test_that(".imputed is not in get_forecast_unit output", { # Strategy factories with missing data # ============================================================================== -# Helper to create scores with missing entries -make_scores_with_missing <- function() { - scores <- data.table::copy(scores_quantile) - metrics <- get_metrics.scores(scores) - fu <- get_forecast_unit(scores) - - # Remove some rows for one model to create missingness - models <- unique(scores$model) - target_model <- models[1] - - # Remove the first few unique target combos for that model - target_cols <- setdiff(fu, "model") - targets <- unique( - scores[, target_cols, with = FALSE] - ) - remove_targets <- targets[1:3] - - # Anti-join to remove those rows - scores_reduced <- scores[ - !remove_targets, on = target_cols - ] - # Also remove those targets for the target model only - # to ensure only that model is missing - # Actually, let's be more precise: remove rows for - # target_model matching those targets - keep <- scores[ - !(model == target_model & - scores[remove_targets, on = target_cols, which = TRUE, - nomatch = NULL] |> - (\(x) seq_len(nrow(scores)) %in% x)()), - ] - - # Simpler approach: just remove specific rows - idx <- scores[model == target_model, which = TRUE] - remove_idx <- idx[1:min(5, length(idx))] - result <- scores[-remove_idx] - return(result) -} +# scores_quantile already has missing combinations: +# UMass-MechBayes has 128 of 256 targets, +# epiforecasts-EpiNow2 has 247 of 256. +# Use it directly for tests needing missing data. test_that("impute_worst_score fills with max observed score", { skip_if_not( @@ -95,7 +76,7 @@ test_that("impute_worst_score fills with max observed score", { where = asNamespace("scoringutils")), "build_missing_grid not yet available" ) - scores <- make_scores_with_missing() + scores <- scores_quantile metrics <- get_metrics.scores(scores) result <- impute_missing_scores( scores, strategy = impute_worst_score() @@ -126,7 +107,7 @@ test_that("impute_mean_score fills with mean observed score", { where = asNamespace("scoringutils")), "build_missing_grid not yet available" ) - scores <- make_scores_with_missing() + scores <- scores_quantile result <- impute_missing_scores( scores, strategy = impute_mean_score() ) @@ -140,7 +121,7 @@ test_that("impute_na_score fills with NA_real_", { where = asNamespace("scoringutils")), "build_missing_grid not yet available" ) - scores <- make_scores_with_missing() + scores <- scores_quantile metrics <- get_metrics.scores(scores) result <- impute_missing_scores( scores, strategy = impute_na_score() @@ -166,14 +147,14 @@ test_that( where = asNamespace("scoringutils")), "build_missing_grid not yet available" ) - scores <- make_scores_with_missing() - models <- unique(scores$model) - # Use a model that is NOT the one with missing data - # (first model had rows removed) - ref_model <- models[2] + scores <- scores_quantile + # EuroCOVIDhub-baseline has all 256 targets so can + # serve as reference for all missing combinations result <- impute_missing_scores( scores, - strategy = impute_model_score(ref_model) + strategy = impute_model_score( + "EuroCOVIDhub-baseline" + ) ) expect_s3_class(result, "scores") } @@ -187,17 +168,14 @@ test_that( where = asNamespace("scoringutils")), "build_missing_grid not yet available" ) - scores <- make_scores_with_missing() - models <- unique(scores$model) - # The first model has missing entries, so using it as - # reference should error if it's missing those targets - target_model <- models[1] - # This should error because target_model is the one - # missing forecasts + scores <- scores_quantile + # UMass-MechBayes only has 128 of 256 targets, so + # it cannot serve as reference for imputing all missing + # target combinations expect_error( impute_missing_scores( scores, - strategy = impute_model_score(target_model) + strategy = impute_model_score("UMass-MechBayes") ) ) } @@ -215,7 +193,7 @@ test_that("custom strategy function works", { } return(missing_rows) } - scores <- make_scores_with_missing() + scores <- scores_quantile metrics <- get_metrics.scores(scores) result <- impute_missing_scores( scores, strategy = custom_strategy @@ -241,7 +219,7 @@ test_that( where = asNamespace("scoringutils")), "build_missing_grid not yet available" ) - scores <- make_scores_with_missing() + scores <- scores_quantile result <- scores |> impute_missing_scores(strategy = impute_na_score()) |> summarise_scores(by = "model") @@ -267,9 +245,8 @@ test_that( where = asNamespace("scoringutils")), "filter_to_intersection not yet available" ) - scores <- make_scores_with_missing() - models <- unique(scores$model) - ref_model <- models[2] + scores <- scores_quantile + ref_model <- "EuroCOVIDhub-baseline" result <- scores |> filter_missing_scores( From 3e1298de697e54c18153aa46b4828a6e88da8401 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 21:01:38 +0100 Subject: [PATCH 04/33] style: fix lint warnings in impute tests --- tests/testthat/test-impute-missing-scores.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index dc50d4b35..d78b5c46d 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -18,7 +18,7 @@ test_that( scores, strategy = impute_na_score() ) expect_true(".imputed" %in% names(result)) - expect_true(all(result$.imputed == FALSE)) + expect_false(any(result$.imputed)) } ) @@ -83,7 +83,7 @@ test_that("impute_worst_score fills with max observed score", { ) # Imputed rows should exist - imputed <- result[.imputed == TRUE] + imputed <- result[(.imputed)] if (nrow(imputed) > 0) { # Each imputed metric value should be <= max of that # metric across all original data @@ -111,8 +111,8 @@ test_that("impute_mean_score fills with mean observed score", { result <- impute_missing_scores( scores, strategy = impute_mean_score() ) - imputed <- result[.imputed == TRUE] - expect_true(nrow(imputed) >= 0) + imputed <- result[(.imputed)] + expect_gte(nrow(imputed), 0) }) test_that("impute_na_score fills with NA_real_", { @@ -126,7 +126,7 @@ test_that("impute_na_score fills with NA_real_", { result <- impute_missing_scores( scores, strategy = impute_na_score() ) - imputed <- result[.imputed == TRUE] + imputed <- result[(.imputed)] if (nrow(imputed) > 0) { for (m in metrics) { if (m %in% names(imputed)) { @@ -198,7 +198,7 @@ test_that("custom strategy function works", { result <- impute_missing_scores( scores, strategy = custom_strategy ) - imputed <- result[.imputed == TRUE] + imputed <- result[(.imputed)] if (nrow(imputed) > 0) { for (m in metrics) { if (m %in% names(imputed)) { From b15c5cfa62c100a0b0be4fd457faa61aa59aaa78 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 21:50:49 +0100 Subject: [PATCH 05/33] style: suppress lint warning for internal function reference --- R/impute-missing-scores.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index d67a62525..9b50a0886 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -62,7 +62,7 @@ impute_missing_scores <- function( scores <- copy(scores) - missing_rows <- build_missing_grid(scores, compare) + missing_rows <- build_missing_grid(scores, compare) # nolint: object_usage_linter if (nrow(missing_rows) == 0) { data.table::set(scores, j = ".imputed", value = FALSE) From 6253cab8f4aa0f755bde57f92b64706301cd05eb Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 21:58:34 +0100 Subject: [PATCH 06/33] Rename filter_missing_scores() to filter_scores() Generalise the function as a strategy-based score filter that delegates all logic to the strategy function. Remove the redundant build_missing_grid() call and early-return check. Add validation that the compare column exists in scores. Rename source and test files accordingly. --- NAMESPACE | 2 +- ...ilter-missing-scores.R => filter-scores.R} | 40 +++++++++---------- ...ter_missing_scores.Rd => filter_scores.Rd} | 21 +++++----- man/filter_to_intersection.Rd | 6 +-- ...-missing-scores.R => test-filter-scores.R} | 29 ++++++++++---- 5 files changed, 52 insertions(+), 46 deletions(-) rename R/{filter-missing-scores.R => filter-scores.R} (86%) rename man/{filter_missing_scores.Rd => filter_scores.Rd} (64%) rename tests/testthat/{test-filter-missing-scores.R => test-filter-scores.R} (88%) diff --git a/NAMESPACE b/NAMESPACE index 307da4447..15564489f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -72,7 +72,7 @@ export(dispersion_quantile) export(dispersion_sample) export(dss_sample) export(energy_score_multivariate) -export(filter_missing_scores) +export(filter_scores) export(filter_to_intersection) export(get_correlations) export(get_coverage) diff --git a/R/filter-missing-scores.R b/R/filter-scores.R similarity index 86% rename from R/filter-missing-scores.R rename to R/filter-scores.R index c1d018b3e..4e17075b7 100644 --- a/R/filter-missing-scores.R +++ b/R/filter-scores.R @@ -1,10 +1,11 @@ -#' @title Filter scores with missing model-target combinations +#' @title Filter scores #' #' @description -#' Filters a `scores` object to remove target combinations where -#' one or more models have missing scores. +#' Filters a `scores` object according to a given strategy. #' The filtering behaviour is controlled by the `strategy` #' argument, which defaults to [filter_to_intersection()]. +#' This is a general-purpose filtering function that delegates +#' all logic to the strategy. #' #' @param scores An object of class `scores` (a data.table with #' scores and an additional attribute `metrics` as produced @@ -13,54 +14,49 @@ #' [filter_to_intersection()]. Default is #' `filter_to_intersection()`. #' @param compare Character string (default `"model"`) naming the -#' column whose values are compared for missingness. +#' column whose values are compared for filtering. #' #' @return A filtered `scores` object with the same class and #' `metrics` attribute as the input. #' #' @importFrom cli cli_inform #' @importFrom checkmate assert_class assert_character -#' assert_function +#' assert_function assert_subset #' @export #' @keywords handle-metrics -filter_missing_scores <- function( +filter_scores <- function( scores, strategy = filter_to_intersection(), compare = "model" ) { assert_class(scores, "scores") assert_character(compare, len = 1) + assert_subset(compare, names(scores)) assert_function(strategy) original_class <- class(scores) original_metrics <- attr(scores, "metrics") + result <- strategy(scores, compare = compare) + + n_before <- nrow(scores) + n_after <- nrow(result) #nolint start: object_usage_linter - missing <- build_missing_grid(scores, compare = compare) + n_dropped <- n_before - n_after #nolint end - if (nrow(missing) == 0) { + if (n_dropped == 0) { #nolint start: keyword_quote_linter cli_inform(c( - "i" = "No missing score combinations found. Returning - scores unchanged." + "i" = "No rows filtered. Returning scores unchanged." )) #nolint end return(scores) } - result <- strategy(scores, compare = compare) - - n_before <- nrow(scores) - n_after <- nrow(result) - #nolint start: object_usage_linter - n_dropped <- n_before - n_after - #nolint end - #nolint start: keyword_quote_linter cli_inform(c( - "i" = "Filtered out {n_dropped} rows with missing - {compare} combinations.", + "i" = "Filtered out {n_dropped} rows.", "i" = "{n_after} of {n_before} rows remaining." )) #nolint end @@ -76,7 +72,7 @@ filter_missing_scores <- function( #' @title Filter to intersection of model-target combinations #' #' @description -#' Strategy factory for [filter_missing_scores()]. +#' Strategy factory for [filter_scores()]. #' Returns a function that keeps only target combinations #' covered by a minimum proportion of models. #' @@ -90,7 +86,7 @@ filter_missing_scores <- function( #' #' @return A function with signature `function(scores, compare)` #' suitable for use as a strategy in -#' [filter_missing_scores()]. +#' [filter_scores()]. #' #' @importFrom data.table as.data.table setkeyv #' @importFrom checkmate assert_number assert_character diff --git a/man/filter_missing_scores.Rd b/man/filter_scores.Rd similarity index 64% rename from man/filter_missing_scores.Rd rename to man/filter_scores.Rd index 554778536..38cd25dd3 100644 --- a/man/filter_missing_scores.Rd +++ b/man/filter_scores.Rd @@ -1,14 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filter-missing-scores.R -\name{filter_missing_scores} -\alias{filter_missing_scores} -\title{Filter scores with missing model-target combinations} +% Please edit documentation in R/filter-scores.R +\name{filter_scores} +\alias{filter_scores} +\title{Filter scores} \usage{ -filter_missing_scores( - scores, - strategy = filter_to_intersection(), - compare = "model" -) +filter_scores(scores, strategy = filter_to_intersection(), compare = "model") } \arguments{ \item{scores}{An object of class \code{scores} (a data.table with @@ -20,16 +16,17 @@ by \code{\link[=score]{score()}}).} \code{filter_to_intersection()}.} \item{compare}{Character string (default \code{"model"}) naming the -column whose values are compared for missingness.} +column whose values are compared for filtering.} } \value{ A filtered \code{scores} object with the same class and \code{metrics} attribute as the input. } \description{ -Filters a \code{scores} object to remove target combinations where -one or more models have missing scores. +Filters a \code{scores} object according to a given strategy. The filtering behaviour is controlled by the \code{strategy} argument, which defaults to \code{\link[=filter_to_intersection]{filter_to_intersection()}}. +This is a general-purpose filtering function that delegates +all logic to the strategy. } \keyword{handle-metrics} diff --git a/man/filter_to_intersection.Rd b/man/filter_to_intersection.Rd index f9db1928e..07a1ca464 100644 --- a/man/filter_to_intersection.Rd +++ b/man/filter_to_intersection.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/filter-missing-scores.R +% Please edit documentation in R/filter-scores.R \name{filter_to_intersection} \alias{filter_to_intersection} \title{Filter to intersection of model-target combinations} @@ -19,10 +19,10 @@ the intersection of their targets is used.} \value{ A function with signature \verb{function(scores, compare)} suitable for use as a strategy in -\code{\link[=filter_missing_scores]{filter_missing_scores()}}. +\code{\link[=filter_scores]{filter_scores()}}. } \description{ -Strategy factory for \code{\link[=filter_missing_scores]{filter_missing_scores()}}. +Strategy factory for \code{\link[=filter_scores]{filter_scores()}}. Returns a function that keeps only target combinations covered by a minimum proportion of models. } diff --git a/tests/testthat/test-filter-missing-scores.R b/tests/testthat/test-filter-scores.R similarity index 88% rename from tests/testthat/test-filter-missing-scores.R rename to tests/testthat/test-filter-scores.R index b10dad6a6..1192c7b59 100644 --- a/tests/testthat/test-filter-missing-scores.R +++ b/tests/testthat/test-filter-scores.R @@ -57,34 +57,34 @@ test_that("build_missing_grid() uses observed target combos", { # ============================================================================== -# filter_missing_scores() +# filter_scores() # ============================================================================== -test_that("filter_missing_scores() drops incomplete targets", { +test_that("filter_scores() drops incomplete targets", { scores <- data.table::data.table( model = c("A", "A", "B"), location = c("DE", "US", "DE"), wis = c(1, 2, 3) ) scores <- new_scores(scores, "wis") - result <- filter_missing_scores(scores) + result <- filter_scores(scores) # Only DE should remain (both models have it) expect_equal(nrow(result), 2) expect_true(all(result$location == "DE")) }) -test_that("filter_missing_scores() preserves class and metrics", { +test_that("filter_scores() preserves class and metrics", { scores <- data.table::data.table( model = c("A", "A", "B"), location = c("DE", "US", "DE"), wis = c(1, 2, 3) ) scores <- new_scores(scores, "wis") - result <- filter_missing_scores(scores) + result <- filter_scores(scores) expect_s3_class(result, "scores") expect_equal(attr(result, "metrics"), "wis") }) -test_that("filter_missing_scores() unchanged when nothing missing", { +test_that("filter_scores() unchanged when nothing missing", { scores <- data.table::data.table( model = c("A", "A", "B", "B"), location = c("DE", "US", "DE", "US"), @@ -92,12 +92,25 @@ test_that("filter_missing_scores() unchanged when nothing missing", { ) scores <- new_scores(scores, "wis") expect_message( - result <- filter_missing_scores(scores), - "No missing" + result <- filter_scores(scores), + "No rows filtered" ) expect_equal(nrow(result), 4) }) +test_that("filter_scores() errors on invalid compare column", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + expect_error( + filter_scores(scores, compare = "nonexistent"), + "nonexistent" + ) +}) + # ============================================================================== # filter_to_intersection() From 26fd2d888f5a0e17db0059aed978015377dd4648 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 21:58:35 +0100 Subject: [PATCH 07/33] fix: pass compare param through to strategy functions - Add compare to strategy function signature (4th argument) - Replace hardcoded "model" with compare in all strategies - Add assert_subset validation for compare column - Change @keywords from scoring to handle-metrics - Strengthen impute_worst_score test to verify per-target max --- R/impute-missing-scores.R | 35 +++++++------ man/impute_mean_score.Rd | 29 +++++++++++ man/impute_missing_scores.Rd | 58 +++++++++++++++++++++ man/impute_model_score.Rd | 37 +++++++++++++ man/impute_na_score.Rd | 27 ++++++++++ man/impute_worst_score.Rd | 29 +++++++++++ tests/testthat/test-impute-missing-scores.R | 37 +++++++++---- 7 files changed, 224 insertions(+), 28 deletions(-) create mode 100644 man/impute_mean_score.Rd create mode 100644 man/impute_missing_scores.Rd create mode 100644 man/impute_model_score.Rd create mode 100644 man/impute_na_score.Rd create mode 100644 man/impute_worst_score.Rd diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index 9b50a0886..23f71731e 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -23,8 +23,8 @@ #' fills missing metric values. Built-in options are #' [impute_worst_score()], [impute_mean_score()], #' [impute_na_score()], and [impute_model_score()]. -#' The function must accept three arguments: -#' `(scores, missing_rows, metrics)` and return +#' The function must accept four arguments: +#' `(scores, missing_rows, metrics, compare)` and return #' `missing_rows` with metric columns filled. #' @param compare Character vector of length one with the #' column name that defines the unit of comparison. @@ -36,10 +36,10 @@ #' #' @importFrom data.table copy set rbindlist setattr #' @importFrom checkmate assert_class assert_function -#' assert_character +#' assert_character assert_subset #' @importFrom cli cli_abort #' @export -#' @keywords scoring +#' @keywords handle-metrics #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -58,6 +58,7 @@ impute_missing_scores <- function( assert_class(scores, "scores") metrics <- get_metrics.scores(scores, error = TRUE) assert_character(compare, len = 1) + assert_subset(compare, names(scores)) assert_function(strategy) scores <- copy(scores) @@ -69,7 +70,7 @@ impute_missing_scores <- function( return(scores[]) } - filled <- strategy(scores, missing_rows, metrics) + filled <- strategy(scores, missing_rows, metrics, compare) data.table::set(filled, j = ".imputed", value = TRUE) data.table::set(scores, j = ".imputed", value = FALSE) @@ -93,7 +94,7 @@ impute_missing_scores <- function( #' @return A function suitable for use as the `strategy` #' argument in [impute_missing_scores()]. #' @export -#' @keywords scoring +#' @keywords handle-metrics #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -104,9 +105,9 @@ impute_missing_scores <- function( #' #' impute_missing_scores(scores, strategy = impute_worst_score()) impute_worst_score <- function() { - function(scores, missing_rows, metrics) { + function(scores, missing_rows, metrics, compare) { fu <- get_forecast_unit(scores) - target_cols <- setdiff(fu, "model") + target_cols <- setdiff(fu, compare) for (m in metrics) { if (!(m %in% names(scores))) next @@ -144,7 +145,7 @@ impute_worst_score <- function() { #' @return A function suitable for use as the `strategy` #' argument in [impute_missing_scores()]. #' @export -#' @keywords scoring +#' @keywords handle-metrics #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -155,9 +156,9 @@ impute_worst_score <- function() { #' #' impute_missing_scores(scores, strategy = impute_mean_score()) impute_mean_score <- function() { - function(scores, missing_rows, metrics) { + function(scores, missing_rows, metrics, compare) { fu <- get_forecast_unit(scores) - target_cols <- setdiff(fu, "model") + target_cols <- setdiff(fu, compare) for (m in metrics) { if (!(m %in% names(scores))) next @@ -191,7 +192,7 @@ impute_mean_score <- function() { #' @return A function suitable for use as the `strategy` #' argument in [impute_missing_scores()]. #' @export -#' @keywords scoring +#' @keywords handle-metrics #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -202,7 +203,7 @@ impute_mean_score <- function() { #' #' impute_missing_scores(scores, strategy = impute_na_score()) impute_na_score <- function() { - function(scores, missing_rows, metrics) { + function(scores, missing_rows, metrics, compare) { for (m in metrics) { data.table::set(missing_rows, j = m, value = NA_real_) } @@ -228,7 +229,7 @@ impute_na_score <- function() { #' #' @importFrom cli cli_abort #' @export -#' @keywords scoring +#' @keywords handle-metrics #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -246,12 +247,12 @@ impute_model_score <- function(model) { # Store in a different name to avoid collision with # the "model" column in data.table expressions ref_model_name <- model - function(scores, missing_rows, metrics) { + function(scores, missing_rows, metrics, compare) { fu <- get_forecast_unit(scores) - target_cols <- setdiff(fu, "model") + target_cols <- setdiff(fu, compare) ref <- scores[ - get("model") == ref_model_name + get(compare) == ref_model_name ] if (nrow(ref) == 0) { diff --git a/man/impute_mean_score.Rd b/man/impute_mean_score.Rd new file mode 100644 index 000000000..c00796cc0 --- /dev/null +++ b/man/impute_mean_score.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_mean_score} +\alias{impute_mean_score} +\title{Impute with mean observed score} +\usage{ +impute_mean_score() +} +\value{ +A function suitable for use as the \code{strategy} +argument in \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +} +\description{ +Creates an imputation strategy that fills each missing +metric with the mean observed value for that metric within +the same target combination across all elements of +\code{compare}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +impute_missing_scores(scores, strategy = impute_mean_score()) +} +\keyword{handle-metrics} diff --git a/man/impute_missing_scores.Rd b/man/impute_missing_scores.Rd new file mode 100644 index 000000000..7cf3473c2 --- /dev/null +++ b/man/impute_missing_scores.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_missing_scores} +\alias{impute_missing_scores} +\title{Impute missing scores} +\usage{ +impute_missing_scores(scores, strategy, compare = "model") +} +\arguments{ +\item{scores}{An object of class \code{scores} (a data.table +with scores and an additional attribute \code{metrics} as +produced by \code{\link[=score]{score()}}).} + +\item{strategy}{A function or factory-created function that +fills missing metric values. Built-in options are +\code{\link[=impute_worst_score]{impute_worst_score()}}, \code{\link[=impute_mean_score]{impute_mean_score()}}, +\code{\link[=impute_na_score]{impute_na_score()}}, and \code{\link[=impute_model_score]{impute_model_score()}}. +The function must accept four arguments: +\verb{(scores, missing_rows, metrics, compare)} and return +\code{missing_rows} with metric columns filled.} + +\item{compare}{Character vector of length one with the +column name that defines the unit of comparison. +Default is \code{"model"}.} +} +\value{ +An object of class \code{scores} with an additional +\code{.imputed} column. Rows that were imputed have +\code{.imputed = TRUE}. +} +\description{ +Fills in scores for forecast-target combinations that are +missing from the data, using a user-specified imputation +strategy. This is useful to ensure all models are evaluated +on the same set of targets, which avoids bias when +summarising scores. + +Missing combinations are identified by comparing each +element in \code{compare} against the full set of targets +present across all elements. The strategy function then +provides the imputed values for the missing metric columns. + +An \code{.imputed} column is added to the output indicating +which rows were imputed (\code{TRUE}) and which are original +(\code{FALSE}). +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +# Impute with NA values +impute_missing_scores(scores, strategy = impute_na_score()) +} +\keyword{handle-metrics} diff --git a/man/impute_model_score.Rd b/man/impute_model_score.Rd new file mode 100644 index 000000000..b75f86957 --- /dev/null +++ b/man/impute_model_score.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_model_score} +\alias{impute_model_score} +\title{Impute with a reference model's scores} +\usage{ +impute_model_score(model) +} +\arguments{ +\item{model}{Character string naming the reference model +whose scores should be used for imputation. The reference +model must have scores for all target combinations that +need imputing.} +} +\value{ +A function suitable for use as the \code{strategy} +argument in \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +} +\description{ +Creates an imputation strategy that fills missing scores +with the actual scores from a specified reference model +for each target combination. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +impute_missing_scores( + scores, + strategy = impute_model_score("EuroCOVIDhub-baseline") +) +} +\keyword{handle-metrics} diff --git a/man/impute_na_score.Rd b/man/impute_na_score.Rd new file mode 100644 index 000000000..399af054c --- /dev/null +++ b/man/impute_na_score.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_na_score} +\alias{impute_na_score} +\title{Impute with NA values} +\usage{ +impute_na_score() +} +\value{ +A function suitable for use as the \code{strategy} +argument in \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +} +\description{ +Creates an imputation strategy that fills each missing +metric with \code{NA_real_}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +impute_missing_scores(scores, strategy = impute_na_score()) +} +\keyword{handle-metrics} diff --git a/man/impute_worst_score.Rd b/man/impute_worst_score.Rd new file mode 100644 index 000000000..48c5e6806 --- /dev/null +++ b/man/impute_worst_score.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_worst_score} +\alias{impute_worst_score} +\title{Impute with worst (maximum) observed score} +\usage{ +impute_worst_score() +} +\value{ +A function suitable for use as the \code{strategy} +argument in \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +} +\description{ +Creates an imputation strategy that fills each missing +metric with the worst (maximum) observed value for that +metric within the same target combination across all +elements of \code{compare}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +impute_missing_scores(scores, strategy = impute_worst_score()) +} +\keyword{handle-metrics} diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index d78b5c46d..0dd6906b7 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -85,18 +85,32 @@ test_that("impute_worst_score fills with max observed score", { imputed <- result[(.imputed)] if (nrow(imputed) > 0) { - # Each imputed metric value should be <= max of that - # metric across all original data + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + # Each imputed metric value should equal the max of + # that metric within the same target combination for (m in metrics) { - if (m %in% names(imputed) && m %in% names(scores)) { - max_val <- max(scores[[m]], na.rm = TRUE) - expect_true( - all( - imputed[[m]] <= max_val | is.na(imputed[[m]]) - ), - info = paste("metric", m, "exceeds max") + if (!(m %in% names(imputed)) || + !(m %in% names(scores))) next + max_per_target <- scores[, + .(..max = max(get(m), na.rm = TRUE)), + by = target_cols + ] + merged <- merge( + imputed, max_per_target, + by = target_cols, all.x = TRUE + ) + expect_true( + all( + merged[[m]] == merged[["..max"]] | + is.na(merged[[m]]) + ), + info = paste( + "metric", m, + "does not match per-target max" ) - } + ) } } }) @@ -187,7 +201,8 @@ test_that("custom strategy function works", { where = asNamespace("scoringutils")), "build_missing_grid not yet available" ) - custom_strategy <- function(scores, missing_rows, metrics) { + custom_strategy <- function(scores, missing_rows, metrics, + compare) { for (m in metrics) { data.table::set(missing_rows, j = m, value = 999) } From 1d505115eed9e569dd99992ac346f74efda00035 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 21:59:32 +0100 Subject: [PATCH 08/33] fix: update integration test to use renamed filter_scores --- tests/testthat/test-impute-missing-scores.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 0dd6906b7..5f2cc9f35 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -251,9 +251,9 @@ test_that( "build_missing_grid not yet available" ) skip_if_not( - exists("filter_missing_scores", + exists("filter_scores", where = asNamespace("scoringutils")), - "filter_missing_scores not yet available" + "filter_scores not yet available" ) skip_if_not( exists("filter_to_intersection", @@ -264,7 +264,7 @@ test_that( ref_model <- "EuroCOVIDhub-baseline" result <- scores |> - filter_missing_scores( + filter_scores( strategy = filter_to_intersection( models = ref_model ) From 06b65e7b23e860fc699902f7d50317f7d64d1f86 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 22:16:58 +0100 Subject: [PATCH 09/33] style: fix indentation in impute test --- tests/testthat/test-impute-missing-scores.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 5f2cc9f35..09078fe93 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -92,7 +92,7 @@ test_that("impute_worst_score fills with max observed score", { # that metric within the same target combination for (m in metrics) { if (!(m %in% names(imputed)) || - !(m %in% names(scores))) next + !(m %in% names(scores))) next max_per_target <- scores[, .(..max = max(get(m), na.rm = TRUE)), by = target_cols From dbcddb2b4b83f026f11f9b0c3458200da5b2ff61 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 22:25:21 +0100 Subject: [PATCH 10/33] =?UTF-8?q?refactor:=20rename=20models=E2=86=92inclu?= =?UTF-8?q?de,=20fix=20review=20issues?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename filter_to_intersection(models=) to include= for genericity - Add validation for unknown values in include argument - Fix cli_abort messages with embedded whitespace - Add non-default compare tests for both filter and impute - Pass compare through to all impute strategies (was already done, now tested) --- R/filter-scores.R | 48 +++++++++++++-------- R/impute-missing-scores.R | 24 +++++------ man/filter_to_intersection.Rd | 17 ++++---- tests/testthat/test-filter-scores.R | 40 ++++++++++++++--- tests/testthat/test-impute-missing-scores.R | 24 ++++++++++- 5 files changed, 108 insertions(+), 45 deletions(-) diff --git a/R/filter-scores.R b/R/filter-scores.R index 4e17075b7..063869214 100644 --- a/R/filter-scores.R +++ b/R/filter-scores.R @@ -74,15 +74,16 @@ filter_scores <- function( #' @description #' Strategy factory for [filter_scores()]. #' Returns a function that keeps only target combinations -#' covered by a minimum proportion of models. +#' covered by a minimum proportion of comparators. #' #' @param min_coverage Numeric between 0 and 1 (default `1`). -#' Minimum proportion of models that must cover a target -#' combination for it to be kept. -#' @param models Character vector or `NULL` (default). If -#' provided, the target grid is restricted to targets covered -#' by these models. When multiple models are specified, only -#' the intersection of their targets is used. +#' Minimum proportion of comparators that must cover a +#' target combination for it to be kept. +#' @param include Character vector or `NULL` (default). If +#' provided, the target grid is restricted to targets +#' covered by these values of the `compare` column. When +#' multiple values are given, only the intersection of +#' their targets is used. #' #' @return A function with signature `function(scores, compare)` #' suitable for use as a strategy in @@ -94,11 +95,11 @@ filter_scores <- function( #' @keywords handle-metrics filter_to_intersection <- function( min_coverage = 1, - models = NULL + include = NULL ) { assert_number(min_coverage, lower = 0, upper = 1) - if (!is.null(models)) { - assert_character(models, min.len = 1) + if (!is.null(include)) { + assert_character(include, min.len = 1) } function(scores, compare = "model") { @@ -106,9 +107,18 @@ filter_to_intersection <- function( forecast_unit <- get_forecast_unit(scores) target_cols <- setdiff(forecast_unit, compare) - if (!is.null(models)) { - # Restrict to targets covered by specified models - model_targets <- lapply(models, function(m) { + if (!is.null(include)) { + unknown <- setdiff(include, unique(scores[[compare]])) + if (length(unknown) > 0) { + cli::cli_abort(c( + "!" = paste0( + "{.val {unknown}} not found in ", + "{.arg {compare}} column." + ) + )) + } + # Restrict to targets covered by specified values + model_targets <- lapply(include, function(m) { unique( scores[ scores[[compare]] == m, @@ -117,7 +127,7 @@ filter_to_intersection <- function( ] ) }) - # Intersection of all specified models' targets + # Intersection of all specified values' targets qualifying <- model_targets[[1]] if (length(model_targets) > 1) { for (i in seq(2, length(model_targets))) { @@ -132,17 +142,17 @@ filter_to_intersection <- function( } } } else { - # Count models per target combination - all_models <- unique(scores[[compare]]) - n_total <- length(all_models) + # Count include per target combination + all_include <- unique(scores[[compare]]) + n_total <- length(all_include) target_coverage <- scores[ - , .(n_models = data.table::uniqueN(get(compare))), + , .(n_include = data.table::uniqueN(get(compare))), by = target_cols ] #nolint start: object_usage_linter qualifying <- target_coverage[ - n_models / n_total >= min_coverage, + n_include / n_total >= min_coverage, #nolint end target_cols, with = FALSE diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index 23f71731e..91323293c 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -256,12 +256,12 @@ impute_model_score <- function(model) { ] if (nrow(ref) == 0) { - cli_abort( - c( - "!" = "Reference model {.val {ref_model_name}} - not found in scores." + cli_abort(c( + "!" = paste0( + "Reference model {.val {ref_model_name}} ", + "not found in scores." ) - ) + )) } # Check that the reference model has scores for all @@ -276,14 +276,14 @@ impute_model_score <- function(model) { on = target_cols ] if (nrow(missing_targets) > 0) { - cli_abort( - c( - "!" = "Reference model {.val {ref_model_name}} - is missing scores for - {nrow(missing_targets)} target - combination{?s} that need imputing." + cli_abort(c( + "!" = paste0( + "Reference model {.val {ref_model_name}} ", + "is missing scores for ", + "{nrow(missing_targets)} target ", + "combination{?s} that need imputing." ) - ) + )) } # Merge reference model scores onto missing rows diff --git a/man/filter_to_intersection.Rd b/man/filter_to_intersection.Rd index 07a1ca464..a1a04d336 100644 --- a/man/filter_to_intersection.Rd +++ b/man/filter_to_intersection.Rd @@ -4,17 +4,18 @@ \alias{filter_to_intersection} \title{Filter to intersection of model-target combinations} \usage{ -filter_to_intersection(min_coverage = 1, models = NULL) +filter_to_intersection(min_coverage = 1, include = NULL) } \arguments{ \item{min_coverage}{Numeric between 0 and 1 (default \code{1}). -Minimum proportion of models that must cover a target -combination for it to be kept.} +Minimum proportion of comparators that must cover a +target combination for it to be kept.} -\item{models}{Character vector or \code{NULL} (default). If -provided, the target grid is restricted to targets covered -by these models. When multiple models are specified, only -the intersection of their targets is used.} +\item{include}{Character vector or \code{NULL} (default). If +provided, the target grid is restricted to targets +covered by these values of the \code{compare} column. When +multiple values are given, only the intersection of +their targets is used.} } \value{ A function with signature \verb{function(scores, compare)} @@ -24,6 +25,6 @@ suitable for use as a strategy in \description{ Strategy factory for \code{\link[=filter_scores]{filter_scores()}}. Returns a function that keeps only target combinations -covered by a minimum proportion of models. +covered by a minimum proportion of comparators. } \keyword{handle-metrics} diff --git a/tests/testthat/test-filter-scores.R b/tests/testthat/test-filter-scores.R index 1192c7b59..065e54afb 100644 --- a/tests/testthat/test-filter-scores.R +++ b/tests/testthat/test-filter-scores.R @@ -67,7 +67,7 @@ test_that("filter_scores() drops incomplete targets", { ) scores <- new_scores(scores, "wis") result <- filter_scores(scores) - # Only DE should remain (both models have it) + # Only DE should remain (both include have it) expect_equal(nrow(result), 2) expect_true(all(result$location == "DE")) }) @@ -134,14 +134,14 @@ test_that("filter_to_intersection(min_coverage=0.5) works", { expect_equal(nrow(result2), 5) }) -test_that("filter_to_intersection(models) keeps targets", { +test_that("filter_to_intersection(include) keeps targets", { scores <- data.table::data.table( model = c("m1", "m1", "m2", "m2", "m3"), location = c("DE", "US", "DE", "FR", "DE"), wis = c(1, 2, 3, 4, 5) ) scores <- new_scores(scores, "wis") - strategy <- filter_to_intersection(models = "m1") + strategy <- filter_to_intersection(include = "m1") result <- strategy(scores, compare = "model") # m1 covers DE and US, so keep all rows with DE or US expect_true(all(result$location %in% c("DE", "US"))) @@ -149,7 +149,7 @@ test_that("filter_to_intersection(models) keeps targets", { expect_false("FR" %in% result$location) }) -test_that("filter_to_intersection(models=c()) intersects", { +test_that("filter_to_intersection(include=c()) intersects", { scores <- data.table::data.table( model = c("m1", "m1", "m2", "m2", "m3"), location = c("DE", "US", "DE", "FR", "DE"), @@ -157,9 +157,39 @@ test_that("filter_to_intersection(models=c()) intersects", { ) scores <- new_scores(scores, "wis") strategy <- filter_to_intersection( - models = c("m1", "m2") + include = c("m1", "m2") ) result <- strategy(scores, compare = "model") # m1 covers DE, US; m2 covers DE, FR; intersection = DE expect_true(all(result$location == "DE")) }) + +test_that("filter_to_intersection(include) errors on unknown", { + scores <- data.table::data.table( + model = c("A", "B"), + location = c("DE", "DE"), + wis = c(1, 2) + ) + scores <- new_scores(scores, "wis") + expect_error( + filter_scores( + scores, + strategy = filter_to_intersection(include = "Z") + ), + "not found" + ) +}) + +test_that("filter_scores() works with non-default compare", { + scores <- data.table::data.table( + forecaster = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + result <- filter_scores( + scores, compare = "forecaster" + ) + expect_true(all(result$location == "DE")) + expect_equal(nrow(result), 2) +}) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 09078fe93..4ebf91406 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -266,7 +266,7 @@ test_that( result <- scores |> filter_scores( strategy = filter_to_intersection( - models = ref_model + include = ref_model ) ) |> impute_missing_scores( @@ -275,3 +275,25 @@ test_that( expect_s3_class(result, "scores") } ) + +test_that( + "impute_missing_scores works with non-default compare", + { + scores <- data.table::data.table( + forecaster = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + result <- impute_missing_scores( + scores, + strategy = impute_worst_score(), + compare = "forecaster" + ) + expect_true(".imputed" %in% names(result)) + imputed <- result[result$.imputed == TRUE] + expect_equal(nrow(imputed), 1) + expect_equal(imputed$forecaster, "B") + expect_equal(imputed$location, "US") + } +) From 6ca383f9802450b5f6f7563a5abc578047b1492e Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 22:31:01 +0100 Subject: [PATCH 11/33] style: fix redundant_equals_linter in test --- tests/testthat/test-impute-missing-scores.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 4ebf91406..1b17193d4 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -291,7 +291,7 @@ test_that( compare = "forecaster" ) expect_true(".imputed" %in% names(result)) - imputed <- result[result$.imputed == TRUE] + imputed <- result[result$.imputed] expect_equal(nrow(imputed), 1) expect_equal(imputed$forecaster, "B") expect_equal(imputed$location, "US") From 797b926026deef70f9ddc76915763934c6323edd Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 22:38:11 +0100 Subject: [PATCH 12/33] docs: add vignette for handling missing forecasts --- R/filter-scores.R | 1 + R/impute-missing-scores.R | 1 + man/filter_scores.Rd | 3 + man/impute_missing_scores.Rd | 3 + vignettes/handling-missing-forecasts.Rmd | 111 +++++++++++++++++++++++ 5 files changed, 119 insertions(+) create mode 100644 vignettes/handling-missing-forecasts.Rmd diff --git a/R/filter-scores.R b/R/filter-scores.R index 063869214..71c691457 100644 --- a/R/filter-scores.R +++ b/R/filter-scores.R @@ -19,6 +19,7 @@ #' @return A filtered `scores` object with the same class and #' `metrics` attribute as the input. #' +#' @seealso \code{vignette("handling-missing-forecasts")} #' @importFrom cli cli_inform #' @importFrom checkmate assert_class assert_character #' assert_function assert_subset diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index 91323293c..05fc9c457 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -34,6 +34,7 @@ #' `.imputed` column. Rows that were imputed have #' `.imputed = TRUE`. #' +#' @seealso \code{vignette("handling-missing-forecasts")} #' @importFrom data.table copy set rbindlist setattr #' @importFrom checkmate assert_class assert_function #' assert_character assert_subset diff --git a/man/filter_scores.Rd b/man/filter_scores.Rd index 38cd25dd3..5b129b121 100644 --- a/man/filter_scores.Rd +++ b/man/filter_scores.Rd @@ -29,4 +29,7 @@ argument, which defaults to \code{\link[=filter_to_intersection]{filter_to_inter This is a general-purpose filtering function that delegates all logic to the strategy. } +\seealso{ +\code{vignette("handling-missing-forecasts")} +} \keyword{handle-metrics} diff --git a/man/impute_missing_scores.Rd b/man/impute_missing_scores.Rd index 7cf3473c2..e81a7fbcd 100644 --- a/man/impute_missing_scores.Rd +++ b/man/impute_missing_scores.Rd @@ -55,4 +55,7 @@ scores <- example_quantile |> # Impute with NA values impute_missing_scores(scores, strategy = impute_na_score()) } +\seealso{ +\code{vignette("handling-missing-forecasts")} +} \keyword{handle-metrics} diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd new file mode 100644 index 000000000..3645965ec --- /dev/null +++ b/vignettes/handling-missing-forecasts.Rmd @@ -0,0 +1,111 @@ +--- +title: "Handling missing forecasts" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Handling missing forecasts} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, comment = "#>" +) +data.table::setDTthreads(2) +``` + +When models forecast different subsets of targets, score summaries become misleading. +A model that only forecasts easy targets will appear to perform better than one that attempts harder targets too. +To get fair comparisons, you need to either restrict to a common set of targets or impute the missing scores. + +## Diagnosing missingness + +`get_forecast_counts()` shows which models cover which targets. + +```{r counts} +library(scoringutils) +fc <- as_forecast_quantile(example_quantile) +get_forecast_counts(fc, by = c("model", "target_type")) +``` + +Here `UMass-MechBayes` has no case forecasts and `epiforecasts-EpiNow2` has fewer death forecasts than the other models. + +## Filtering to a common set of targets + +`filter_scores()` removes scores for target combinations not shared across models. +The default strategy, `filter_to_intersection()`, keeps only targets covered by all models. + +```{r score} +scores <- score(fc) +``` + +```{r filter} +scores_filtered <- filter_scores(scores) +``` + +You can relax the requirement with `min_coverage` to keep targets covered by at least a given proportion of models. + +```{r filter-relaxed} +scores_relaxed <- filter_scores( + scores, + strategy = filter_to_intersection( + min_coverage = 0.75 + ) +) +``` + +The `include` argument restricts to targets covered by specific models. + +```{r filter-include} +scores_baseline <- filter_scores( + scores, + strategy = filter_to_intersection( + include = "EuroCOVIDhub-baseline" + ) +) +``` + +## Imputing missing scores + +Instead of dropping data, `impute_missing_scores()` fills in scores for missing target combinations. +Four built-in strategies are available. + +- `impute_worst_score()` fills with the worst (maximum) observed score for each target. +- `impute_mean_score()` fills with the mean observed score. +- `impute_na_score()` fills with `NA`. +- `impute_model_score(model)` fills with the scores from a named reference model. + +```{r impute} +scores_imputed <- impute_missing_scores( + scores, + strategy = impute_worst_score() +) +``` + +Imputed rows are marked with `.imputed = TRUE`, so you can inspect what was added. + +```{r imputed-rows} +scores_imputed[scores_imputed$.imputed == TRUE, 1:5] +``` + +## Recommended workflow + +Filter first to remove targets with very low coverage, then impute the remainder. + +```{r pipeline} +result <- scores |> + filter_scores( + strategy = filter_to_intersection( + min_coverage = 0.5 + ) + ) |> + impute_missing_scores( + strategy = impute_worst_score() + ) +``` + +You can then summarise as usual with `summarise_scores()`. + +```{r summarise} +summarise_scores(result, by = "model") +``` From 048be7c4b5d1254960f740c99e6dc76da6812824 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 22:38:45 +0100 Subject: [PATCH 13/33] docs: add vignette and NEWS entry for missing scores handling --- NEWS.md | 1 + vignettes/handling-missing-forecasts.Rmd | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 1fb41245b..e87a51509 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # scoringutils (development version) +- Added `filter_scores()` and `impute_missing_scores()` for handling missing forecasts before summarisation. `filter_scores()` removes target combinations with insufficient model coverage, while `impute_missing_scores()` fills in missing scores using configurable strategies (worst, mean, NA, or reference model). Both use a strategy function pattern for extensibility. See `vignette("handling-missing-forecasts")` for details (#1122). - `get_pairwise_comparisons()` now works with only two models when a baseline is specified, instead of requiring at least three (#1022). - `score()` now warns when column names in the input data clash with metric names, as these columns are overwritten during scoring (#382). - The print method for multivariate forecasts now displays the `joint_across` columns, making it easier to see which variables are forecast jointly (#1043). diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd index 3645965ec..1229f5768 100644 --- a/vignettes/handling-missing-forecasts.Rmd +++ b/vignettes/handling-missing-forecasts.Rmd @@ -85,7 +85,7 @@ scores_imputed <- impute_missing_scores( Imputed rows are marked with `.imputed = TRUE`, so you can inspect what was added. ```{r imputed-rows} -scores_imputed[scores_imputed$.imputed == TRUE, 1:5] +scores_imputed[scores_imputed$.imputed, 1:5] ``` ## Recommended workflow From 3d266ab934fb490903bb2d635ac85a2fc7472b43 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Mon, 30 Mar 2026 22:41:07 +0100 Subject: [PATCH 14/33] docs: improve vignette clarity and fix review issues --- vignettes/handling-missing-forecasts.Rmd | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd index 1229f5768..a6313db82 100644 --- a/vignettes/handling-missing-forecasts.Rmd +++ b/vignettes/handling-missing-forecasts.Rmd @@ -15,8 +15,8 @@ data.table::setDTthreads(2) ``` When models forecast different subsets of targets, score summaries become misleading. -A model that only forecasts easy targets will appear to perform better than one that attempts harder targets too. -To get fair comparisons, you need to either restrict to a common set of targets or impute the missing scores. +Filter when a model legitimately does not cover certain targets. +Impute when a missing forecast should be treated as a performance failure. ## Diagnosing missingness @@ -34,6 +34,7 @@ Here `UMass-MechBayes` has no case forecasts and `epiforecasts-EpiNow2` has fewe `filter_scores()` removes scores for target combinations not shared across models. The default strategy, `filter_to_intersection()`, keeps only targets covered by all models. +Both functions accept a `compare` argument (default `"model"`) that specifies which column identifies the unit being compared. ```{r score} scores <- score(fc) @@ -54,7 +55,7 @@ scores_relaxed <- filter_scores( ) ``` -The `include` argument restricts to targets covered by specific models. +The `include` argument restricts to targets covered by specific values of the `compare` column. ```{r filter-include} scores_baseline <- filter_scores( @@ -68,12 +69,11 @@ scores_baseline <- filter_scores( ## Imputing missing scores Instead of dropping data, `impute_missing_scores()` fills in scores for missing target combinations. -Four built-in strategies are available. - `impute_worst_score()` fills with the worst (maximum) observed score for each target. - `impute_mean_score()` fills with the mean observed score. - `impute_na_score()` fills with `NA`. -- `impute_model_score(model)` fills with the scores from a named reference model. +- `impute_model_score("my-baseline")` fills with scores from a named reference model. ```{r impute} scores_imputed <- impute_missing_scores( @@ -82,15 +82,16 @@ scores_imputed <- impute_missing_scores( ) ``` -Imputed rows are marked with `.imputed = TRUE`, so you can inspect what was added. +Imputed rows are marked with `.imputed = TRUE`. ```{r imputed-rows} -scores_imputed[scores_imputed$.imputed, 1:5] +scores_imputed[(.imputed)] ``` ## Recommended workflow -Filter first to remove targets with very low coverage, then impute the remainder. +When some models have sparse coverage, filtering alone may drop too many targets. +Filtering to a minimum coverage threshold first, then imputing the remainder, balances data retention against fairness. ```{r pipeline} result <- scores |> From a84aea2c10a88c23d0eb08268ec98d71b2810e6f Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 14:54:01 +0100 Subject: [PATCH 15/33] docs: rewrite vignette and add CLI messages to impute_missing_scores Restructure the handling-missing-forecasts vignette to introduce concepts incrementally with summarise_scores() shown after each approach. Add cli_inform() messages to impute_missing_scores() to match the pattern used by filter_scores(). Co-authored-by: Sam Abbott --- R/impute-missing-scores.R | 22 +++- vignettes/handling-missing-forecasts.Rmd | 150 +++++++++++++++++------ 2 files changed, 135 insertions(+), 37 deletions(-) diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index 05fc9c457..d67cb6151 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -38,7 +38,7 @@ #' @importFrom data.table copy set rbindlist setattr #' @importFrom checkmate assert_class assert_function #' assert_character assert_subset -#' @importFrom cli cli_abort +#' @importFrom cli cli_abort cli_inform #' @export #' @keywords handle-metrics #' @examples @@ -67,10 +67,30 @@ impute_missing_scores <- function( missing_rows <- build_missing_grid(scores, compare) # nolint: object_usage_linter if (nrow(missing_rows) == 0) { + #nolint start: keyword_quote_linter + cli_inform(c( + "i" = "No missing scores to impute. Returning scores unchanged." + )) + #nolint end data.table::set(scores, j = ".imputed", value = FALSE) return(scores[]) } + #nolint start: object_usage_linter + n_missing <- nrow(missing_rows) + n_comparators <- length(unique(missing_rows[[compare]])) + #nolint end + #nolint start: keyword_quote_linter + compare_label <- paste0( + n_comparators, " ", compare, + if (n_comparators != 1) "s" else "" + ) + cli_inform(c( + "i" = "Imputing {n_missing} missing score row{?s} + across {compare_label}." + )) + #nolint end + filled <- strategy(scores, missing_rows, metrics, compare) data.table::set(filled, j = ".imputed", value = TRUE) diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd index a6313db82..606c31598 100644 --- a/vignettes/handling-missing-forecasts.Rmd +++ b/vignettes/handling-missing-forecasts.Rmd @@ -14,13 +14,16 @@ knitr::opts_chunk$set( data.table::setDTthreads(2) ``` -When models forecast different subsets of targets, score summaries become misleading. -Filter when a model legitimately does not cover certain targets. -Impute when a missing forecast should be treated as a performance failure. +When comparing forecast models, not all models will have made predictions for every target. +Naively averaging scores across different sets of targets produces misleading summaries. +This vignette walks through diagnosing which targets each model covers, scoring the forecasts, and then adjusting the scores using filtering and imputation so that summaries are comparable. ## Diagnosing missingness -`get_forecast_counts()` shows which models cover which targets. +Before adjusting scores it is worth understanding the pattern of missingness. +Models may have different coverage for legitimate reasons (a model may only forecast deaths, not cases) or because of operational failures (a missed submission deadline). + +`get_forecast_counts()` tabulates how many forecasts each model has, grouped by the columns you choose. ```{r counts} library(scoringutils) @@ -28,85 +31,160 @@ fc <- as_forecast_quantile(example_quantile) get_forecast_counts(fc, by = c("model", "target_type")) ``` -Here `UMass-MechBayes` has no case forecasts and `epiforecasts-EpiNow2` has fewer death forecasts than the other models. +`UMass-MechBayes` does not forecast cases at all, and `epiforecasts-EpiNow2` has fewer death forecasts than the other models. -## Filtering to a common set of targets +To see exactly which death targets `epiforecasts-EpiNow2` is missing, we can request counts at a finer level and filter to zero-count rows. -`filter_scores()` removes scores for target combinations not shared across models. -The default strategy, `filter_to_intersection()`, keeps only targets covered by all models. -Both functions accept a `compare` argument (default `"model"`) that specifies which column identifies the unit being compared. +```{r missing-detail} +death_counts <- get_forecast_counts( + fc, + by = c("model", "target_type", "location", "target_end_date") +) +death_counts[ + model == "epiforecasts-EpiNow2" & + target_type == "Deaths" & + count == 0 +] +``` + +Note that `get_pairwise_comparisons()` has long handled missingness internally, restricting each pair of models to their shared set of targets before comparing. +The functions below give you explicit control over how to handle missingness when computing score summaries. + +## Scoring + +We score all forecasts first, then adjust the resulting scores table. ```{r score} scores <- score(fc) ``` -```{r filter} +A naive summary averages over different numbers of targets per model, making direct comparison unreliable. + +```{r naive-summary} +summarise_scores(scores, by = "model") +``` + +The sections below show two approaches to fixing this: filtering scores to a common set of targets, and imputing scores for missing targets. + +## Filtering to a common set of targets + +`filter_scores()` removes scores for target combinations not shared across models. +This is appropriate when a model legitimately does not cover certain targets and you want to compare only on shared ground. + +The default strategy, `filter_to_intersection()`, keeps only targets covered by **all** models. + +```{r filter-default} scores_filtered <- filter_scores(scores) +summarise_scores(scores_filtered, by = "model") ``` -You can relax the requirement with `min_coverage` to keep targets covered by at least a given proportion of models. +This drops all case targets (since `UMass-MechBayes` has no case forecasts) and the death targets `epiforecasts-EpiNow2` missed. + +### Relaxing with min_coverage + +You can relax the requirement with `min_coverage`, keeping targets covered by at least a given proportion of models. +With four models, `min_coverage = 0.75` requires coverage by at least three. ```{r filter-relaxed} scores_relaxed <- filter_scores( scores, - strategy = filter_to_intersection( - min_coverage = 0.75 - ) + strategy = filter_to_intersection(min_coverage = 0.75) ) +summarise_scores(scores_relaxed, by = "model") ``` -The `include` argument restricts to targets covered by specific values of the `compare` column. +This retains the case targets (covered by three of four models) while still dropping any target covered by fewer. + +### Filtering to a specific model's targets + +The `include` argument restricts to targets covered by named models. +For example, to evaluate all models only on the targets `epiforecasts-EpiNow2` covered: ```{r filter-include} -scores_baseline <- filter_scores( +scores_epinow2 <- filter_scores( scores, strategy = filter_to_intersection( - include = "EuroCOVIDhub-baseline" + include = "epiforecasts-EpiNow2" ) ) +summarise_scores(scores_epinow2, by = "model") ``` +This keeps both case and death targets where EpiNow2 submitted forecasts, but `UMass-MechBayes` will still be missing case scores in the result. + ## Imputing missing scores -Instead of dropping data, `impute_missing_scores()` fills in scores for missing target combinations. +Instead of dropping data, `impute_missing_scores()` fills in scores for target combinations a model did not cover. +Imputed rows are marked with `.imputed = TRUE` so they can be identified later. + +### Worst score -- `impute_worst_score()` fills with the worst (maximum) observed score for each target. -- `impute_mean_score()` fills with the mean observed score. -- `impute_na_score()` fills with `NA`. -- `impute_model_score("my-baseline")` fills with scores from a named reference model. +The simplest penalty: fill each missing score with the worst (maximum) observed score for that target. -```{r impute} -scores_imputed <- impute_missing_scores( +```{r impute-worst} +scores_worst <- impute_missing_scores( scores, strategy = impute_worst_score() ) +summarise_scores(scores_worst, by = "model") +``` + +You can inspect which rows were imputed: + +```{r imputed-check} +scores_worst[(.imputed)] ``` -Imputed rows are marked with `.imputed = TRUE`. +### Mean score + +A less severe penalty: fill with the mean score across models that did forecast each target. -```{r imputed-rows} -scores_imputed[(.imputed)] +```{r impute-mean} +scores_mean <- impute_missing_scores( + scores, + strategy = impute_mean_score() +) +summarise_scores(scores_mean, by = "model") ``` -## Recommended workflow +### NA + +Fill with `NA`, which propagates through summaries unless explicitly handled. -When some models have sparse coverage, filtering alone may drop too many targets. -Filtering to a minimum coverage threshold first, then imputing the remainder, balances data retention against fairness. +```{r impute-na} +scores_na <- impute_missing_scores( + scores, + strategy = impute_na_score() +) +summarise_scores(scores_na, by = "model") +``` + +### Reference model + +Fill with the scores of a named baseline model, treating a missing forecast as performing no better than the baseline. + +```{r impute-model} +scores_ref <- impute_missing_scores( + scores, + strategy = impute_model_score("EuroCOVIDhub-baseline") +) +summarise_scores(scores_ref, by = "model") +``` + +## Combining filter and impute + +When you want to focus on a specific model's targets but still need complete scores for all models, combine filtering with imputation. +For example, to evaluate on the targets `epiforecasts-EpiNow2` covered and impute scores for models that are missing forecasts within that set: ```{r pipeline} result <- scores |> filter_scores( strategy = filter_to_intersection( - min_coverage = 0.5 + include = "epiforecasts-EpiNow2" ) ) |> impute_missing_scores( strategy = impute_worst_score() ) -``` - -You can then summarise as usual with `summarise_scores()`. - -```{r summarise} summarise_scores(result, by = "model") ``` From 3fde364a70b0843c23d2f85795a76dd73c55de84 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 14:58:03 +0100 Subject: [PATCH 16/33] style: put cli_inform message on single line Co-authored-by: Sam Abbott --- R/impute-missing-scores.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index d67cb6151..30136a8c6 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -86,8 +86,7 @@ impute_missing_scores <- function( if (n_comparators != 1) "s" else "" ) cli_inform(c( - "i" = "Imputing {n_missing} missing score row{?s} - across {compare_label}." + "i" = "Imputing {n_missing} missing score row{?s} across {compare_label}." # nolint: line_length_linter )) #nolint end From 3ff432a49ad29945e4320fd8a3fb22258a254499 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 15:00:07 +0100 Subject: [PATCH 17/33] fix: use cli::qty() for correct pluralisation in impute message Co-authored-by: Sam Abbott --- R/impute-missing-scores.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index 30136a8c6..893abf646 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -81,12 +81,9 @@ impute_missing_scores <- function( n_comparators <- length(unique(missing_rows[[compare]])) #nolint end #nolint start: keyword_quote_linter - compare_label <- paste0( - n_comparators, " ", compare, - if (n_comparators != 1) "s" else "" - ) cli_inform(c( - "i" = "Imputing {n_missing} missing score row{?s} across {compare_label}." # nolint: line_length_linter + "i" = "Imputing {n_missing} missing score row{?s}.", + "i" = "{n_comparators} {compare} {cli::qty(n_comparators)}value{?s} affected." # nolint: line_length_linter )) #nolint end From 3490100ccd2e6059514498ee393f8838aaad311d Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 15:08:20 +0100 Subject: [PATCH 18/33] docs: credit Kim et al (2026) as inspiration for missing scores handling Co-authored-by: Sam Abbott --- vignettes/handling-missing-forecasts.Rmd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd index 606c31598..a7bb47e0a 100644 --- a/vignettes/handling-missing-forecasts.Rmd +++ b/vignettes/handling-missing-forecasts.Rmd @@ -18,6 +18,8 @@ When comparing forecast models, not all models will have made predictions for ev Naively averaging scores across different sets of targets produces misleading summaries. This vignette walks through diagnosing which targets each model covers, scoring the forecasts, and then adjusting the scores using filtering and imputation so that summaries are comparable. +The approaches here were initially inspired by [Kim, Ray & Reich (2026)](https://doi.org/10.1016/j.ijforecast.2025.12.006), who discuss the importance of handling missing forecasts when evaluating model contributions beyond simple leaderboard rankings. + ## Diagnosing missingness Before adjusting scores it is worth understanding the pattern of missingness. From e11bc621557736a558608dced67d13fc0f8b17e6 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 15:16:14 +0100 Subject: [PATCH 19/33] docs: soften absolute claims in vignette prose Co-authored-by: Sam Abbott --- vignettes/handling-missing-forecasts.Rmd | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd index a7bb47e0a..da0900326 100644 --- a/vignettes/handling-missing-forecasts.Rmd +++ b/vignettes/handling-missing-forecasts.Rmd @@ -15,11 +15,12 @@ data.table::setDTthreads(2) ``` When comparing forecast models, not all models will have made predictions for every target. -Naively averaging scores across different sets of targets produces misleading summaries. +Naively averaging scores across different sets of targets can produce misleading summaries. This vignette walks through diagnosing which targets each model covers, scoring the forecasts, and then adjusting the scores using filtering and imputation so that summaries are comparable. The approaches here were initially inspired by [Kim, Ray & Reich (2026)](https://doi.org/10.1016/j.ijforecast.2025.12.006), who discuss the importance of handling missing forecasts when evaluating model contributions beyond simple leaderboard rankings. + ## Diagnosing missingness Before adjusting scores it is worth understanding the pattern of missingness. @@ -60,18 +61,18 @@ We score all forecasts first, then adjust the resulting scores table. scores <- score(fc) ``` -A naive summary averages over different numbers of targets per model, making direct comparison unreliable. +A naive summary averages over different numbers of targets per model, which can make direct comparison misleading. ```{r naive-summary} summarise_scores(scores, by = "model") ``` -The sections below show two approaches to fixing this: filtering scores to a common set of targets, and imputing scores for missing targets. +The sections below show two approaches to addressing this: filtering scores to a common set of targets, and imputing scores for missing targets. ## Filtering to a common set of targets `filter_scores()` removes scores for target combinations not shared across models. -This is appropriate when a model legitimately does not cover certain targets and you want to compare only on shared ground. +This can be appropriate when a model legitimately does not cover certain targets and you want to compare only on shared ground. The default strategy, `filter_to_intersection()`, keeps only targets covered by **all** models. From eb1ebcdff2e78b7aab4476d9b2c4c22ee993701f Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 16:05:20 +0100 Subject: [PATCH 20/33] test: cover missing metric columns and nonexistent ref model Add tests for edge cases in imputation strategies: - impute_worst_score/impute_mean_score skip metrics not in columns - impute_model_score errors for nonexistent reference model Co-authored-by: Sam Abbott --- tests/testthat/test-impute-missing-scores.R | 61 +++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 1b17193d4..51003f133 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -276,6 +276,67 @@ test_that( } ) +test_that( + "impute_worst_score skips metrics not in columns", + { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + # Attribute claims "wis" and "fake" but "fake" is + # not a column + scores <- new_scores(scores, c("wis", "fake")) + result <- suppressWarnings( + impute_missing_scores( + scores, strategy = impute_worst_score() + ) + ) + imputed <- result[result$.imputed] + expect_equal(nrow(imputed), 1) + expect_false("fake" %in% names(imputed)) + } +) + +test_that( + "impute_mean_score skips metrics not in columns", + { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, c("wis", "fake")) + result <- suppressWarnings( + impute_missing_scores( + scores, strategy = impute_mean_score() + ) + ) + imputed <- result[result$.imputed] + expect_equal(nrow(imputed), 1) + expect_false("fake" %in% names(imputed)) + } +) + +test_that( + "impute_model_score errors for nonexistent model", + { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + expect_error( + impute_missing_scores( + scores, + strategy = impute_model_score("nonexistent") + ), + "not found" + ) + } +) + test_that( "impute_missing_scores works with non-default compare", { From 8856e8d030c93f6029854ee9ad04ac12e31b165d Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 16:20:22 +0100 Subject: [PATCH 21/33] docs: add articles section to pkgdown config Co-authored-by: Sam Abbott --- _pkgdown.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/_pkgdown.yml b/_pkgdown.yml index 34853a432..21b8c219a 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -26,6 +26,17 @@ navbar: text: "scoringutils paper" href: https://raw.githubusercontent.com/epiforecasts/scoringutils/main/inst/manuscript/manuscript.pdf +articles: + - title: Getting started + contents: + - scoring-rules + - scoring-multivariate-forecasts + - handling-missing-forecasts + - title: Deprecated + contents: + - Deprecated-functions + - Deprecated-visualisations + reference: - title: Package documentation contents: From bf60b740613c5e2299205466f1163e14d2b4b67d Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 16:51:46 +0100 Subject: [PATCH 22/33] docs: rename articles group from Getting started to Articles Co-authored-by: Sam Abbott --- _pkgdown.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 21b8c219a..34f63e56e 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -27,7 +27,7 @@ navbar: href: https://raw.githubusercontent.com/epiforecasts/scoringutils/main/inst/manuscript/manuscript.pdf articles: - - title: Getting started + - title: Articles contents: - scoring-rules - scoring-multivariate-forecasts From 12ef1a71bc8261d58901e32314e70d445feb6308 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 16:54:29 +0100 Subject: [PATCH 23/33] revert: remove articles section from pkgdown config Moved to a separate issue (#1159). Co-authored-by: Sam Abbott --- _pkgdown.yml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 34f63e56e..34853a432 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -26,17 +26,6 @@ navbar: text: "scoringutils paper" href: https://raw.githubusercontent.com/epiforecasts/scoringutils/main/inst/manuscript/manuscript.pdf -articles: - - title: Articles - contents: - - scoring-rules - - scoring-multivariate-forecasts - - handling-missing-forecasts - - title: Deprecated - contents: - - Deprecated-functions - - Deprecated-visualisations - reference: - title: Package documentation contents: From d9aa0fd038ff3b96bf26d96d8d5b166790c8a133 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 17:43:10 +0100 Subject: [PATCH 24/33] docs: address vignette review TODOs - Rename "Relaxing with min_coverage" to "Requiring partial coverage" with clearer explanation of why 0.75 keeps all targets here - Mention compare argument early in the Scoring section - Reorder imputation strategies by severity: NA (diagnostic), worst (heavy penalty), reference model (moderate), mean (least severe) - Show imputed row counts by model and target_type instead of raw rows - Add context to each strategy about when to use it - Fix combined workflow text Co-authored-by: Sam Abbott --- vignettes/handling-missing-forecasts.Rmd | 83 ++++++++++++++---------- 1 file changed, 48 insertions(+), 35 deletions(-) diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd index da0900326..c1d9733c0 100644 --- a/vignettes/handling-missing-forecasts.Rmd +++ b/vignettes/handling-missing-forecasts.Rmd @@ -20,10 +20,9 @@ This vignette walks through diagnosing which targets each model covers, scoring The approaches here were initially inspired by [Kim, Ray & Reich (2026)](https://doi.org/10.1016/j.ijforecast.2025.12.006), who discuss the importance of handling missing forecasts when evaluating model contributions beyond simple leaderboard rankings. - ## Diagnosing missingness -Before adjusting scores it is worth understanding the pattern of missingness. +Before adjusting scores we should aim to understand the patterns of potential missingness. Models may have different coverage for legitimate reasons (a model may only forecast deaths, not cases) or because of operational failures (a missed submission deadline). `get_forecast_counts()` tabulates how many forecasts each model has, grouped by the columns you choose. @@ -50,12 +49,13 @@ death_counts[ ] ``` -Note that `get_pairwise_comparisons()` has long handled missingness internally, restricting each pair of models to their shared set of targets before comparing. +Note that `get_pairwise_comparisons()` handles missingness internally, restricting each pair of models to their shared set of targets before comparing. The functions below give you explicit control over how to handle missingness when computing score summaries. ## Scoring We score all forecasts first, then adjust the resulting scores table. +Both `filter_scores()` and `impute_missing_scores()` accept a `compare` argument (default `"model"`) that specifies which column identifies the units being compared. ```{r score} scores <- score(fc) @@ -71,7 +71,7 @@ The sections below show two approaches to addressing this: filtering scores to a ## Filtering to a common set of targets -`filter_scores()` removes scores for target combinations not shared across models. +`filter_scores()` removes scores based on the supplied strategy. This can be appropriate when a model legitimately does not cover certain targets and you want to compare only on shared ground. The default strategy, `filter_to_intersection()`, keeps only targets covered by **all** models. @@ -83,9 +83,10 @@ summarise_scores(scores_filtered, by = "model") This drops all case targets (since `UMass-MechBayes` has no case forecasts) and the death targets `epiforecasts-EpiNow2` missed. -### Relaxing with min_coverage +### Requiring partial coverage -You can relax the requirement with `min_coverage`, keeping targets covered by at least a given proportion of models. +The default requires every model to cover a target for it to be kept. +The `min_coverage` argument relaxes this by keeping targets covered by at least a given proportion of models. With four models, `min_coverage = 0.75` requires coverage by at least three. ```{r filter-relaxed} @@ -96,7 +97,9 @@ scores_relaxed <- filter_scores( summarise_scores(scores_relaxed, by = "model") ``` -This retains the case targets (covered by three of four models) while still dropping any target covered by fewer. +In this example, case targets are covered by three of four models and death targets by three or four, so `min_coverage = 0.75` retains all targets. +The default (`min_coverage = 1`) is stricter and drops all case targets because `UMass-MechBayes` has no case forecasts. +Between 0.75 and 1.0 no intermediate threshold changes the result here because no target is covered by exactly three out of four models while being missing for one that isn't `UMass-MechBayes`. ### Filtering to a specific model's targets @@ -120,9 +123,24 @@ This keeps both case and death targets where EpiNow2 submitted forecasts, but `U Instead of dropping data, `impute_missing_scores()` fills in scores for target combinations a model did not cover. Imputed rows are marked with `.imputed = TRUE` so they can be identified later. +### NA + +The simplest option: fill missing scores with `NA`. +This preserves the structure of the data without making assumptions about what the score would have been. +`NA` values propagate through summaries, so this can also serve as a diagnostic check to confirm where missingness exists. + +```{r impute-na} +scores_na <- impute_missing_scores( + scores, + strategy = impute_na_score() +) +summarise_scores(scores_na, by = "model") +``` + ### Worst score -The simplest penalty: fill each missing score with the worst (maximum) observed score for that target. +Fill each missing score with the worst (maximum) observed score for that target across all models. +This penalises models most heavily for missing targets. ```{r impute-worst} scores_worst <- impute_missing_scores( @@ -132,52 +150,47 @@ scores_worst <- impute_missing_scores( summarise_scores(scores_worst, by = "model") ``` -You can inspect which rows were imputed: +We can check that the imputed rows match the models and targets we identified as missing earlier. ```{r imputed-check} -scores_worst[(.imputed)] +scores_worst[ + (.imputed), + .(n_imputed = .N), + by = c("model", "target_type") +] ``` -### Mean score +### Reference model -A less severe penalty: fill with the mean score across models that did forecast each target. +Fill with the scores of a named baseline model, treating a missing forecast as performing no better than that baseline. -```{r impute-mean} -scores_mean <- impute_missing_scores( +```{r impute-model} +scores_ref <- impute_missing_scores( scores, - strategy = impute_mean_score() + strategy = impute_model_score("EuroCOVIDhub-baseline") ) -summarise_scores(scores_mean, by = "model") +summarise_scores(scores_ref, by = "model") ``` -### NA - -Fill with `NA`, which propagates through summaries unless explicitly handled. - -```{r impute-na} -scores_na <- impute_missing_scores( - scores, - strategy = impute_na_score() -) -summarise_scores(scores_na, by = "model") -``` +This is a reasonable default when a suitable baseline exists, though more research is needed on best practice for choosing the reference model and understanding the impact of this choice. -### Reference model +### Mean score -Fill with the scores of a named baseline model, treating a missing forecast as performing no better than the baseline. +Fill with the mean score across models that did forecast each target. +This is the least severe penalty as it assigns the average performance, which may be close to the ensemble performance. -```{r impute-model} -scores_ref <- impute_missing_scores( +```{r impute-mean} +scores_mean <- impute_missing_scores( scores, - strategy = impute_model_score("EuroCOVIDhub-baseline") + strategy = impute_mean_score() ) -summarise_scores(scores_ref, by = "model") +summarise_scores(scores_mean, by = "model") ``` ## Combining filter and impute -When you want to focus on a specific model's targets but still need complete scores for all models, combine filtering with imputation. -For example, to evaluate on the targets `epiforecasts-EpiNow2` covered and impute scores for models that are missing forecasts within that set: +Consider combining filtering and imputation when you want to focus on a specific model's targets but still need complete scores for all models. +For example, to evaluate on the targets `epiforecasts-EpiNow2` covered and then impute scores for models that are missing forecasts within that set: ```{r pipeline} result <- scores |> From 3cfcfccd170648fcca9f3f8e5b625f8ad53d7ed5 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 17:47:05 +0100 Subject: [PATCH 25/33] test: add integration tests for filter and impute with real data - filter_to_intersection(include) with scores_quantile verifies EpiNow2's targets are correctly selected - filter_to_intersection(min_coverage) boundary: 0.75 keeps cases, 1.0 drops them - filter then summarise gives equal target counts per model - impute then summarise gives equal row counts per model - impute_model_score values match reference model scores exactly Co-authored-by: Sam Abbott --- tests/testthat/test-filter-scores.R | 113 ++++++++++++++++++++ tests/testthat/test-impute-missing-scores.R | 79 ++++++++++++++ 2 files changed, 192 insertions(+) diff --git a/tests/testthat/test-filter-scores.R b/tests/testthat/test-filter-scores.R index 065e54afb..ff0cb6307 100644 --- a/tests/testthat/test-filter-scores.R +++ b/tests/testthat/test-filter-scores.R @@ -193,3 +193,116 @@ test_that("filter_scores() works with non-default compare", { expect_true(all(result$location == "DE")) expect_equal(nrow(result), 2) }) + + +# ============================================================================== +# Integration tests with scores_quantile +# ============================================================================== +test_that( + "filter_to_intersection(include) with real data filters + to model's targets", + { + scores <- scores_quantile + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + # EpiNow2 is missing 9 death targets out of 256 + epinow2_targets <- unique( + scores[ + model == "epiforecasts-EpiNow2", + target_cols, + with = FALSE + ] + ) + n_epinow2 <- nrow(epinow2_targets) + + result <- filter_scores( + scores, + strategy = filter_to_intersection( + include = "epiforecasts-EpiNow2" + ) + ) + + # All remaining targets should be EpiNow2's targets + result_targets <- unique( + result[, target_cols, with = FALSE] + ) + expect_equal(nrow(result_targets), n_epinow2) + + # The 9 death targets EpiNow2 doesn't cover + # should have been dropped + all_targets <- unique( + scores[, target_cols, with = FALSE] + ) + n_dropped_targets <- nrow(all_targets) - n_epinow2 + expect_equal(n_dropped_targets, 9) + + # Every model in the result should only have + # targets that EpiNow2 covers + data.table::setkeyv(result_targets, target_cols) + data.table::setkeyv(epinow2_targets, target_cols) + expect_equal(result_targets, epinow2_targets) + } +) + +test_that( + "filter_to_intersection(min_coverage) boundary with + real data", + { + scores <- scores_quantile + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + # 4 models total. UMass-MechBayes has no case targets + # (128/256), so case targets are covered by 3/4 = 0.75. + # At min_coverage = 0.75 case targets should be kept. + result_relaxed <- filter_scores( + scores, + strategy = filter_to_intersection( + min_coverage = 0.75 + ) + ) + relaxed_types <- unique(result_relaxed$target_type) + expect_true("Cases" %in% relaxed_types) + expect_true("Deaths" %in% relaxed_types) + + # At min_coverage = 1.0 (default), case targets + # should be dropped because UMass-MechBayes lacks them. + result_strict <- filter_scores(scores) + strict_types <- unique(result_strict$target_type) + expect_false("Cases" %in% strict_types) + expect_true("Deaths" %in% strict_types) + + # Relaxed should have strictly more rows + expect_gt(nrow(result_relaxed), nrow(result_strict)) + } +) + +test_that( + "filter_scores then summarise_scores gives equal target + counts per model", + { + scores <- scores_quantile + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + filtered <- filter_scores(scores) + + # Count distinct targets per model + targets_per_model <- filtered[, + .(n_targets = data.table::uniqueN( + .SD[, target_cols, with = FALSE] + )), + by = "model" + ] + # All models should have the same number of targets + expect_length(unique(targets_per_model$n_targets), 1) + + # Summarise should work and give a row per model + summary <- summarise_scores(filtered, by = "model") + expect_equal( + nrow(summary), + length(unique(filtered$model)) + ) + } +) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 51003f133..7eb4288a3 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -358,3 +358,82 @@ test_that( expect_equal(imputed$location, "US") } ) + + +# ============================================================================== +# Additional integration tests +# ============================================================================== +test_that( + "impute then summarise gives equal row counts per model", + { + scores <- scores_quantile + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + result <- impute_missing_scores( + scores, strategy = impute_na_score() + ) + + # After imputation, every model should have the same + # number of rows (one per target combination) + rows_per_model <- result[, .N, by = "model"] + expect_equal( + length(unique(rows_per_model$N)), 1, + info = paste( + "Expected equal rows per model, got:", + paste( + rows_per_model$model, + rows_per_model$N, + sep = "=", collapse = ", " + ) + ) + ) + } +) + +test_that( + "impute_model_score values match reference model scores", + { + scores <- scores_quantile + metrics <- attr(scores, "metrics") + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + ref_name <- "EuroCOVIDhub-baseline" + + result <- impute_missing_scores( + scores, + strategy = impute_model_score(ref_name) + ) + imputed <- result[(.imputed)] + expect_gt(nrow(imputed), 0) + + # Get the reference model's actual scores + ref_scores <- scores[model == ref_name] + + # For each imputed row, the metric values should equal + # the reference model's scores for the same target + merged <- merge( + imputed, + ref_scores, + by = target_cols, + suffixes = c(".imputed", ".ref") + ) + expect_equal(nrow(merged), nrow(imputed)) + + for (m in metrics) { + col_imp <- paste0(m, ".imputed") + col_ref <- paste0(m, ".ref") + if (col_imp %in% names(merged) && + col_ref %in% names(merged)) { + expect_equal( + merged[[col_imp]], + merged[[col_ref]], + info = paste( + "Imputed", m, + "should match reference model" + ) + ) + } + } + } +) From 22c790f076928b83eac29c11ad222d9f1491b0e3 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 17:52:19 +0100 Subject: [PATCH 26/33] docs: address vignette TODOs, suppress test messages Vignette: - Rename "Relaxing with min_coverage" to "Requiring partial coverage" with explanation of why 0.75 keeps all targets here - Mention compare argument early in the Scoring section - Reorder imputation: NA, worst, reference model, mean (severity order) - Show imputed row counts by model/target_type not raw rows - Add context to each strategy about when to use it - Soften absolute claims Tests: - Wrap filter_scores/impute_missing_scores calls in suppressMessages to reduce test output noise, matching existing test conventions Co-authored-by: Sam Abbott --- tests/testthat/test-filter-scores.R | 20 +++--- tests/testthat/test-impute-missing-scores.R | 80 +++++++++++---------- 2 files changed, 51 insertions(+), 49 deletions(-) diff --git a/tests/testthat/test-filter-scores.R b/tests/testthat/test-filter-scores.R index ff0cb6307..ef99b1ed7 100644 --- a/tests/testthat/test-filter-scores.R +++ b/tests/testthat/test-filter-scores.R @@ -66,7 +66,7 @@ test_that("filter_scores() drops incomplete targets", { wis = c(1, 2, 3) ) scores <- new_scores(scores, "wis") - result <- filter_scores(scores) + result <- suppressMessages(filter_scores(scores)) # Only DE should remain (both include have it) expect_equal(nrow(result), 2) expect_true(all(result$location == "DE")) @@ -79,7 +79,7 @@ test_that("filter_scores() preserves class and metrics", { wis = c(1, 2, 3) ) scores <- new_scores(scores, "wis") - result <- filter_scores(scores) + result <- suppressMessages(filter_scores(scores)) expect_s3_class(result, "scores") expect_equal(attr(result, "metrics"), "wis") }) @@ -187,9 +187,9 @@ test_that("filter_scores() works with non-default compare", { wis = c(1, 2, 3) ) scores <- new_scores(scores, "wis") - result <- filter_scores( + result <- suppressMessages(filter_scores( scores, compare = "forecaster" - ) + )) expect_true(all(result$location == "DE")) expect_equal(nrow(result), 2) }) @@ -216,12 +216,12 @@ test_that( ) n_epinow2 <- nrow(epinow2_targets) - result <- filter_scores( + result <- suppressMessages(filter_scores( scores, strategy = filter_to_intersection( include = "epiforecasts-EpiNow2" ) - ) + )) # All remaining targets should be EpiNow2's targets result_targets <- unique( @@ -256,19 +256,19 @@ test_that( # 4 models total. UMass-MechBayes has no case targets # (128/256), so case targets are covered by 3/4 = 0.75. # At min_coverage = 0.75 case targets should be kept. - result_relaxed <- filter_scores( + result_relaxed <- suppressMessages(filter_scores( scores, strategy = filter_to_intersection( min_coverage = 0.75 ) - ) + )) relaxed_types <- unique(result_relaxed$target_type) expect_true("Cases" %in% relaxed_types) expect_true("Deaths" %in% relaxed_types) # At min_coverage = 1.0 (default), case targets # should be dropped because UMass-MechBayes lacks them. - result_strict <- filter_scores(scores) + result_strict <- suppressMessages(filter_scores(scores)) strict_types <- unique(result_strict$target_type) expect_false("Cases" %in% strict_types) expect_true("Deaths" %in% strict_types) @@ -286,7 +286,7 @@ test_that( fu <- get_forecast_unit(scores) target_cols <- setdiff(fu, "model") - filtered <- filter_scores(scores) + filtered <- suppressMessages(filter_scores(scores)) # Count distinct targets per model targets_per_model <- filtered[, diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 7eb4288a3..05918b7f8 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -14,9 +14,9 @@ test_that( scores <- new_scores( scores, get_metrics.scores(scores_quantile) ) - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_na_score() - ) + )) expect_true(".imputed" %in% names(result)) expect_false(any(result$.imputed)) } @@ -26,9 +26,9 @@ test_that( "impute_missing_scores preserves scores class and metrics", { scores <- scores_quantile - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_na_score() - ) + )) expect_s3_class(result, "scores") expect_identical( get_metrics.scores(result), @@ -39,9 +39,9 @@ test_that( test_that(".imputed is not in get_metrics.scores output", { scores <- scores_quantile - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_na_score() - ) + )) metrics <- get_metrics.scores(result) expect_false(".imputed" %in% metrics) }) @@ -54,9 +54,9 @@ test_that(".imputed is not in get_forecast_unit output", { ".imputed not yet in get_protected_columns" ) scores <- scores_quantile - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_na_score() - ) + )) fu <- get_forecast_unit(result) expect_false(".imputed" %in% fu) }) @@ -78,9 +78,9 @@ test_that("impute_worst_score fills with max observed score", { ) scores <- scores_quantile metrics <- get_metrics.scores(scores) - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_worst_score() - ) + )) # Imputed rows should exist imputed <- result[(.imputed)] @@ -122,9 +122,9 @@ test_that("impute_mean_score fills with mean observed score", { "build_missing_grid not yet available" ) scores <- scores_quantile - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_mean_score() - ) + )) imputed <- result[(.imputed)] expect_gte(nrow(imputed), 0) }) @@ -137,9 +137,9 @@ test_that("impute_na_score fills with NA_real_", { ) scores <- scores_quantile metrics <- get_metrics.scores(scores) - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_na_score() - ) + )) imputed <- result[(.imputed)] if (nrow(imputed) > 0) { for (m in metrics) { @@ -164,12 +164,12 @@ test_that( scores <- scores_quantile # EuroCOVIDhub-baseline has all 256 targets so can # serve as reference for all missing combinations - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_model_score( "EuroCOVIDhub-baseline" ) - ) + )) expect_s3_class(result, "scores") } ) @@ -210,9 +210,9 @@ test_that("custom strategy function works", { } scores <- scores_quantile metrics <- get_metrics.scores(scores) - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = custom_strategy - ) + )) imputed <- result[(.imputed)] if (nrow(imputed) > 0) { for (m in metrics) { @@ -235,9 +235,10 @@ test_that( "build_missing_grid not yet available" ) scores <- scores_quantile - result <- scores |> - impute_missing_scores(strategy = impute_na_score()) |> - summarise_scores(by = "model") + imputed <- suppressMessages(impute_missing_scores( + scores, strategy = impute_na_score() + )) + result <- summarise_scores(imputed, by = "model") expect_s3_class(result, "data.table") } ) @@ -263,15 +264,16 @@ test_that( scores <- scores_quantile ref_model <- "EuroCOVIDhub-baseline" - result <- scores |> - filter_scores( - strategy = filter_to_intersection( - include = ref_model - ) - ) |> - impute_missing_scores( - strategy = impute_model_score(ref_model) + filtered <- suppressMessages(filter_scores( + scores, + strategy = filter_to_intersection( + include = ref_model ) + )) + result <- suppressMessages(impute_missing_scores( + filtered, + strategy = impute_model_score(ref_model) + )) expect_s3_class(result, "scores") } ) @@ -287,11 +289,11 @@ test_that( # Attribute claims "wis" and "fake" but "fake" is # not a column scores <- new_scores(scores, c("wis", "fake")) - result <- suppressWarnings( + result <- suppressWarnings(suppressMessages( impute_missing_scores( scores, strategy = impute_worst_score() ) - ) + )) imputed <- result[result$.imputed] expect_equal(nrow(imputed), 1) expect_false("fake" %in% names(imputed)) @@ -307,11 +309,11 @@ test_that( wis = c(1, 2, 3) ) scores <- new_scores(scores, c("wis", "fake")) - result <- suppressWarnings( + result <- suppressWarnings(suppressMessages( impute_missing_scores( scores, strategy = impute_mean_score() ) - ) + )) imputed <- result[result$.imputed] expect_equal(nrow(imputed), 1) expect_false("fake" %in% names(imputed)) @@ -346,11 +348,11 @@ test_that( wis = c(1, 2, 3) ) scores <- new_scores(scores, "wis") - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_worst_score(), compare = "forecaster" - ) + )) expect_true(".imputed" %in% names(result)) imputed <- result[result$.imputed] expect_equal(nrow(imputed), 1) @@ -370,9 +372,9 @@ test_that( fu <- get_forecast_unit(scores) target_cols <- setdiff(fu, "model") - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_na_score() - ) + )) # After imputation, every model should have the same # number of rows (one per target combination) @@ -400,10 +402,10 @@ test_that( target_cols <- setdiff(fu, "model") ref_name <- "EuroCOVIDhub-baseline" - result <- impute_missing_scores( + result <- suppressMessages(impute_missing_scores( scores, strategy = impute_model_score(ref_name) - ) + )) imputed <- result[(.imputed)] expect_gt(nrow(imputed), 0) From a7934c6bf6c30d41c7af1babba8f3dfb06fee8c4 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 18:05:12 +0100 Subject: [PATCH 27/33] style: use bare i = in cli_inform to avoid nolint blocks Co-authored-by: Sam Abbott --- R/filter-scores.R | 10 +++------- R/impute-missing-scores.R | 10 +++------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/R/filter-scores.R b/R/filter-scores.R index 71c691457..fb0744938 100644 --- a/R/filter-scores.R +++ b/R/filter-scores.R @@ -47,20 +47,16 @@ filter_scores <- function( #nolint end if (n_dropped == 0) { - #nolint start: keyword_quote_linter cli_inform(c( - "i" = "No rows filtered. Returning scores unchanged." + i = "No rows filtered. Returning scores unchanged." )) - #nolint end return(scores) } - #nolint start: keyword_quote_linter cli_inform(c( - "i" = "Filtered out {n_dropped} rows.", - "i" = "{n_after} of {n_before} rows remaining." + i = "Filtered out {n_dropped} rows.", + i = "{n_after} of {n_before} rows remaining." # nolint: duplicate_argument_linter )) - #nolint end # Preserve class and metrics class(result) <- original_class diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index 893abf646..0c0d66ac2 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -67,11 +67,9 @@ impute_missing_scores <- function( missing_rows <- build_missing_grid(scores, compare) # nolint: object_usage_linter if (nrow(missing_rows) == 0) { - #nolint start: keyword_quote_linter cli_inform(c( - "i" = "No missing scores to impute. Returning scores unchanged." + i = "No missing scores to impute. Returning scores unchanged." )) - #nolint end data.table::set(scores, j = ".imputed", value = FALSE) return(scores[]) } @@ -80,12 +78,10 @@ impute_missing_scores <- function( n_missing <- nrow(missing_rows) n_comparators <- length(unique(missing_rows[[compare]])) #nolint end - #nolint start: keyword_quote_linter cli_inform(c( - "i" = "Imputing {n_missing} missing score row{?s}.", - "i" = "{n_comparators} {compare} {cli::qty(n_comparators)}value{?s} affected." # nolint: line_length_linter + i = "Imputing {n_missing} missing score row{?s}.", + i = "{n_comparators} {compare} {cli::qty(n_comparators)}value{?s} affected." # nolint: line_length_linter )) - #nolint end filled <- strategy(scores, missing_rows, metrics, compare) From 7c88db22c042ae4ac0bd50f36856b2833c17961b Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 18:06:09 +0100 Subject: [PATCH 28/33] style: remove unhelpful comment in build_missing_grid Co-authored-by: Sam Abbott --- R/missing-scores-internal.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/missing-scores-internal.R b/R/missing-scores-internal.R index 30cce93aa..5b6f94295 100644 --- a/R/missing-scores-internal.R +++ b/R/missing-scores-internal.R @@ -21,7 +21,6 @@ build_missing_grid <- function(scores, compare = "model") { forecast_unit <- get_forecast_unit(scores) target_cols <- setdiff(forecast_unit, compare) - # Observed target combinations (NOT per-column expand.grid) targets <- unique(scores[, target_cols, with = FALSE]) # All unique compare values From 1803d9c3cc31e269869526c179cae620e8117d25 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 18:08:03 +0100 Subject: [PATCH 29/33] docs: add Post-process scores section to pkgdown reference Move filter_scores, filter_to_intersection, and impute_* functions from handle-metrics to a new postprocess-scores keyword and pkgdown section, separating them from metric selection functions. Co-authored-by: Sam Abbott --- R/filter-scores.R | 4 ++-- R/impute-missing-scores.R | 10 +++++----- _pkgdown.yml | 3 +++ man/filter_scores.Rd | 2 +- man/filter_to_intersection.Rd | 2 +- man/impute_mean_score.Rd | 2 +- man/impute_missing_scores.Rd | 2 +- man/impute_model_score.Rd | 2 +- man/impute_na_score.Rd | 2 +- man/impute_worst_score.Rd | 2 +- 10 files changed, 17 insertions(+), 14 deletions(-) diff --git a/R/filter-scores.R b/R/filter-scores.R index fb0744938..dc66fc8a7 100644 --- a/R/filter-scores.R +++ b/R/filter-scores.R @@ -24,7 +24,7 @@ #' @importFrom checkmate assert_class assert_character #' assert_function assert_subset #' @export -#' @keywords handle-metrics +#' @keywords postprocess-scores filter_scores <- function( scores, strategy = filter_to_intersection(), @@ -89,7 +89,7 @@ filter_scores <- function( #' @importFrom data.table as.data.table setkeyv #' @importFrom checkmate assert_number assert_character #' @export -#' @keywords handle-metrics +#' @keywords postprocess-scores filter_to_intersection <- function( min_coverage = 1, include = NULL diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index 0c0d66ac2..93723e0e3 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -40,7 +40,7 @@ #' assert_character assert_subset #' @importFrom cli cli_abort cli_inform #' @export -#' @keywords handle-metrics +#' @keywords postprocess-scores #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -107,7 +107,7 @@ impute_missing_scores <- function( #' @return A function suitable for use as the `strategy` #' argument in [impute_missing_scores()]. #' @export -#' @keywords handle-metrics +#' @keywords postprocess-scores #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -158,7 +158,7 @@ impute_worst_score <- function() { #' @return A function suitable for use as the `strategy` #' argument in [impute_missing_scores()]. #' @export -#' @keywords handle-metrics +#' @keywords postprocess-scores #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -205,7 +205,7 @@ impute_mean_score <- function() { #' @return A function suitable for use as the `strategy` #' argument in [impute_missing_scores()]. #' @export -#' @keywords handle-metrics +#' @keywords postprocess-scores #' @examples #' \dontshow{ #' data.table::setDTthreads(2) @@ -242,7 +242,7 @@ impute_na_score <- function() { #' #' @importFrom cli cli_abort #' @export -#' @keywords handle-metrics +#' @keywords postprocess-scores #' @examples #' \dontshow{ #' data.table::setDTthreads(2) diff --git a/_pkgdown.yml b/_pkgdown.yml index 34853a432..91c4f6038 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -50,6 +50,9 @@ reference: - title: Handling metrics/scoring functions contents: - has_keyword("handle-metrics") + - title: Post-process scores + contents: + - has_keyword("postprocess-scores") - title: Evaluate forecasts contents: - has_keyword("scoring") diff --git a/man/filter_scores.Rd b/man/filter_scores.Rd index 5b129b121..7856e0529 100644 --- a/man/filter_scores.Rd +++ b/man/filter_scores.Rd @@ -32,4 +32,4 @@ all logic to the strategy. \seealso{ \code{vignette("handling-missing-forecasts")} } -\keyword{handle-metrics} +\keyword{postprocess-scores} diff --git a/man/filter_to_intersection.Rd b/man/filter_to_intersection.Rd index a1a04d336..f849c212c 100644 --- a/man/filter_to_intersection.Rd +++ b/man/filter_to_intersection.Rd @@ -27,4 +27,4 @@ Strategy factory for \code{\link[=filter_scores]{filter_scores()}}. Returns a function that keeps only target combinations covered by a minimum proportion of comparators. } -\keyword{handle-metrics} +\keyword{postprocess-scores} diff --git a/man/impute_mean_score.Rd b/man/impute_mean_score.Rd index c00796cc0..1caf41367 100644 --- a/man/impute_mean_score.Rd +++ b/man/impute_mean_score.Rd @@ -26,4 +26,4 @@ scores <- example_quantile |> impute_missing_scores(scores, strategy = impute_mean_score()) } -\keyword{handle-metrics} +\keyword{postprocess-scores} diff --git a/man/impute_missing_scores.Rd b/man/impute_missing_scores.Rd index e81a7fbcd..5c63dc0a5 100644 --- a/man/impute_missing_scores.Rd +++ b/man/impute_missing_scores.Rd @@ -58,4 +58,4 @@ impute_missing_scores(scores, strategy = impute_na_score()) \seealso{ \code{vignette("handling-missing-forecasts")} } -\keyword{handle-metrics} +\keyword{postprocess-scores} diff --git a/man/impute_model_score.Rd b/man/impute_model_score.Rd index b75f86957..fa027a11c 100644 --- a/man/impute_model_score.Rd +++ b/man/impute_model_score.Rd @@ -34,4 +34,4 @@ impute_missing_scores( strategy = impute_model_score("EuroCOVIDhub-baseline") ) } -\keyword{handle-metrics} +\keyword{postprocess-scores} diff --git a/man/impute_na_score.Rd b/man/impute_na_score.Rd index 399af054c..c1a812cd4 100644 --- a/man/impute_na_score.Rd +++ b/man/impute_na_score.Rd @@ -24,4 +24,4 @@ scores <- example_quantile |> impute_missing_scores(scores, strategy = impute_na_score()) } -\keyword{handle-metrics} +\keyword{postprocess-scores} diff --git a/man/impute_worst_score.Rd b/man/impute_worst_score.Rd index 48c5e6806..e6528caaf 100644 --- a/man/impute_worst_score.Rd +++ b/man/impute_worst_score.Rd @@ -26,4 +26,4 @@ scores <- example_quantile |> impute_missing_scores(scores, strategy = impute_worst_score()) } -\keyword{handle-metrics} +\keyword{postprocess-scores} From cfdc3c401fcef55c4fbe03d067255cc377f9089f Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Tue, 31 Mar 2026 18:11:08 +0100 Subject: [PATCH 30/33] test: verify impute_mean_score values, original rows unchanged, filter messages - impute_mean_score now verifies imputed values match per-target mean - imputation preserves original score values (compared as numeric to handle logical-to-integer coercion from rbindlist) - filter_scores reports correct "Filtered out N rows" message Co-authored-by: Sam Abbott --- tests/testthat/test-filter-scores.R | 13 +++++ tests/testthat/test-impute-missing-scores.R | 55 ++++++++++++++++++++- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/tests/testthat/test-filter-scores.R b/tests/testthat/test-filter-scores.R index ef99b1ed7..95d00f3bb 100644 --- a/tests/testthat/test-filter-scores.R +++ b/tests/testthat/test-filter-scores.R @@ -98,6 +98,19 @@ test_that("filter_scores() unchanged when nothing missing", { expect_equal(nrow(result), 4) }) +test_that("filter_scores() reports rows dropped", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + expect_message( + filter_scores(scores), + "Filtered out 1 row" + ) +}) + test_that("filter_scores() errors on invalid compare column", { scores <- data.table::data.table( model = c("A", "A", "B"), diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 05918b7f8..641d2090e 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -122,11 +122,38 @@ test_that("impute_mean_score fills with mean observed score", { "build_missing_grid not yet available" ) scores <- scores_quantile + metrics <- get_metrics.scores(scores) result <- suppressMessages(impute_missing_scores( scores, strategy = impute_mean_score() )) imputed <- result[(.imputed)] - expect_gte(nrow(imputed), 0) + expect_gt(nrow(imputed), 0) + + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + for (m in metrics) { + if (!(m %in% names(imputed)) || + !(m %in% names(scores))) next + mean_per_target <- scores[, + .(..mean = mean(get(m), na.rm = TRUE)), + by = target_cols + ] + merged <- merge( + imputed, mean_per_target, + by = target_cols, all.x = TRUE + ) + expect_true( + all( + abs(merged[[m]] - merged[["..mean"]]) < 1e-10 | + is.na(merged[[m]]) + ), + info = paste( + "metric", m, + "does not match per-target mean" + ) + ) + } }) test_that("impute_na_score fills with NA_real_", { @@ -439,3 +466,29 @@ test_that( } } ) + +test_that( + "imputation does not alter original score values", + { + scores <- scores_quantile + metrics <- get_metrics.scores(scores) + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_worst_score() + )) + + originals <- result[!(.imputed)] + originals[, .imputed := NULL] + + # Original metric values should be unchanged + for (m in metrics) { + if (!(m %in% names(originals))) next + data.table::setkeyv(originals, names(scores)) + data.table::setkeyv(scores, names(scores)) + expect_equal( + as.numeric(originals[[m]]), + as.numeric(scores[[m]]), + info = paste("metric", m, "changed") + ) + } + } +) From 1cf4beb454ff10c484caf3c0716ef72efa88b307 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Wed, 1 Apr 2026 10:41:12 +0100 Subject: [PATCH 31/33] test: update vdiffr plot snapshots after merge from main Co-authored-by: Sam Abbott --- .../plot-pairwise-comparison-pval.svg | 8 ++++---- .../pairwise_comparison/plot-pairwise-comparison.svg | 8 ++++---- tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg | 8 ++++---- tests/testthat/_snaps/plot_wis/plot-wis-flip.svg | 12 ++++++------ .../_snaps/plot_wis/plot-wis-no-relative.svg | 12 ++++++------ tests/testthat/_snaps/plot_wis/plot-wis.svg | 12 ++++++------ 6 files changed, 30 insertions(+), 30 deletions(-) diff --git a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg index b22701e63..a8d99587f 100644 --- a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg +++ b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg @@ -29,8 +29,8 @@ - + @@ -38,8 +38,8 @@ < 0.001 < 0.001 1 -< 0.001 0.298 +< 0.001 1 0.298 < 0.001 @@ -57,9 +57,9 @@ - + @@ -73,9 +73,9 @@ < 0.001 < 0.001 1 -< 0.001 < 0.001 < 0.001 +< 0.001 1 0.007 < 0.001 diff --git a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg index 372e416d9..aa58d5f93 100644 --- a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg +++ b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg @@ -29,8 +29,8 @@ - + @@ -38,8 +38,8 @@ 1.37 1.59 1 -0.63 0.86 +0.63 1 1.16 0.73 @@ -57,9 +57,9 @@ - + @@ -73,9 +73,9 @@ 3.03 3.85 1 -0.26 0.62 0.79 +0.26 1 0.74 1.27 diff --git a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg index 74ca3f682..fb64844df 100644 --- a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg +++ b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg @@ -26,18 +26,18 @@ - + - + --0.06 0.1 +-0.06 -0.08 -0.07 0.34 +0.07 -0.02 -0.01 diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg index aae13f7d0..00103e3ab 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg @@ -26,14 +26,14 @@ - + - + - + @@ -44,16 +44,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg index 2c21e6aca..a271f2373 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg @@ -26,14 +26,14 @@ - + - + - + @@ -44,16 +44,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/plot_wis/plot-wis.svg b/tests/testthat/_snaps/plot_wis/plot-wis.svg index 6d842785e..c03f3893d 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis.svg @@ -26,14 +26,14 @@ - + - + - + @@ -44,16 +44,16 @@ - + - + - + From 0d3cf4bdb9e063441898fa7288864555645134b1 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Wed, 22 Apr 2026 15:37:24 +0100 Subject: [PATCH 32/33] refactor(filter-scores): split intersection strategy and tidy Addresses inline review on PR #1156. - Split filter_to_intersection() into two strategies: filter_to_intersection(min_coverage) for proportion-based coverage and filter_to_include(include) for intersection of named compare values. The two branches were mutually exclusive and now have single-purpose docs. - filter_to_include() uses Reduce() for the merge chain and drops redundant setkeyv calls since merge's by handles the join. - filter_scores() now returns via new_scores() so class and metric preservation follow the same path as impute_missing_scores(). - Add internal assert_strategy() to validate strategy formals (used next commit). Update tests and the handling-missing vignette to use filter_to_include(). --- NAMESPACE | 2 + R/filter-scores.R | 258 ++++++++++++-------- R/missing-scores-internal.R | 40 +++ man/assert_strategy.Rd | 27 ++ man/filter_scores.Rd | 55 ++++- man/filter_to_include.Rd | 45 ++++ man/filter_to_intersection.Rd | 46 ++-- man/get_forecast_type_ids.Rd | 2 +- tests/testthat/test-filter-scores.R | 20 +- tests/testthat/test-impute-missing-scores.R | 8 +- vignettes/handling-missing-forecasts.Rmd | 10 +- 11 files changed, 363 insertions(+), 150 deletions(-) create mode 100644 man/assert_strategy.Rd create mode 100644 man/filter_to_include.Rd diff --git a/NAMESPACE b/NAMESPACE index b5b246eb0..8dea0e210 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -79,6 +79,7 @@ export(dispersion_sample) export(dss_sample) export(energy_score_multivariate) export(filter_scores) +export(filter_to_include) export(filter_to_intersection) export(get_correlations) export(get_coverage) @@ -192,6 +193,7 @@ importFrom(data.table,setcolorder) importFrom(data.table,setkeyv) importFrom(data.table,setnames) importFrom(data.table,setorderv) +importFrom(data.table,uniqueN) importFrom(ggplot2,.data) importFrom(ggplot2,`%+replace%`) importFrom(ggplot2,aes) diff --git a/R/filter-scores.R b/R/filter-scores.R index dc66fc8a7..eb4c0daec 100644 --- a/R/filter-scores.R +++ b/R/filter-scores.R @@ -1,30 +1,58 @@ #' @title Filter scores #' #' @description -#' Filters a `scores` object according to a given strategy. -#' The filtering behaviour is controlled by the `strategy` -#' argument, which defaults to [filter_to_intersection()]. -#' This is a general-purpose filtering function that delegates -#' all logic to the strategy. +#' Filter a `scores` object using a supplied strategy function. +#' `filter_scores()` is responsible for preserving the `scores` +#' class and the `metrics` attribute; the strategy is +#' responsible only for the filtering logic. +#' +#' Strategies are constructed by helpers such as +#' [filter_to_intersection()] and [filter_to_include()] and can +#' also be user-defined. A strategy is a function with +#' signature `function(scores, compare)` that returns a +#' filtered data.table with the same columns as its input. #' #' @param scores An object of class `scores` (a data.table with -#' scores and an additional attribute `metrics` as produced -#' by [score()]). -#' @param strategy A strategy function as returned by -#' [filter_to_intersection()]. Default is -#' `filter_to_intersection()`. +#' an additional `metrics` attribute as produced by [score()]). +#' @param strategy A strategy function. See Description for the +#' expected signature. Default: [filter_to_intersection()]. #' @param compare Character string (default `"model"`) naming the -#' column whose values are compared for filtering. +#' column whose values are compared when deciding which +#' target combinations to keep. #' -#' @return A filtered `scores` object with the same class and -#' `metrics` attribute as the input. +#' @return A `scores` object with the same class and `metrics` +#' attribute as the input, with rows filtered according to +#' `strategy`. #' -#' @seealso \code{vignette("handling-missing-forecasts")} +#' @seealso [filter_to_intersection()], [filter_to_include()], +#' \code{vignette("handling-missing-forecasts")} #' @importFrom cli cli_inform #' @importFrom checkmate assert_class assert_character -#' assert_function assert_subset +#' assert_subset #' @export #' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' # Keep only targets covered by every model (the default) +#' filter_scores(scores) +#' +#' # Keep targets covered by at least 75% of models +#' filter_scores( +#' scores, +#' strategy = filter_to_intersection(min_coverage = 0.75) +#' ) +#' +#' # Keep only targets covered by a named model +#' filter_scores( +#' scores, +#' strategy = filter_to_include("EuroCOVIDhub-baseline") +#' ) filter_scores <- function( scores, strategy = filter_to_intersection(), @@ -33,18 +61,15 @@ filter_scores <- function( assert_class(scores, "scores") assert_character(compare, len = 1) assert_subset(compare, names(scores)) - assert_function(strategy) + assert_strategy(strategy, required = "compare") - original_class <- class(scores) original_metrics <- attr(scores, "metrics") result <- strategy(scores, compare = compare) n_before <- nrow(scores) n_after <- nrow(result) - #nolint start: object_usage_linter n_dropped <- n_before - n_after - #nolint end if (n_dropped == 0) { cli_inform(c( @@ -58,109 +83,146 @@ filter_scores <- function( i = "{n_after} of {n_before} rows remaining." # nolint: duplicate_argument_linter )) - # Preserve class and metrics - class(result) <- original_class - data.table::setattr(result, "metrics", original_metrics) - - return(result) + return(new_scores(result, original_metrics)) } -#' @title Filter to intersection of model-target combinations +#' @title Filter to target combinations meeting a coverage threshold #' #' @description -#' Strategy factory for [filter_scores()]. -#' Returns a function that keeps only target combinations -#' covered by a minimum proportion of comparators. +#' Strategy for [filter_scores()] that keeps target combinations +#' covered by at least `min_coverage` of the values in the +#' `compare` column. With the default `min_coverage = 1`, only +#' target combinations present for every compare value are kept +#' (strict intersection across the full set). +#' +#' To restrict to the targets covered by a named subset of +#' compare values instead of by a proportion, use +#' [filter_to_include()]. #' #' @param min_coverage Numeric between 0 and 1 (default `1`). -#' Minimum proportion of comparators that must cover a +#' Minimum proportion of compare values that must cover a #' target combination for it to be kept. -#' @param include Character vector or `NULL` (default). If -#' provided, the target grid is restricted to targets -#' covered by these values of the `compare` column. When -#' multiple values are given, only the intersection of -#' their targets is used. #' -#' @return A function with signature `function(scores, compare)` -#' suitable for use as a strategy in -#' [filter_scores()]. +#' @return A strategy function for [filter_scores()]. Intended +#' to be passed to `filter_scores()` rather than called +#' directly — `filter_scores()` is where the `scores` class +#' and `metrics` attribute are preserved. #' -#' @importFrom data.table as.data.table setkeyv -#' @importFrom checkmate assert_number assert_character +#' @seealso [filter_scores()], [filter_to_include()] +#' @importFrom data.table as.data.table setkeyv uniqueN +#' @importFrom checkmate assert_number #' @export #' @keywords postprocess-scores -filter_to_intersection <- function( - min_coverage = 1, - include = NULL -) { +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' filter_scores( +#' scores, +#' strategy = filter_to_intersection(min_coverage = 0.75) +#' ) +filter_to_intersection <- function(min_coverage = 1) { assert_number(min_coverage, lower = 0, upper = 1) - if (!is.null(include)) { - assert_character(include, min.len = 1) + + function(scores, compare = "model") { + scores <- data.table::as.data.table(scores) + forecast_unit <- get_forecast_unit(scores) + target_cols <- setdiff(forecast_unit, compare) + + n_total <- data.table::uniqueN(scores[[compare]]) + + target_coverage <- scores[, + .(n_compare = data.table::uniqueN(get(compare))), + by = target_cols + ] + + keep <- target_coverage$n_compare / n_total >= min_coverage + qualifying <- target_coverage[keep, target_cols, with = FALSE] + + data.table::setkeyv(scores, target_cols) + data.table::setkeyv(qualifying, target_cols) + scores[qualifying, nomatch = NULL] } +} + + +#' @title Filter to targets covered by named compare values +#' +#' @description +#' Strategy for [filter_scores()] that restricts the kept +#' target combinations to those covered by every value listed +#' in `include`. With a single value this keeps only that +#' value's targets; with several values, the intersection of +#' their target sets is kept. +#' +#' To use a proportion-based threshold over all compare values +#' instead, use [filter_to_intersection()]. +#' +#' @param include Character vector of length one or more. Values +#' from the `compare` column whose target sets should be +#' intersected. +#' +#' @return A strategy function for [filter_scores()]. Intended +#' to be passed to `filter_scores()` rather than called +#' directly — `filter_scores()` is where the `scores` class +#' and `metrics` attribute are preserved. +#' +#' @seealso [filter_scores()], [filter_to_intersection()] +#' @importFrom data.table as.data.table setkeyv +#' @importFrom checkmate assert_character +#' @importFrom cli cli_abort +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' filter_scores( +#' scores, +#' strategy = filter_to_include("EuroCOVIDhub-baseline") +#' ) +filter_to_include <- function(include) { + assert_character(include, min.len = 1) function(scores, compare = "model") { scores <- data.table::as.data.table(scores) forecast_unit <- get_forecast_unit(scores) target_cols <- setdiff(forecast_unit, compare) - if (!is.null(include)) { - unknown <- setdiff(include, unique(scores[[compare]])) - if (length(unknown) > 0) { - cli::cli_abort(c( - "!" = paste0( - "{.val {unknown}} not found in ", - "{.arg {compare}} column." - ) - )) - } - # Restrict to targets covered by specified values - model_targets <- lapply(include, function(m) { - unique( - scores[ - scores[[compare]] == m, - target_cols, - with = FALSE - ] + unknown <- setdiff(include, unique(scores[[compare]])) + if (length(unknown) > 0) { + cli_abort(c( + "!" = paste0( + "{.val {unknown}} not found in ", + "{.arg {compare}} column." ) - }) - # Intersection of all specified values' targets - qualifying <- model_targets[[1]] - if (length(model_targets) > 1) { - for (i in seq(2, length(model_targets))) { - data.table::setkeyv(qualifying, target_cols) - data.table::setkeyv( - model_targets[[i]], target_cols - ) - qualifying <- merge( - qualifying, model_targets[[i]], - by = target_cols - ) - } - } - } else { - # Count include per target combination - all_include <- unique(scores[[compare]]) - n_total <- length(all_include) - - target_coverage <- scores[ - , .(n_include = data.table::uniqueN(get(compare))), - by = target_cols - ] - #nolint start: object_usage_linter - qualifying <- target_coverage[ - n_include / n_total >= min_coverage, - #nolint end - target_cols, - with = FALSE - ] + )) } - # Semi-join: keep scores rows matching qualifying targets + target_sets <- lapply(include, function(v) { + unique( + scores[ + scores[[compare]] == v, + target_cols, + with = FALSE + ] + ) + }) + + qualifying <- Reduce( + function(a, b) merge(a, b, by = target_cols), + target_sets + ) + data.table::setkeyv(scores, target_cols) data.table::setkeyv(qualifying, target_cols) - result <- scores[qualifying, nomatch = NULL] - - return(result) + scores[qualifying, nomatch = NULL] } } diff --git a/R/missing-scores-internal.R b/R/missing-scores-internal.R index 5b6f94295..ab54e1210 100644 --- a/R/missing-scores-internal.R +++ b/R/missing-scores-internal.R @@ -1,3 +1,43 @@ +#' Assert that a strategy has the expected signature +#' +#' @description +#' Internal helper used by [filter_scores()] and +#' [impute_missing_scores()] to check that a user-supplied +#' strategy function has at least the required named formals. +#' This catches common mistakes early (e.g. forgetting the +#' `compare` argument) without constraining the strategy +#' author to a specific internal type. +#' +#' @param strategy A function. +#' @param required Character vector of formal names that +#' `strategy` must accept. +#' +#' @return `invisible(NULL)`. Called for its side effect of +#' erroring when the check fails. +#' +#' @importFrom checkmate assert_function +#' @importFrom cli cli_abort +#' @keywords internal +assert_strategy <- function(strategy, required) { + assert_function(strategy) + strategy_formals <- names(formals(strategy)) + missing_args <- setdiff(required, strategy_formals) + if (length(missing_args) > 0) { + cli_abort(c( + "!" = paste0( + "Strategy function is missing required ", + "argument{?s}: {.arg {missing_args}}." + ), + i = paste0( + "Expected formals including: ", + "{.arg {required}}." + ) + )) + } + return(invisible(NULL)) +} + + #' Build grid of missing model-target combinations #' #' @description diff --git a/man/assert_strategy.Rd b/man/assert_strategy.Rd new file mode 100644 index 000000000..389ff57b9 --- /dev/null +++ b/man/assert_strategy.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/missing-scores-internal.R +\name{assert_strategy} +\alias{assert_strategy} +\title{Assert that a strategy has the expected signature} +\usage{ +assert_strategy(strategy, required) +} +\arguments{ +\item{strategy}{A function.} + +\item{required}{Character vector of formal names that +\code{strategy} must accept.} +} +\value{ +\code{invisible(NULL)}. Called for its side effect of +erroring when the check fails. +} +\description{ +Internal helper used by \code{\link[=filter_scores]{filter_scores()}} and +\code{\link[=impute_missing_scores]{impute_missing_scores()}} to check that a user-supplied +strategy function has at least the required named formals. +This catches common mistakes early (e.g. forgetting the +\code{compare} argument) without constraining the strategy +author to a specific internal type. +} +\keyword{internal} diff --git a/man/filter_scores.Rd b/man/filter_scores.Rd index 7856e0529..76b32b474 100644 --- a/man/filter_scores.Rd +++ b/man/filter_scores.Rd @@ -8,28 +8,57 @@ filter_scores(scores, strategy = filter_to_intersection(), compare = "model") } \arguments{ \item{scores}{An object of class \code{scores} (a data.table with -scores and an additional attribute \code{metrics} as produced -by \code{\link[=score]{score()}}).} +an additional \code{metrics} attribute as produced by \code{\link[=score]{score()}}).} -\item{strategy}{A strategy function as returned by -\code{\link[=filter_to_intersection]{filter_to_intersection()}}. Default is -\code{filter_to_intersection()}.} +\item{strategy}{A strategy function. See Description for the +expected signature. Default: \code{\link[=filter_to_intersection]{filter_to_intersection()}}.} \item{compare}{Character string (default \code{"model"}) naming the -column whose values are compared for filtering.} +column whose values are compared when deciding which +target combinations to keep.} } \value{ -A filtered \code{scores} object with the same class and -\code{metrics} attribute as the input. +A \code{scores} object with the same class and \code{metrics} +attribute as the input, with rows filtered according to +\code{strategy}. } \description{ -Filters a \code{scores} object according to a given strategy. -The filtering behaviour is controlled by the \code{strategy} -argument, which defaults to \code{\link[=filter_to_intersection]{filter_to_intersection()}}. -This is a general-purpose filtering function that delegates -all logic to the strategy. +Filter a \code{scores} object using a supplied strategy function. +\code{filter_scores()} is responsible for preserving the \code{scores} +class and the \code{metrics} attribute; the strategy is +responsible only for the filtering logic. + +Strategies are constructed by helpers such as +\code{\link[=filter_to_intersection]{filter_to_intersection()}} and \code{\link[=filter_to_include]{filter_to_include()}} and can +also be user-defined. A strategy is a function with +signature \verb{function(scores, compare)} that returns a +filtered data.table with the same columns as its input. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +# Keep only targets covered by every model (the default) +filter_scores(scores) + +# Keep targets covered by at least 75\% of models +filter_scores( + scores, + strategy = filter_to_intersection(min_coverage = 0.75) +) + +# Keep only targets covered by a named model +filter_scores( + scores, + strategy = filter_to_include("EuroCOVIDhub-baseline") +) } \seealso{ +\code{\link[=filter_to_intersection]{filter_to_intersection()}}, \code{\link[=filter_to_include]{filter_to_include()}}, \code{vignette("handling-missing-forecasts")} } \keyword{postprocess-scores} diff --git a/man/filter_to_include.Rd b/man/filter_to_include.Rd new file mode 100644 index 000000000..983129d9a --- /dev/null +++ b/man/filter_to_include.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter-scores.R +\name{filter_to_include} +\alias{filter_to_include} +\title{Filter to targets covered by named compare values} +\usage{ +filter_to_include(include) +} +\arguments{ +\item{include}{Character vector of length one or more. Values +from the \code{compare} column whose target sets should be +intersected.} +} +\value{ +A strategy function for \code{\link[=filter_scores]{filter_scores()}}. Intended +to be passed to \code{filter_scores()} rather than called +directly — \code{filter_scores()} is where the \code{scores} class +and \code{metrics} attribute are preserved. +} +\description{ +Strategy for \code{\link[=filter_scores]{filter_scores()}} that restricts the kept +target combinations to those covered by every value listed +in \code{include}. With a single value this keeps only that +value's targets; with several values, the intersection of +their target sets is kept. + +To use a proportion-based threshold over all compare values +instead, use \code{\link[=filter_to_intersection]{filter_to_intersection()}}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() +filter_scores( + scores, + strategy = filter_to_include("EuroCOVIDhub-baseline") +) +} +\seealso{ +\code{\link[=filter_scores]{filter_scores()}}, \code{\link[=filter_to_intersection]{filter_to_intersection()}} +} +\keyword{postprocess-scores} diff --git a/man/filter_to_intersection.Rd b/man/filter_to_intersection.Rd index f849c212c..5f27eaf2c 100644 --- a/man/filter_to_intersection.Rd +++ b/man/filter_to_intersection.Rd @@ -2,29 +2,45 @@ % Please edit documentation in R/filter-scores.R \name{filter_to_intersection} \alias{filter_to_intersection} -\title{Filter to intersection of model-target combinations} +\title{Filter to target combinations meeting a coverage threshold} \usage{ -filter_to_intersection(min_coverage = 1, include = NULL) +filter_to_intersection(min_coverage = 1) } \arguments{ \item{min_coverage}{Numeric between 0 and 1 (default \code{1}). -Minimum proportion of comparators that must cover a +Minimum proportion of compare values that must cover a target combination for it to be kept.} - -\item{include}{Character vector or \code{NULL} (default). If -provided, the target grid is restricted to targets -covered by these values of the \code{compare} column. When -multiple values are given, only the intersection of -their targets is used.} } \value{ -A function with signature \verb{function(scores, compare)} -suitable for use as a strategy in -\code{\link[=filter_scores]{filter_scores()}}. +A strategy function for \code{\link[=filter_scores]{filter_scores()}}. Intended +to be passed to \code{filter_scores()} rather than called +directly — \code{filter_scores()} is where the \code{scores} class +and \code{metrics} attribute are preserved. } \description{ -Strategy factory for \code{\link[=filter_scores]{filter_scores()}}. -Returns a function that keeps only target combinations -covered by a minimum proportion of comparators. +Strategy for \code{\link[=filter_scores]{filter_scores()}} that keeps target combinations +covered by at least \code{min_coverage} of the values in the +\code{compare} column. With the default \code{min_coverage = 1}, only +target combinations present for every compare value are kept +(strict intersection across the full set). + +To restrict to the targets covered by a named subset of +compare values instead of by a proportion, use +\code{\link[=filter_to_include]{filter_to_include()}}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() +filter_scores( + scores, + strategy = filter_to_intersection(min_coverage = 0.75) +) +} +\seealso{ +\code{\link[=filter_scores]{filter_scores()}}, \code{\link[=filter_to_include]{filter_to_include()}} } \keyword{postprocess-scores} diff --git a/man/get_forecast_type_ids.Rd b/man/get_forecast_type_ids.Rd index 5bd5b81d7..d624b017b 100644 --- a/man/get_forecast_type_ids.Rd +++ b/man/get_forecast_type_ids.Rd @@ -26,7 +26,7 @@ get_forecast_type_ids(data) } \arguments{ \item{data}{A data.frame (or similar) with predicted and observed values. -See the details section of for additional information +See the "Target format" section in Details for additional information on the required input format.} } \value{ diff --git a/tests/testthat/test-filter-scores.R b/tests/testthat/test-filter-scores.R index 95d00f3bb..8486b5e9d 100644 --- a/tests/testthat/test-filter-scores.R +++ b/tests/testthat/test-filter-scores.R @@ -147,14 +147,14 @@ test_that("filter_to_intersection(min_coverage=0.5) works", { expect_equal(nrow(result2), 5) }) -test_that("filter_to_intersection(include) keeps targets", { +test_that("filter_to_include() keeps targets of single model", { scores <- data.table::data.table( model = c("m1", "m1", "m2", "m2", "m3"), location = c("DE", "US", "DE", "FR", "DE"), wis = c(1, 2, 3, 4, 5) ) scores <- new_scores(scores, "wis") - strategy <- filter_to_intersection(include = "m1") + strategy <- filter_to_include("m1") result <- strategy(scores, compare = "model") # m1 covers DE and US, so keep all rows with DE or US expect_true(all(result$location %in% c("DE", "US"))) @@ -162,22 +162,20 @@ test_that("filter_to_intersection(include) keeps targets", { expect_false("FR" %in% result$location) }) -test_that("filter_to_intersection(include=c()) intersects", { +test_that("filter_to_include() intersects multiple models", { scores <- data.table::data.table( model = c("m1", "m1", "m2", "m2", "m3"), location = c("DE", "US", "DE", "FR", "DE"), wis = c(1, 2, 3, 4, 5) ) scores <- new_scores(scores, "wis") - strategy <- filter_to_intersection( - include = c("m1", "m2") - ) + strategy <- filter_to_include(c("m1", "m2")) result <- strategy(scores, compare = "model") # m1 covers DE, US; m2 covers DE, FR; intersection = DE expect_true(all(result$location == "DE")) }) -test_that("filter_to_intersection(include) errors on unknown", { +test_that("filter_to_include() errors on unknown compare value", { scores <- data.table::data.table( model = c("A", "B"), location = c("DE", "DE"), @@ -187,7 +185,7 @@ test_that("filter_to_intersection(include) errors on unknown", { expect_error( filter_scores( scores, - strategy = filter_to_intersection(include = "Z") + strategy = filter_to_include("Z") ), "not found" ) @@ -212,7 +210,7 @@ test_that("filter_scores() works with non-default compare", { # Integration tests with scores_quantile # ============================================================================== test_that( - "filter_to_intersection(include) with real data filters + "filter_to_include() with real data filters to model's targets", { scores <- scores_quantile @@ -231,8 +229,8 @@ test_that( result <- suppressMessages(filter_scores( scores, - strategy = filter_to_intersection( - include = "epiforecasts-EpiNow2" + strategy = filter_to_include( + "epiforecasts-EpiNow2" ) )) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 641d2090e..90bb2718a 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -284,18 +284,16 @@ test_that( "filter_scores not yet available" ) skip_if_not( - exists("filter_to_intersection", + exists("filter_to_include", where = asNamespace("scoringutils")), - "filter_to_intersection not yet available" + "filter_to_include not yet available" ) scores <- scores_quantile ref_model <- "EuroCOVIDhub-baseline" filtered <- suppressMessages(filter_scores( scores, - strategy = filter_to_intersection( - include = ref_model - ) + strategy = filter_to_include(ref_model) )) result <- suppressMessages(impute_missing_scores( filtered, diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd index c1d9733c0..5658119a9 100644 --- a/vignettes/handling-missing-forecasts.Rmd +++ b/vignettes/handling-missing-forecasts.Rmd @@ -103,15 +103,13 @@ Between 0.75 and 1.0 no intermediate threshold changes the result here because n ### Filtering to a specific model's targets -The `include` argument restricts to targets covered by named models. +`filter_to_include()` restricts to targets covered by named models. For example, to evaluate all models only on the targets `epiforecasts-EpiNow2` covered: ```{r filter-include} scores_epinow2 <- filter_scores( scores, - strategy = filter_to_intersection( - include = "epiforecasts-EpiNow2" - ) + strategy = filter_to_include("epiforecasts-EpiNow2") ) summarise_scores(scores_epinow2, by = "model") ``` @@ -195,9 +193,7 @@ For example, to evaluate on the targets `epiforecasts-EpiNow2` covered and then ```{r pipeline} result <- scores |> filter_scores( - strategy = filter_to_intersection( - include = "epiforecasts-EpiNow2" - ) + strategy = filter_to_include("epiforecasts-EpiNow2") ) |> impute_missing_scores( strategy = impute_worst_score() From 5a4beeef9f393dc6e031911c0d4e8ad91c4d8af0 Mon Sep 17 00:00:00 2001 From: seabbs-bot Date: Wed, 22 Apr 2026 15:41:02 +0100 Subject: [PATCH 33/33] refactor(impute-scores): share summary helper and guard all-NA Addresses inline review on PR #1156. - Extract impute_summary_score(fn) so impute_worst_score() and impute_mean_score() are one-liners over max/mean and the loop body lives in a single place. - Guard the all-NA case: target combinations with no non-NA observations now return NA_real_ rather than -Inf from max() or NaN from mean(). Covered by new tests. - Validate the strategy function formals via assert_strategy() in both impute_missing_scores() and filter_scores(), with tests for the error path. - Replace wrap-style nolint blocks with targeted trailing `# nolint:` comments on the specific variables that are only used in cli_inform glue strings; drops the confusing multi-line blocks that Seb flagged. --- R/impute-missing-scores.R | 221 ++++++++++---------- man/impute_mean_score.Rd | 15 +- man/impute_missing_scores.Rd | 45 ++-- man/impute_model_score.Rd | 14 +- man/impute_na_score.Rd | 10 +- man/impute_worst_score.Rd | 15 +- tests/testthat/test-filter-scores.R | 14 ++ tests/testthat/test-impute-missing-scores.R | 55 +++++ 8 files changed, 231 insertions(+), 158 deletions(-) diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R index 93723e0e3..86ec46c0d 100644 --- a/R/impute-missing-scores.R +++ b/R/impute-missing-scores.R @@ -7,37 +7,36 @@ #' on the same set of targets, which avoids bias when #' summarising scores. #' -#' Missing combinations are identified by comparing each -#' element in `compare` against the full set of targets -#' present across all elements. The strategy function then -#' provides the imputed values for the missing metric columns. +#' Missing combinations are identified by comparing each value +#' of the `compare` column against the union of targets observed +#' across all values. The strategy is then called to fill the +#' metric columns for those rows. #' -#' An `.imputed` column is added to the output indicating -#' which rows were imputed (`TRUE`) and which are original -#' (`FALSE`). +#' An `.imputed` column is added to the output indicating which +#' rows were imputed (`TRUE`) and which are original (`FALSE`). #' -#' @param scores An object of class `scores` (a data.table -#' with scores and an additional attribute `metrics` as -#' produced by [score()]). -#' @param strategy A function or factory-created function that -#' fills missing metric values. Built-in options are -#' [impute_worst_score()], [impute_mean_score()], -#' [impute_na_score()], and [impute_model_score()]. -#' The function must accept four arguments: -#' `(scores, missing_rows, metrics, compare)` and return -#' `missing_rows` with metric columns filled. -#' @param compare Character vector of length one with the -#' column name that defines the unit of comparison. -#' Default is `"model"`. +#' @param scores An object of class `scores` (a data.table with +#' an additional `metrics` attribute as produced by [score()]). +#' @param strategy A strategy function with signature +#' `function(scores, missing_rows, metrics, compare)` that +#' returns `missing_rows` with the metric columns filled. +#' Built-in options are [impute_worst_score()], +#' [impute_mean_score()], [impute_na_score()], and +#' [impute_model_score()]. Custom strategies are also +#' supported. +#' @param compare Character string (default `"model"`) naming the +#' column whose values are compared against each target to +#' identify missing combinations. #' -#' @return An object of class `scores` with an additional -#' `.imputed` column. Rows that were imputed have -#' `.imputed = TRUE`. +#' @return A `scores` object with an additional `.imputed` +#' column. Rows that were imputed have `.imputed = TRUE`. #' -#' @seealso \code{vignette("handling-missing-forecasts")} +#' @seealso [impute_worst_score()], [impute_mean_score()], +#' [impute_na_score()], [impute_model_score()], +#' \code{vignette("handling-missing-forecasts")} #' @importFrom data.table copy set rbindlist setattr -#' @importFrom checkmate assert_class assert_function -#' assert_character assert_subset +#' @importFrom checkmate assert_class assert_character +#' assert_subset #' @importFrom cli cli_abort cli_inform #' @export #' @keywords postprocess-scores @@ -60,7 +59,10 @@ impute_missing_scores <- function( metrics <- get_metrics.scores(scores, error = TRUE) assert_character(compare, len = 1) assert_subset(compare, names(scores)) - assert_function(strategy) + assert_strategy( + strategy, + required = c("scores", "missing_rows", "metrics", "compare") + ) scores <- copy(scores) @@ -74,10 +76,8 @@ impute_missing_scores <- function( return(scores[]) } - #nolint start: object_usage_linter - n_missing <- nrow(missing_rows) - n_comparators <- length(unique(missing_rows[[compare]])) - #nolint end + n_missing <- nrow(missing_rows) # nolint: object_usage_linter + n_comparators <- length(unique(missing_rows[[compare]])) # nolint: object_usage_linter, line_length_linter cli_inform(c( i = "Imputing {n_missing} missing score row{?s}.", i = "{n_comparators} {compare} {cli::qty(n_comparators)}value{?s} affected." # nolint: line_length_linter @@ -88,58 +88,54 @@ impute_missing_scores <- function( data.table::set(filled, j = ".imputed", value = TRUE) data.table::set(scores, j = ".imputed", value = FALSE) - out <- rbindlist(list(scores, filled), use.names = TRUE, - fill = TRUE) + out <- rbindlist( + list(scores, filled), + use.names = TRUE, + fill = TRUE + ) - out <- new_scores(out, metrics) - return(out[]) + return(new_scores(out, metrics)) } -#' @title Impute with worst (maximum) observed score -#' -#' @description -#' Creates an imputation strategy that fills each missing -#' metric with the worst (maximum) observed value for that -#' metric within the same target combination across all -#' elements of `compare`. -#' -#' @return A function suitable for use as the `strategy` -#' argument in [impute_missing_scores()]. -#' @export -#' @keywords postprocess-scores -#' @examples -#' \dontshow{ -#' data.table::setDTthreads(2) -#' } -#' scores <- example_quantile |> -#' as_forecast_quantile() |> -#' score() -#' -#' impute_missing_scores(scores, strategy = impute_worst_score()) -impute_worst_score <- function() { +# Shared implementation for simple summary-based imputation +# (e.g. max, mean). `fn` is a summary function applied to each +# metric within the same target combination across all compare +# values. NA values are ignored; target combinations with no +# non-NA observations produce NA rather than -Inf or NaN. +impute_summary_score <- function(fn) { + safe_fn <- function(x) { + x <- x[!is.na(x)] + if (length(x) == 0) { + return(NA_real_) + } + fn(x) + } function(scores, missing_rows, metrics, compare) { fu <- get_forecast_unit(scores) target_cols <- setdiff(fu, compare) for (m in metrics) { if (!(m %in% names(scores))) next - # Compute max per target combination agg <- scores[, - .(..val = max(get(m), na.rm = TRUE)), + .(..val = safe_fn(get(m))), by = target_cols ] - # Merge onto missing_rows missing_rows <- merge( - missing_rows, agg, - by = target_cols, all.x = TRUE + missing_rows, + agg, + by = target_cols, + all.x = TRUE ) data.table::set( - missing_rows, j = m, + missing_rows, + j = m, value = missing_rows[["..val"]] ) data.table::set( - missing_rows, j = "..val", value = NULL + missing_rows, + j = "..val", + value = NULL ) } return(missing_rows) @@ -147,16 +143,44 @@ impute_worst_score <- function() { } +#' @title Impute with worst (maximum) observed score +#' +#' @description +#' Strategy for [impute_missing_scores()] that fills each +#' missing metric with the worst (maximum) observed value for +#' that metric within the same target combination across all +#' values of the `compare` column. Target combinations with no +#' non-NA observations are filled with `NA_real_`. +#' +#' @return A strategy function for [impute_missing_scores()]. +#' @seealso [impute_missing_scores()], [impute_mean_score()] +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores(scores, strategy = impute_worst_score()) +impute_worst_score <- function() { + impute_summary_score(max) +} + + #' @title Impute with mean observed score #' #' @description -#' Creates an imputation strategy that fills each missing -#' metric with the mean observed value for that metric within -#' the same target combination across all elements of -#' `compare`. +#' Strategy for [impute_missing_scores()] that fills each +#' missing metric with the mean observed value for that metric +#' within the same target combination across all values of the +#' `compare` column. Target combinations with no non-NA +#' observations are filled with `NA_real_`. #' -#' @return A function suitable for use as the `strategy` -#' argument in [impute_missing_scores()]. +#' @return A strategy function for [impute_missing_scores()]. +#' @seealso [impute_missing_scores()], [impute_worst_score()] #' @export #' @keywords postprocess-scores #' @examples @@ -169,41 +193,18 @@ impute_worst_score <- function() { #' #' impute_missing_scores(scores, strategy = impute_mean_score()) impute_mean_score <- function() { - function(scores, missing_rows, metrics, compare) { - fu <- get_forecast_unit(scores) - target_cols <- setdiff(fu, compare) - - for (m in metrics) { - if (!(m %in% names(scores))) next - agg <- scores[, - .(..val = mean(get(m), na.rm = TRUE)), - by = target_cols - ] - missing_rows <- merge( - missing_rows, agg, - by = target_cols, all.x = TRUE - ) - data.table::set( - missing_rows, j = m, - value = missing_rows[["..val"]] - ) - data.table::set( - missing_rows, j = "..val", value = NULL - ) - } - return(missing_rows) - } + impute_summary_score(mean) } #' @title Impute with NA values #' #' @description -#' Creates an imputation strategy that fills each missing -#' metric with `NA_real_`. +#' Strategy for [impute_missing_scores()] that fills each +#' missing metric with `NA_real_`. #' -#' @return A function suitable for use as the `strategy` -#' argument in [impute_missing_scores()]. +#' @return A strategy function for [impute_missing_scores()]. +#' @seealso [impute_missing_scores()] #' @export #' @keywords postprocess-scores #' @examples @@ -228,18 +229,17 @@ impute_na_score <- function() { #' @title Impute with a reference model's scores #' #' @description -#' Creates an imputation strategy that fills missing scores -#' with the actual scores from a specified reference model -#' for each target combination. +#' Strategy for [impute_missing_scores()] that fills missing +#' scores with the scores of a specified reference model for +#' the same target combination. #' #' @param model Character string naming the reference model #' whose scores should be used for imputation. The reference #' model must have scores for all target combinations that -#' need imputing. -#' -#' @return A function suitable for use as the `strategy` -#' argument in [impute_missing_scores()]. +#' need imputing; otherwise an error is raised. #' +#' @return A strategy function for [impute_missing_scores()]. +#' @seealso [impute_missing_scores()] #' @importFrom cli cli_abort #' @export #' @keywords postprocess-scores @@ -257,8 +257,8 @@ impute_na_score <- function() { #' ) impute_model_score <- function(model) { assert_character(model, len = 1) - # Store in a different name to avoid collision with - # the "model" column in data.table expressions + # Store under a different name to avoid collision with + # the "model" column in data.table expressions below. ref_model_name <- model function(scores, missing_rows, metrics, compare) { fu <- get_forecast_unit(scores) @@ -277,17 +277,13 @@ impute_model_score <- function(model) { )) } - # Check that the reference model has scores for all - # needed target combinations needed <- unique( missing_rows[, target_cols, with = FALSE] ) available <- unique( ref[, target_cols, with = FALSE] ) - missing_targets <- needed[!available, - on = target_cols - ] + missing_targets <- needed[!available, on = target_cols] if (nrow(missing_targets) > 0) { cli_abort(c( "!" = paste0( @@ -299,7 +295,6 @@ impute_model_score <- function(model) { )) } - # Merge reference model scores onto missing rows ref_scores <- ref[, c( target_cols, diff --git a/man/impute_mean_score.Rd b/man/impute_mean_score.Rd index 1caf41367..55b32441f 100644 --- a/man/impute_mean_score.Rd +++ b/man/impute_mean_score.Rd @@ -7,14 +7,14 @@ impute_mean_score() } \value{ -A function suitable for use as the \code{strategy} -argument in \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}. } \description{ -Creates an imputation strategy that fills each missing -metric with the mean observed value for that metric within -the same target combination across all elements of -\code{compare}. +Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each +missing metric with the mean observed value for that metric +within the same target combination across all values of the +\code{compare} column. Target combinations with no non-NA +observations are filled with \code{NA_real_}. } \examples{ \dontshow{ @@ -26,4 +26,7 @@ scores <- example_quantile |> impute_missing_scores(scores, strategy = impute_mean_score()) } +\seealso{ +\code{\link[=impute_missing_scores]{impute_missing_scores()}}, \code{\link[=impute_worst_score]{impute_worst_score()}} +} \keyword{postprocess-scores} diff --git a/man/impute_missing_scores.Rd b/man/impute_missing_scores.Rd index 5c63dc0a5..16c6902f4 100644 --- a/man/impute_missing_scores.Rd +++ b/man/impute_missing_scores.Rd @@ -7,26 +7,24 @@ impute_missing_scores(scores, strategy, compare = "model") } \arguments{ -\item{scores}{An object of class \code{scores} (a data.table -with scores and an additional attribute \code{metrics} as -produced by \code{\link[=score]{score()}}).} +\item{scores}{An object of class \code{scores} (a data.table with +an additional \code{metrics} attribute as produced by \code{\link[=score]{score()}}).} -\item{strategy}{A function or factory-created function that -fills missing metric values. Built-in options are -\code{\link[=impute_worst_score]{impute_worst_score()}}, \code{\link[=impute_mean_score]{impute_mean_score()}}, -\code{\link[=impute_na_score]{impute_na_score()}}, and \code{\link[=impute_model_score]{impute_model_score()}}. -The function must accept four arguments: -\verb{(scores, missing_rows, metrics, compare)} and return -\code{missing_rows} with metric columns filled.} +\item{strategy}{A strategy function with signature +\verb{function(scores, missing_rows, metrics, compare)} that +returns \code{missing_rows} with the metric columns filled. +Built-in options are \code{\link[=impute_worst_score]{impute_worst_score()}}, +\code{\link[=impute_mean_score]{impute_mean_score()}}, \code{\link[=impute_na_score]{impute_na_score()}}, and +\code{\link[=impute_model_score]{impute_model_score()}}. Custom strategies are also +supported.} -\item{compare}{Character vector of length one with the -column name that defines the unit of comparison. -Default is \code{"model"}.} +\item{compare}{Character string (default \code{"model"}) naming the +column whose values are compared against each target to +identify missing combinations.} } \value{ -An object of class \code{scores} with an additional -\code{.imputed} column. Rows that were imputed have -\code{.imputed = TRUE}. +A \code{scores} object with an additional \code{.imputed} +column. Rows that were imputed have \code{.imputed = TRUE}. } \description{ Fills in scores for forecast-target combinations that are @@ -35,14 +33,13 @@ strategy. This is useful to ensure all models are evaluated on the same set of targets, which avoids bias when summarising scores. -Missing combinations are identified by comparing each -element in \code{compare} against the full set of targets -present across all elements. The strategy function then -provides the imputed values for the missing metric columns. +Missing combinations are identified by comparing each value +of the \code{compare} column against the union of targets observed +across all values. The strategy is then called to fill the +metric columns for those rows. -An \code{.imputed} column is added to the output indicating -which rows were imputed (\code{TRUE}) and which are original -(\code{FALSE}). +An \code{.imputed} column is added to the output indicating which +rows were imputed (\code{TRUE}) and which are original (\code{FALSE}). } \examples{ \dontshow{ @@ -56,6 +53,8 @@ scores <- example_quantile |> impute_missing_scores(scores, strategy = impute_na_score()) } \seealso{ +\code{\link[=impute_worst_score]{impute_worst_score()}}, \code{\link[=impute_mean_score]{impute_mean_score()}}, +\code{\link[=impute_na_score]{impute_na_score()}}, \code{\link[=impute_model_score]{impute_model_score()}}, \code{vignette("handling-missing-forecasts")} } \keyword{postprocess-scores} diff --git a/man/impute_model_score.Rd b/man/impute_model_score.Rd index fa027a11c..ca44b1737 100644 --- a/man/impute_model_score.Rd +++ b/man/impute_model_score.Rd @@ -10,16 +10,15 @@ impute_model_score(model) \item{model}{Character string naming the reference model whose scores should be used for imputation. The reference model must have scores for all target combinations that -need imputing.} +need imputing; otherwise an error is raised.} } \value{ -A function suitable for use as the \code{strategy} -argument in \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}. } \description{ -Creates an imputation strategy that fills missing scores -with the actual scores from a specified reference model -for each target combination. +Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills missing +scores with the scores of a specified reference model for +the same target combination. } \examples{ \dontshow{ @@ -34,4 +33,7 @@ impute_missing_scores( strategy = impute_model_score("EuroCOVIDhub-baseline") ) } +\seealso{ +\code{\link[=impute_missing_scores]{impute_missing_scores()}} +} \keyword{postprocess-scores} diff --git a/man/impute_na_score.Rd b/man/impute_na_score.Rd index c1a812cd4..ad676e703 100644 --- a/man/impute_na_score.Rd +++ b/man/impute_na_score.Rd @@ -7,12 +7,11 @@ impute_na_score() } \value{ -A function suitable for use as the \code{strategy} -argument in \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}. } \description{ -Creates an imputation strategy that fills each missing -metric with \code{NA_real_}. +Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each +missing metric with \code{NA_real_}. } \examples{ \dontshow{ @@ -24,4 +23,7 @@ scores <- example_quantile |> impute_missing_scores(scores, strategy = impute_na_score()) } +\seealso{ +\code{\link[=impute_missing_scores]{impute_missing_scores()}} +} \keyword{postprocess-scores} diff --git a/man/impute_worst_score.Rd b/man/impute_worst_score.Rd index e6528caaf..0868cf041 100644 --- a/man/impute_worst_score.Rd +++ b/man/impute_worst_score.Rd @@ -7,14 +7,14 @@ impute_worst_score() } \value{ -A function suitable for use as the \code{strategy} -argument in \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}. } \description{ -Creates an imputation strategy that fills each missing -metric with the worst (maximum) observed value for that -metric within the same target combination across all -elements of \code{compare}. +Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each +missing metric with the worst (maximum) observed value for +that metric within the same target combination across all +values of the \code{compare} column. Target combinations with no +non-NA observations are filled with \code{NA_real_}. } \examples{ \dontshow{ @@ -26,4 +26,7 @@ scores <- example_quantile |> impute_missing_scores(scores, strategy = impute_worst_score()) } +\seealso{ +\code{\link[=impute_missing_scores]{impute_missing_scores()}}, \code{\link[=impute_mean_score]{impute_mean_score()}} +} \keyword{postprocess-scores} diff --git a/tests/testthat/test-filter-scores.R b/tests/testthat/test-filter-scores.R index 8486b5e9d..54015a291 100644 --- a/tests/testthat/test-filter-scores.R +++ b/tests/testthat/test-filter-scores.R @@ -111,6 +111,20 @@ test_that("filter_scores() reports rows dropped", { ) }) +test_that("filter_scores() errors on strategy with wrong formals", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + bad_strategy <- function(scores) scores + expect_error( + filter_scores(scores, strategy = bad_strategy), + "missing required" + ) +}) + test_that("filter_scores() errors on invalid compare column", { scores <- data.table::data.table( model = c("A", "A", "B"), diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R index 90bb2718a..c0327bc6d 100644 --- a/tests/testthat/test-impute-missing-scores.R +++ b/tests/testthat/test-impute-missing-scores.R @@ -364,6 +364,61 @@ test_that( } ) +test_that( + "impute_worst_score returns NA when all scores for a target are NA", + { + scores <- data.table::data.table( + model = c("A", "A", "B", "C"), + location = c("DE", "US", "DE", "DE"), + wis = c(NA_real_, 2, NA_real_, NA_real_) + ) + scores <- new_scores(scores, "wis") + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_worst_score() + )) + # B and C are missing US; all DE wis are NA so the + # imputed DE rows for any model should be NA, not -Inf. + imputed <- result[(.imputed) & location == "DE"] + expect_true(all(is.na(imputed$wis))) + expect_false(any(is.infinite(imputed$wis))) + } +) + +test_that( + "impute_mean_score returns NA when all scores for a target are NA", + { + scores <- data.table::data.table( + model = c("A", "A", "B", "C"), + location = c("DE", "US", "DE", "DE"), + wis = c(NA_real_, 2, NA_real_, NA_real_) + ) + scores <- new_scores(scores, "wis") + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_mean_score() + )) + imputed <- result[(.imputed) & location == "DE"] + expect_true(all(is.na(imputed$wis))) + expect_false(any(is.nan(imputed$wis))) + } +) + +test_that( + "impute_missing_scores errors on strategy with wrong formals", + { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + bad_strategy <- function(scores, missing_rows) missing_rows + expect_error( + impute_missing_scores(scores, strategy = bad_strategy), + "missing required" + ) + } +) + test_that( "impute_missing_scores works with non-default compare", {