diff --git a/NAMESPACE b/NAMESPACE index 93237a640..8dea0e210 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -78,6 +78,9 @@ export(dispersion_quantile) export(dispersion_sample) export(dss_sample) export(energy_score_multivariate) +export(filter_scores) +export(filter_to_include) +export(filter_to_intersection) export(get_correlations) export(get_coverage) export(get_duplicate_forecasts) @@ -88,6 +91,11 @@ export(get_grouping) export(get_metrics) export(get_pairwise_comparisons) export(get_pit_histogram) +export(impute_mean_score) +export(impute_missing_scores) +export(impute_model_score) +export(impute_na_score) +export(impute_worst_score) export(interval_coverage) export(is_forecast) export(is_forecast_binary) @@ -178,12 +186,14 @@ importFrom(data.table,key) importFrom(data.table,melt) importFrom(data.table,nafill) importFrom(data.table,rbindlist) +importFrom(data.table,set) importFrom(data.table,setDT) importFrom(data.table,setattr) importFrom(data.table,setcolorder) importFrom(data.table,setkeyv) importFrom(data.table,setnames) importFrom(data.table,setorderv) +importFrom(data.table,uniqueN) importFrom(ggplot2,.data) importFrom(ggplot2,`%+replace%`) importFrom(ggplot2,aes) diff --git a/NEWS.md b/NEWS.md index 52d75f99a..e5eced91f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # scoringutils (development version) +- Added `filter_scores()` and `impute_missing_scores()` for handling missing forecasts before summarisation. `filter_scores()` removes target combinations with insufficient model coverage, while `impute_missing_scores()` fills in missing scores using configurable strategies (worst, mean, NA, or reference model). Both use a strategy function pattern for extensibility. See `vignette("handling-missing-forecasts")` for details (#1122). - Added internal S3 generic `get_forecast_type_ids()` so each forecast type declares the columns (beyond the forecast unit) that identify a unique row. `get_duplicate_forecasts()` now uses this instead of hard-coded column names (#888). - Removed the deprecated vignettes `Deprecated-functions` and `Deprecated-visualisations`. The code for removed functions (`plot_predictions()`, `make_NA()`, `plot_ranges()`, `plot_score_table()`, `merge_pred_and_obs()`) can still be found in the [git history](https://github.com/epiforecasts/scoringutils/tree/d0cd8e2/vignettes) (#1158). diff --git a/R/filter-scores.R b/R/filter-scores.R new file mode 100644 index 000000000..eb4c0daec --- /dev/null +++ b/R/filter-scores.R @@ -0,0 +1,228 @@ +#' @title Filter scores +#' +#' @description +#' Filter a `scores` object using a supplied strategy function. +#' `filter_scores()` is responsible for preserving the `scores` +#' class and the `metrics` attribute; the strategy is +#' responsible only for the filtering logic. +#' +#' Strategies are constructed by helpers such as +#' [filter_to_intersection()] and [filter_to_include()] and can +#' also be user-defined. A strategy is a function with +#' signature `function(scores, compare)` that returns a +#' filtered data.table with the same columns as its input. +#' +#' @param scores An object of class `scores` (a data.table with +#' an additional `metrics` attribute as produced by [score()]). +#' @param strategy A strategy function. See Description for the +#' expected signature. Default: [filter_to_intersection()]. +#' @param compare Character string (default `"model"`) naming the +#' column whose values are compared when deciding which +#' target combinations to keep. +#' +#' @return A `scores` object with the same class and `metrics` +#' attribute as the input, with rows filtered according to +#' `strategy`. +#' +#' @seealso [filter_to_intersection()], [filter_to_include()], +#' \code{vignette("handling-missing-forecasts")} +#' @importFrom cli cli_inform +#' @importFrom checkmate assert_class assert_character +#' assert_subset +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' # Keep only targets covered by every model (the default) +#' filter_scores(scores) +#' +#' # Keep targets covered by at least 75% of models +#' filter_scores( +#' scores, +#' strategy = filter_to_intersection(min_coverage = 0.75) +#' ) +#' +#' # Keep only targets covered by a named model +#' filter_scores( +#' scores, +#' strategy = filter_to_include("EuroCOVIDhub-baseline") +#' ) +filter_scores <- function( + scores, + strategy = filter_to_intersection(), + compare = "model" +) { + assert_class(scores, "scores") + assert_character(compare, len = 1) + assert_subset(compare, names(scores)) + assert_strategy(strategy, required = "compare") + + original_metrics <- attr(scores, "metrics") + + result <- strategy(scores, compare = compare) + + n_before <- nrow(scores) + n_after <- nrow(result) + n_dropped <- n_before - n_after + + if (n_dropped == 0) { + cli_inform(c( + i = "No rows filtered. Returning scores unchanged." + )) + return(scores) + } + + cli_inform(c( + i = "Filtered out {n_dropped} rows.", + i = "{n_after} of {n_before} rows remaining." # nolint: duplicate_argument_linter + )) + + return(new_scores(result, original_metrics)) +} + + +#' @title Filter to target combinations meeting a coverage threshold +#' +#' @description +#' Strategy for [filter_scores()] that keeps target combinations +#' covered by at least `min_coverage` of the values in the +#' `compare` column. With the default `min_coverage = 1`, only +#' target combinations present for every compare value are kept +#' (strict intersection across the full set). +#' +#' To restrict to the targets covered by a named subset of +#' compare values instead of by a proportion, use +#' [filter_to_include()]. +#' +#' @param min_coverage Numeric between 0 and 1 (default `1`). +#' Minimum proportion of compare values that must cover a +#' target combination for it to be kept. +#' +#' @return A strategy function for [filter_scores()]. Intended +#' to be passed to `filter_scores()` rather than called +#' directly — `filter_scores()` is where the `scores` class +#' and `metrics` attribute are preserved. +#' +#' @seealso [filter_scores()], [filter_to_include()] +#' @importFrom data.table as.data.table setkeyv uniqueN +#' @importFrom checkmate assert_number +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' filter_scores( +#' scores, +#' strategy = filter_to_intersection(min_coverage = 0.75) +#' ) +filter_to_intersection <- function(min_coverage = 1) { + assert_number(min_coverage, lower = 0, upper = 1) + + function(scores, compare = "model") { + scores <- data.table::as.data.table(scores) + forecast_unit <- get_forecast_unit(scores) + target_cols <- setdiff(forecast_unit, compare) + + n_total <- data.table::uniqueN(scores[[compare]]) + + target_coverage <- scores[, + .(n_compare = data.table::uniqueN(get(compare))), + by = target_cols + ] + + keep <- target_coverage$n_compare / n_total >= min_coverage + qualifying <- target_coverage[keep, target_cols, with = FALSE] + + data.table::setkeyv(scores, target_cols) + data.table::setkeyv(qualifying, target_cols) + scores[qualifying, nomatch = NULL] + } +} + + +#' @title Filter to targets covered by named compare values +#' +#' @description +#' Strategy for [filter_scores()] that restricts the kept +#' target combinations to those covered by every value listed +#' in `include`. With a single value this keeps only that +#' value's targets; with several values, the intersection of +#' their target sets is kept. +#' +#' To use a proportion-based threshold over all compare values +#' instead, use [filter_to_intersection()]. +#' +#' @param include Character vector of length one or more. Values +#' from the `compare` column whose target sets should be +#' intersected. +#' +#' @return A strategy function for [filter_scores()]. Intended +#' to be passed to `filter_scores()` rather than called +#' directly — `filter_scores()` is where the `scores` class +#' and `metrics` attribute are preserved. +#' +#' @seealso [filter_scores()], [filter_to_intersection()] +#' @importFrom data.table as.data.table setkeyv +#' @importFrom checkmate assert_character +#' @importFrom cli cli_abort +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' filter_scores( +#' scores, +#' strategy = filter_to_include("EuroCOVIDhub-baseline") +#' ) +filter_to_include <- function(include) { + assert_character(include, min.len = 1) + + function(scores, compare = "model") { + scores <- data.table::as.data.table(scores) + forecast_unit <- get_forecast_unit(scores) + target_cols <- setdiff(forecast_unit, compare) + + unknown <- setdiff(include, unique(scores[[compare]])) + if (length(unknown) > 0) { + cli_abort(c( + "!" = paste0( + "{.val {unknown}} not found in ", + "{.arg {compare}} column." + ) + )) + } + + target_sets <- lapply(include, function(v) { + unique( + scores[ + scores[[compare]] == v, + target_cols, + with = FALSE + ] + ) + }) + + qualifying <- Reduce( + function(a, b) merge(a, b, by = target_cols), + target_sets + ) + + data.table::setkeyv(scores, target_cols) + data.table::setkeyv(qualifying, target_cols) + scores[qualifying, nomatch = NULL] + } +} diff --git a/R/get-protected-columns.R b/R/get-protected-columns.R index 0cf59bb39..839d24861 100644 --- a/R/get-protected-columns.R +++ b/R/get-protected-columns.R @@ -15,6 +15,7 @@ get_protected_columns <- function(data = NULL) { protected_columns <- c( ".mv_group_id", + ".imputed", "predicted", "observed", "sample_id", "quantile_level", "upper", "lower", "pit_value", "interval_range", "boundary", "predicted_label", "interval_coverage", "interval_coverage_deviation", diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R new file mode 100644 index 000000000..86ec46c0d --- /dev/null +++ b/R/impute-missing-scores.R @@ -0,0 +1,316 @@ +#' @title Impute missing scores +#' +#' @description +#' Fills in scores for forecast-target combinations that are +#' missing from the data, using a user-specified imputation +#' strategy. This is useful to ensure all models are evaluated +#' on the same set of targets, which avoids bias when +#' summarising scores. +#' +#' Missing combinations are identified by comparing each value +#' of the `compare` column against the union of targets observed +#' across all values. The strategy is then called to fill the +#' metric columns for those rows. +#' +#' An `.imputed` column is added to the output indicating which +#' rows were imputed (`TRUE`) and which are original (`FALSE`). +#' +#' @param scores An object of class `scores` (a data.table with +#' an additional `metrics` attribute as produced by [score()]). +#' @param strategy A strategy function with signature +#' `function(scores, missing_rows, metrics, compare)` that +#' returns `missing_rows` with the metric columns filled. +#' Built-in options are [impute_worst_score()], +#' [impute_mean_score()], [impute_na_score()], and +#' [impute_model_score()]. Custom strategies are also +#' supported. +#' @param compare Character string (default `"model"`) naming the +#' column whose values are compared against each target to +#' identify missing combinations. +#' +#' @return A `scores` object with an additional `.imputed` +#' column. Rows that were imputed have `.imputed = TRUE`. +#' +#' @seealso [impute_worst_score()], [impute_mean_score()], +#' [impute_na_score()], [impute_model_score()], +#' \code{vignette("handling-missing-forecasts")} +#' @importFrom data.table copy set rbindlist setattr +#' @importFrom checkmate assert_class assert_character +#' assert_subset +#' @importFrom cli cli_abort cli_inform +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' # Impute with NA values +#' impute_missing_scores(scores, strategy = impute_na_score()) +impute_missing_scores <- function( + scores, + strategy, + compare = "model" +) { + assert_class(scores, "scores") + metrics <- get_metrics.scores(scores, error = TRUE) + assert_character(compare, len = 1) + assert_subset(compare, names(scores)) + assert_strategy( + strategy, + required = c("scores", "missing_rows", "metrics", "compare") + ) + + scores <- copy(scores) + + missing_rows <- build_missing_grid(scores, compare) # nolint: object_usage_linter + + if (nrow(missing_rows) == 0) { + cli_inform(c( + i = "No missing scores to impute. Returning scores unchanged." + )) + data.table::set(scores, j = ".imputed", value = FALSE) + return(scores[]) + } + + n_missing <- nrow(missing_rows) # nolint: object_usage_linter + n_comparators <- length(unique(missing_rows[[compare]])) # nolint: object_usage_linter, line_length_linter + cli_inform(c( + i = "Imputing {n_missing} missing score row{?s}.", + i = "{n_comparators} {compare} {cli::qty(n_comparators)}value{?s} affected." # nolint: line_length_linter + )) + + filled <- strategy(scores, missing_rows, metrics, compare) + + data.table::set(filled, j = ".imputed", value = TRUE) + data.table::set(scores, j = ".imputed", value = FALSE) + + out <- rbindlist( + list(scores, filled), + use.names = TRUE, + fill = TRUE + ) + + return(new_scores(out, metrics)) +} + + +# Shared implementation for simple summary-based imputation +# (e.g. max, mean). `fn` is a summary function applied to each +# metric within the same target combination across all compare +# values. NA values are ignored; target combinations with no +# non-NA observations produce NA rather than -Inf or NaN. +impute_summary_score <- function(fn) { + safe_fn <- function(x) { + x <- x[!is.na(x)] + if (length(x) == 0) { + return(NA_real_) + } + fn(x) + } + function(scores, missing_rows, metrics, compare) { + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, compare) + + for (m in metrics) { + if (!(m %in% names(scores))) next + agg <- scores[, + .(..val = safe_fn(get(m))), + by = target_cols + ] + missing_rows <- merge( + missing_rows, + agg, + by = target_cols, + all.x = TRUE + ) + data.table::set( + missing_rows, + j = m, + value = missing_rows[["..val"]] + ) + data.table::set( + missing_rows, + j = "..val", + value = NULL + ) + } + return(missing_rows) + } +} + + +#' @title Impute with worst (maximum) observed score +#' +#' @description +#' Strategy for [impute_missing_scores()] that fills each +#' missing metric with the worst (maximum) observed value for +#' that metric within the same target combination across all +#' values of the `compare` column. Target combinations with no +#' non-NA observations are filled with `NA_real_`. +#' +#' @return A strategy function for [impute_missing_scores()]. +#' @seealso [impute_missing_scores()], [impute_mean_score()] +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores(scores, strategy = impute_worst_score()) +impute_worst_score <- function() { + impute_summary_score(max) +} + + +#' @title Impute with mean observed score +#' +#' @description +#' Strategy for [impute_missing_scores()] that fills each +#' missing metric with the mean observed value for that metric +#' within the same target combination across all values of the +#' `compare` column. Target combinations with no non-NA +#' observations are filled with `NA_real_`. +#' +#' @return A strategy function for [impute_missing_scores()]. +#' @seealso [impute_missing_scores()], [impute_worst_score()] +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores(scores, strategy = impute_mean_score()) +impute_mean_score <- function() { + impute_summary_score(mean) +} + + +#' @title Impute with NA values +#' +#' @description +#' Strategy for [impute_missing_scores()] that fills each +#' missing metric with `NA_real_`. +#' +#' @return A strategy function for [impute_missing_scores()]. +#' @seealso [impute_missing_scores()] +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores(scores, strategy = impute_na_score()) +impute_na_score <- function() { + function(scores, missing_rows, metrics, compare) { + for (m in metrics) { + data.table::set(missing_rows, j = m, value = NA_real_) + } + return(missing_rows) + } +} + + +#' @title Impute with a reference model's scores +#' +#' @description +#' Strategy for [impute_missing_scores()] that fills missing +#' scores with the scores of a specified reference model for +#' the same target combination. +#' +#' @param model Character string naming the reference model +#' whose scores should be used for imputation. The reference +#' model must have scores for all target combinations that +#' need imputing; otherwise an error is raised. +#' +#' @return A strategy function for [impute_missing_scores()]. +#' @seealso [impute_missing_scores()] +#' @importFrom cli cli_abort +#' @export +#' @keywords postprocess-scores +#' @examples +#' \dontshow{ +#' data.table::setDTthreads(2) +#' } +#' scores <- example_quantile |> +#' as_forecast_quantile() |> +#' score() +#' +#' impute_missing_scores( +#' scores, +#' strategy = impute_model_score("EuroCOVIDhub-baseline") +#' ) +impute_model_score <- function(model) { + assert_character(model, len = 1) + # Store under a different name to avoid collision with + # the "model" column in data.table expressions below. + ref_model_name <- model + function(scores, missing_rows, metrics, compare) { + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, compare) + + ref <- scores[ + get(compare) == ref_model_name + ] + + if (nrow(ref) == 0) { + cli_abort(c( + "!" = paste0( + "Reference model {.val {ref_model_name}} ", + "not found in scores." + ) + )) + } + + needed <- unique( + missing_rows[, target_cols, with = FALSE] + ) + available <- unique( + ref[, target_cols, with = FALSE] + ) + missing_targets <- needed[!available, on = target_cols] + if (nrow(missing_targets) > 0) { + cli_abort(c( + "!" = paste0( + "Reference model {.val {ref_model_name}} ", + "is missing scores for ", + "{nrow(missing_targets)} target ", + "combination{?s} that need imputing." + ) + )) + } + + ref_scores <- ref[, + c( + target_cols, + metrics[metrics %in% names(ref)] + ), + with = FALSE + ] + missing_rows <- merge( + missing_rows[, + setdiff(names(missing_rows), metrics), + with = FALSE + ], + ref_scores, + by = target_cols, + all.x = TRUE + ) + return(missing_rows) + } +} diff --git a/R/missing-scores-internal.R b/R/missing-scores-internal.R new file mode 100644 index 000000000..ab54e1210 --- /dev/null +++ b/R/missing-scores-internal.R @@ -0,0 +1,88 @@ +#' Assert that a strategy has the expected signature +#' +#' @description +#' Internal helper used by [filter_scores()] and +#' [impute_missing_scores()] to check that a user-supplied +#' strategy function has at least the required named formals. +#' This catches common mistakes early (e.g. forgetting the +#' `compare` argument) without constraining the strategy +#' author to a specific internal type. +#' +#' @param strategy A function. +#' @param required Character vector of formal names that +#' `strategy` must accept. +#' +#' @return `invisible(NULL)`. Called for its side effect of +#' erroring when the check fails. +#' +#' @importFrom checkmate assert_function +#' @importFrom cli cli_abort +#' @keywords internal +assert_strategy <- function(strategy, required) { + assert_function(strategy) + strategy_formals <- names(formals(strategy)) + missing_args <- setdiff(required, strategy_formals) + if (length(missing_args) > 0) { + cli_abort(c( + "!" = paste0( + "Strategy function is missing required ", + "argument{?s}: {.arg {missing_args}}." + ), + i = paste0( + "Expected formals including: ", + "{.arg {required}}." + ) + )) + } + return(invisible(NULL)) +} + + +#' Build grid of missing model-target combinations +#' +#' @description +#' Internal function that detects missing model x target +#' combinations by comparing observed data against the complete +#' grid of all compare-column values crossed with all observed +#' target combinations. +#' +#' @param scores A `scores` object (data.table with a `metrics` +#' attribute as produced by [score()]). +#' @param compare Character string (default `"model"`) naming the +#' column whose values are compared against each target. +#' +#' @return A `data.table` with forecast-unit columns for each +#' missing combination. Zero rows if nothing is missing. +#' +#' @importFrom data.table as.data.table setkeyv rbindlist copy +#' @keywords internal +build_missing_grid <- function(scores, compare = "model") { + scores <- data.table::as.data.table(scores) + forecast_unit <- get_forecast_unit(scores) + target_cols <- setdiff(forecast_unit, compare) + + targets <- unique(scores[, target_cols, with = FALSE]) + + # All unique compare values + compare_vals <- unique(scores[[compare]]) + + # Complete grid: every compare value x every observed target + complete <- data.table::rbindlist(lapply( + compare_vals, + function(val) { + out <- data.table::copy(targets) + out[, (compare) := val] + out + } + )) + + # Observed combinations (forecast unit cols only) + observed <- unique(scores[, forecast_unit, with = FALSE]) + + # Anti-join: rows in complete but not in observed + data.table::setkeyv(complete, forecast_unit) + data.table::setkeyv(observed, forecast_unit) + missing <- complete[!observed] + + return(missing) +} diff --git a/R/z-globalVariables.R b/R/z-globalVariables.R index 881a69a19..e9f2ffeca 100644 --- a/R/z-globalVariables.R +++ b/R/z-globalVariables.R @@ -11,6 +11,7 @@ globalVariables(c( ".BY", ".GRP", ".SD", + ".imputed", ".mv_group_id", ".scoringutils_N", ".scoringutils_count", @@ -51,6 +52,7 @@ globalVariables(c( "metrics", "metrics_select", "model", + "n_models", "n_obs", "n_obs wis_component_name", "observed", diff --git a/_pkgdown.yml b/_pkgdown.yml index 34853a432..91c4f6038 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -50,6 +50,9 @@ reference: - title: Handling metrics/scoring functions contents: - has_keyword("handle-metrics") + - title: Post-process scores + contents: + - has_keyword("postprocess-scores") - title: Evaluate forecasts contents: - has_keyword("scoring") diff --git a/man/assert_strategy.Rd b/man/assert_strategy.Rd new file mode 100644 index 000000000..389ff57b9 --- /dev/null +++ b/man/assert_strategy.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/missing-scores-internal.R +\name{assert_strategy} +\alias{assert_strategy} +\title{Assert that a strategy has the expected signature} +\usage{ +assert_strategy(strategy, required) +} +\arguments{ +\item{strategy}{A function.} + +\item{required}{Character vector of formal names that +\code{strategy} must accept.} +} +\value{ +\code{invisible(NULL)}. Called for its side effect of +erroring when the check fails. +} +\description{ +Internal helper used by \code{\link[=filter_scores]{filter_scores()}} and +\code{\link[=impute_missing_scores]{impute_missing_scores()}} to check that a user-supplied +strategy function has at least the required named formals. +This catches common mistakes early (e.g. forgetting the +\code{compare} argument) without constraining the strategy +author to a specific internal type. +} +\keyword{internal} diff --git a/man/build_missing_grid.Rd b/man/build_missing_grid.Rd new file mode 100644 index 000000000..f0e770800 --- /dev/null +++ b/man/build_missing_grid.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/missing-scores-internal.R +\name{build_missing_grid} +\alias{build_missing_grid} +\title{Build grid of missing model-target combinations} +\usage{ +build_missing_grid(scores, compare = "model") +} +\arguments{ +\item{scores}{A \code{scores} object (data.table with a \code{metrics} +attribute as produced by \code{\link[=score]{score()}}).} + +\item{compare}{Character string (default \code{"model"}) naming the +column whose values are compared against each target.} +} +\value{ +A \code{data.table} with forecast-unit columns for each +missing combination. Zero rows if nothing is missing. +} +\description{ +Internal function that detects missing model x target +combinations by comparing observed data against the complete +grid of all compare-column values crossed with all observed +target combinations. +} +\keyword{internal} diff --git a/man/filter_scores.Rd b/man/filter_scores.Rd new file mode 100644 index 000000000..76b32b474 --- /dev/null +++ b/man/filter_scores.Rd @@ -0,0 +1,64 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter-scores.R +\name{filter_scores} +\alias{filter_scores} +\title{Filter scores} +\usage{ +filter_scores(scores, strategy = filter_to_intersection(), compare = "model") +} +\arguments{ +\item{scores}{An object of class \code{scores} (a data.table with +an additional \code{metrics} attribute as produced by \code{\link[=score]{score()}}).} + +\item{strategy}{A strategy function. See Description for the +expected signature. Default: \code{\link[=filter_to_intersection]{filter_to_intersection()}}.} + +\item{compare}{Character string (default \code{"model"}) naming the +column whose values are compared when deciding which +target combinations to keep.} +} +\value{ +A \code{scores} object with the same class and \code{metrics} +attribute as the input, with rows filtered according to +\code{strategy}. +} +\description{ +Filter a \code{scores} object using a supplied strategy function. +\code{filter_scores()} is responsible for preserving the \code{scores} +class and the \code{metrics} attribute; the strategy is +responsible only for the filtering logic. + +Strategies are constructed by helpers such as +\code{\link[=filter_to_intersection]{filter_to_intersection()}} and \code{\link[=filter_to_include]{filter_to_include()}} and can +also be user-defined. A strategy is a function with +signature \verb{function(scores, compare)} that returns a +filtered data.table with the same columns as its input. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +# Keep only targets covered by every model (the default) +filter_scores(scores) + +# Keep targets covered by at least 75\% of models +filter_scores( + scores, + strategy = filter_to_intersection(min_coverage = 0.75) +) + +# Keep only targets covered by a named model +filter_scores( + scores, + strategy = filter_to_include("EuroCOVIDhub-baseline") +) +} +\seealso{ +\code{\link[=filter_to_intersection]{filter_to_intersection()}}, \code{\link[=filter_to_include]{filter_to_include()}}, +\code{vignette("handling-missing-forecasts")} +} +\keyword{postprocess-scores} diff --git a/man/filter_to_include.Rd b/man/filter_to_include.Rd new file mode 100644 index 000000000..983129d9a --- /dev/null +++ b/man/filter_to_include.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter-scores.R +\name{filter_to_include} +\alias{filter_to_include} +\title{Filter to targets covered by named compare values} +\usage{ +filter_to_include(include) +} +\arguments{ +\item{include}{Character vector of length one or more. Values +from the \code{compare} column whose target sets should be +intersected.} +} +\value{ +A strategy function for \code{\link[=filter_scores]{filter_scores()}}. Intended +to be passed to \code{filter_scores()} rather than called +directly — \code{filter_scores()} is where the \code{scores} class +and \code{metrics} attribute are preserved. +} +\description{ +Strategy for \code{\link[=filter_scores]{filter_scores()}} that restricts the kept +target combinations to those covered by every value listed +in \code{include}. With a single value this keeps only that +value's targets; with several values, the intersection of +their target sets is kept. + +To use a proportion-based threshold over all compare values +instead, use \code{\link[=filter_to_intersection]{filter_to_intersection()}}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() +filter_scores( + scores, + strategy = filter_to_include("EuroCOVIDhub-baseline") +) +} +\seealso{ +\code{\link[=filter_scores]{filter_scores()}}, \code{\link[=filter_to_intersection]{filter_to_intersection()}} +} +\keyword{postprocess-scores} diff --git a/man/filter_to_intersection.Rd b/man/filter_to_intersection.Rd new file mode 100644 index 000000000..5f27eaf2c --- /dev/null +++ b/man/filter_to_intersection.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filter-scores.R +\name{filter_to_intersection} +\alias{filter_to_intersection} +\title{Filter to target combinations meeting a coverage threshold} +\usage{ +filter_to_intersection(min_coverage = 1) +} +\arguments{ +\item{min_coverage}{Numeric between 0 and 1 (default \code{1}). +Minimum proportion of compare values that must cover a +target combination for it to be kept.} +} +\value{ +A strategy function for \code{\link[=filter_scores]{filter_scores()}}. Intended +to be passed to \code{filter_scores()} rather than called +directly — \code{filter_scores()} is where the \code{scores} class +and \code{metrics} attribute are preserved. +} +\description{ +Strategy for \code{\link[=filter_scores]{filter_scores()}} that keeps target combinations +covered by at least \code{min_coverage} of the values in the +\code{compare} column. With the default \code{min_coverage = 1}, only +target combinations present for every compare value are kept +(strict intersection across the full set). + +To restrict to the targets covered by a named subset of +compare values instead of by a proportion, use +\code{\link[=filter_to_include]{filter_to_include()}}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() +filter_scores( + scores, + strategy = filter_to_intersection(min_coverage = 0.75) +) +} +\seealso{ +\code{\link[=filter_scores]{filter_scores()}}, \code{\link[=filter_to_include]{filter_to_include()}} +} +\keyword{postprocess-scores} diff --git a/man/get_forecast_type_ids.Rd b/man/get_forecast_type_ids.Rd index 5bd5b81d7..d624b017b 100644 --- a/man/get_forecast_type_ids.Rd +++ b/man/get_forecast_type_ids.Rd @@ -26,7 +26,7 @@ get_forecast_type_ids(data) } \arguments{ \item{data}{A data.frame (or similar) with predicted and observed values. -See the details section of for additional information +See the "Target format" section in Details for additional information on the required input format.} } \value{ diff --git a/man/impute_mean_score.Rd b/man/impute_mean_score.Rd new file mode 100644 index 000000000..55b32441f --- /dev/null +++ b/man/impute_mean_score.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_mean_score} +\alias{impute_mean_score} +\title{Impute with mean observed score} +\usage{ +impute_mean_score() +} +\value{ +A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +} +\description{ +Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each +missing metric with the mean observed value for that metric +within the same target combination across all values of the +\code{compare} column. Target combinations with no non-NA +observations are filled with \code{NA_real_}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +impute_missing_scores(scores, strategy = impute_mean_score()) +} +\seealso{ +\code{\link[=impute_missing_scores]{impute_missing_scores()}}, \code{\link[=impute_worst_score]{impute_worst_score()}} +} +\keyword{postprocess-scores} diff --git a/man/impute_missing_scores.Rd b/man/impute_missing_scores.Rd new file mode 100644 index 000000000..16c6902f4 --- /dev/null +++ b/man/impute_missing_scores.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_missing_scores} +\alias{impute_missing_scores} +\title{Impute missing scores} +\usage{ +impute_missing_scores(scores, strategy, compare = "model") +} +\arguments{ +\item{scores}{An object of class \code{scores} (a data.table with +an additional \code{metrics} attribute as produced by \code{\link[=score]{score()}}).} + +\item{strategy}{A strategy function with signature +\verb{function(scores, missing_rows, metrics, compare)} that +returns \code{missing_rows} with the metric columns filled. +Built-in options are \code{\link[=impute_worst_score]{impute_worst_score()}}, +\code{\link[=impute_mean_score]{impute_mean_score()}}, \code{\link[=impute_na_score]{impute_na_score()}}, and +\code{\link[=impute_model_score]{impute_model_score()}}. Custom strategies are also +supported.} + +\item{compare}{Character string (default \code{"model"}) naming the +column whose values are compared against each target to +identify missing combinations.} +} +\value{ +A \code{scores} object with an additional \code{.imputed} +column. Rows that were imputed have \code{.imputed = TRUE}. +} +\description{ +Fills in scores for forecast-target combinations that are +missing from the data, using a user-specified imputation +strategy. This is useful to ensure all models are evaluated +on the same set of targets, which avoids bias when +summarising scores. + +Missing combinations are identified by comparing each value +of the \code{compare} column against the union of targets observed +across all values. The strategy is then called to fill the +metric columns for those rows. + +An \code{.imputed} column is added to the output indicating which +rows were imputed (\code{TRUE}) and which are original (\code{FALSE}). +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +# Impute with NA values +impute_missing_scores(scores, strategy = impute_na_score()) +} +\seealso{ +\code{\link[=impute_worst_score]{impute_worst_score()}}, \code{\link[=impute_mean_score]{impute_mean_score()}}, +\code{\link[=impute_na_score]{impute_na_score()}}, \code{\link[=impute_model_score]{impute_model_score()}}, +\code{vignette("handling-missing-forecasts")} +} +\keyword{postprocess-scores} diff --git a/man/impute_model_score.Rd b/man/impute_model_score.Rd new file mode 100644 index 000000000..ca44b1737 --- /dev/null +++ b/man/impute_model_score.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_model_score} +\alias{impute_model_score} +\title{Impute with a reference model's scores} +\usage{ +impute_model_score(model) +} +\arguments{ +\item{model}{Character string naming the reference model +whose scores should be used for imputation. The reference +model must have scores for all target combinations that +need imputing; otherwise an error is raised.} +} +\value{ +A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +} +\description{ +Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills missing +scores with the scores of a specified reference model for +the same target combination. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +impute_missing_scores( + scores, + strategy = impute_model_score("EuroCOVIDhub-baseline") +) +} +\seealso{ +\code{\link[=impute_missing_scores]{impute_missing_scores()}} +} +\keyword{postprocess-scores} diff --git a/man/impute_na_score.Rd b/man/impute_na_score.Rd new file mode 100644 index 000000000..ad676e703 --- /dev/null +++ b/man/impute_na_score.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_na_score} +\alias{impute_na_score} +\title{Impute with NA values} +\usage{ +impute_na_score() +} +\value{ +A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +} +\description{ +Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each +missing metric with \code{NA_real_}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +impute_missing_scores(scores, strategy = impute_na_score()) +} +\seealso{ +\code{\link[=impute_missing_scores]{impute_missing_scores()}} +} +\keyword{postprocess-scores} diff --git a/man/impute_worst_score.Rd b/man/impute_worst_score.Rd new file mode 100644 index 000000000..0868cf041 --- /dev/null +++ b/man/impute_worst_score.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/impute-missing-scores.R +\name{impute_worst_score} +\alias{impute_worst_score} +\title{Impute with worst (maximum) observed score} +\usage{ +impute_worst_score() +} +\value{ +A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}. +} +\description{ +Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each +missing metric with the worst (maximum) observed value for +that metric within the same target combination across all +values of the \code{compare} column. Target combinations with no +non-NA observations are filled with \code{NA_real_}. +} +\examples{ +\dontshow{ + data.table::setDTthreads(2) +} +scores <- example_quantile |> + as_forecast_quantile() |> + score() + +impute_missing_scores(scores, strategy = impute_worst_score()) +} +\seealso{ +\code{\link[=impute_missing_scores]{impute_missing_scores()}}, \code{\link[=impute_mean_score]{impute_mean_score()}} +} +\keyword{postprocess-scores} diff --git a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg index b22701e63..a8d99587f 100644 --- a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg +++ b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg @@ -29,8 +29,8 @@ - + @@ -38,8 +38,8 @@ < 0.001 < 0.001 1 -< 0.001 0.298 +< 0.001 1 0.298 < 0.001 @@ -57,9 +57,9 @@ - + @@ -73,9 +73,9 @@ < 0.001 < 0.001 1 -< 0.001 < 0.001 < 0.001 +< 0.001 1 0.007 < 0.001 diff --git a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg index 372e416d9..aa58d5f93 100644 --- a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg +++ b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg @@ -29,8 +29,8 @@ - + @@ -38,8 +38,8 @@ 1.37 1.59 1 -0.63 0.86 +0.63 1 1.16 0.73 @@ -57,9 +57,9 @@ - + @@ -73,9 +73,9 @@ 3.03 3.85 1 -0.26 0.62 0.79 +0.26 1 0.74 1.27 diff --git a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg index 74ca3f682..fb64844df 100644 --- a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg +++ b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg @@ -26,18 +26,18 @@ - + - + --0.06 0.1 +-0.06 -0.08 -0.07 0.34 +0.07 -0.02 -0.01 diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg index aae13f7d0..00103e3ab 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg @@ -26,14 +26,14 @@ - + - + - + @@ -44,16 +44,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg index 2c21e6aca..a271f2373 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg @@ -26,14 +26,14 @@ - + - + - + @@ -44,16 +44,16 @@ - + - + - + diff --git a/tests/testthat/_snaps/plot_wis/plot-wis.svg b/tests/testthat/_snaps/plot_wis/plot-wis.svg index 6d842785e..c03f3893d 100644 --- a/tests/testthat/_snaps/plot_wis/plot-wis.svg +++ b/tests/testthat/_snaps/plot_wis/plot-wis.svg @@ -26,14 +26,14 @@ - + - + - + @@ -44,16 +44,16 @@ - + - + - + diff --git a/tests/testthat/test-filter-scores.R b/tests/testthat/test-filter-scores.R new file mode 100644 index 000000000..54015a291 --- /dev/null +++ b/tests/testthat/test-filter-scores.R @@ -0,0 +1,333 @@ +# ============================================================================== +# build_missing_grid() +# ============================================================================== +test_that("build_missing_grid() returns correct missing rows", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + missing <- build_missing_grid(scores, compare = "model") + expect_s3_class(missing, "data.table") + expect_equal(nrow(missing), 1) + expect_equal(missing$model, "B") + expect_equal(missing$location, "US") +}) + +test_that("build_missing_grid() returns zero rows when nothing missing", { + scores <- data.table::data.table( + model = c("A", "A", "B", "B"), + location = c("DE", "US", "DE", "US"), + wis = c(1, 2, 3, 4) + ) + scores <- new_scores(scores, "wis") + missing <- build_missing_grid(scores, compare = "model") + expect_equal(nrow(missing), 0) +}) + +test_that("build_missing_grid() uses observed target combos", { + # If we had locations DE, US and dates Mon, Tue but only + # (DE, Mon) and (US, Tue) are observed, the grid should NOT + # include (DE, Tue) or (US, Mon) + scores <- data.table::data.table( + model = c("A", "A", "B", "B"), + location = c("DE", "US", "DE", "US"), + date = c("Mon", "Tue", "Mon", "Tue"), + wis = c(1, 2, 3, 4) + ) + scores <- new_scores(scores, "wis") + missing <- build_missing_grid(scores, compare = "model") + expect_equal(nrow(missing), 0) + + # Now remove one combo - B should be missing (US, Tue) + scores2 <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + date = c("Mon", "Tue", "Mon"), + wis = c(1, 2, 3) + ) + scores2 <- new_scores(scores2, "wis") + missing2 <- build_missing_grid(scores2, compare = "model") + expect_equal(nrow(missing2), 1) + expect_equal(missing2$model, "B") + expect_equal(missing2$location, "US") + expect_equal(missing2$date, "Tue") +}) + + +# ============================================================================== +# filter_scores() +# ============================================================================== +test_that("filter_scores() drops incomplete targets", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + result <- suppressMessages(filter_scores(scores)) + # Only DE should remain (both include have it) + expect_equal(nrow(result), 2) + expect_true(all(result$location == "DE")) +}) + +test_that("filter_scores() preserves class and metrics", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + result <- suppressMessages(filter_scores(scores)) + expect_s3_class(result, "scores") + expect_equal(attr(result, "metrics"), "wis") +}) + +test_that("filter_scores() unchanged when nothing missing", { + scores <- data.table::data.table( + model = c("A", "A", "B", "B"), + location = c("DE", "US", "DE", "US"), + wis = c(1, 2, 3, 4) + ) + scores <- new_scores(scores, "wis") + expect_message( + result <- filter_scores(scores), + "No rows filtered" + ) + expect_equal(nrow(result), 4) +}) + +test_that("filter_scores() reports rows dropped", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + expect_message( + filter_scores(scores), + "Filtered out 1 row" + ) +}) + +test_that("filter_scores() errors on strategy with wrong formals", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + bad_strategy <- function(scores) scores + expect_error( + filter_scores(scores, strategy = bad_strategy), + "missing required" + ) +}) + +test_that("filter_scores() errors on invalid compare column", { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + expect_error( + filter_scores(scores, compare = "nonexistent"), + "nonexistent" + ) +}) + + +# ============================================================================== +# filter_to_intersection() +# ============================================================================== +test_that("filter_to_intersection(min_coverage=0.5) works", { + scores <- data.table::data.table( + model = c("A", "A", "A", "B", "C"), + location = c("DE", "US", "FR", "DE", "DE"), + wis = c(1, 2, 3, 4, 5) + ) + scores <- new_scores(scores, "wis") + strategy <- filter_to_intersection(min_coverage = 0.5) + result <- strategy(scores, compare = "model") + # DE covered by 3/3, US by 1/3, FR by 1/3 + # At min_coverage = 0.5, only DE qualifies + expect_true(all(result$location == "DE")) + + strategy2 <- filter_to_intersection(min_coverage = 1 / 3) + result2 <- strategy2(scores, compare = "model") + # All locations have coverage >= 1/3 + expect_equal(nrow(result2), 5) +}) + +test_that("filter_to_include() keeps targets of single model", { + scores <- data.table::data.table( + model = c("m1", "m1", "m2", "m2", "m3"), + location = c("DE", "US", "DE", "FR", "DE"), + wis = c(1, 2, 3, 4, 5) + ) + scores <- new_scores(scores, "wis") + strategy <- filter_to_include("m1") + result <- strategy(scores, compare = "model") + # m1 covers DE and US, so keep all rows with DE or US + expect_true(all(result$location %in% c("DE", "US"))) + # FR should be dropped + expect_false("FR" %in% result$location) +}) + +test_that("filter_to_include() intersects multiple models", { + scores <- data.table::data.table( + model = c("m1", "m1", "m2", "m2", "m3"), + location = c("DE", "US", "DE", "FR", "DE"), + wis = c(1, 2, 3, 4, 5) + ) + scores <- new_scores(scores, "wis") + strategy <- filter_to_include(c("m1", "m2")) + result <- strategy(scores, compare = "model") + # m1 covers DE, US; m2 covers DE, FR; intersection = DE + expect_true(all(result$location == "DE")) +}) + +test_that("filter_to_include() errors on unknown compare value", { + scores <- data.table::data.table( + model = c("A", "B"), + location = c("DE", "DE"), + wis = c(1, 2) + ) + scores <- new_scores(scores, "wis") + expect_error( + filter_scores( + scores, + strategy = filter_to_include("Z") + ), + "not found" + ) +}) + +test_that("filter_scores() works with non-default compare", { + scores <- data.table::data.table( + forecaster = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + result <- suppressMessages(filter_scores( + scores, compare = "forecaster" + )) + expect_true(all(result$location == "DE")) + expect_equal(nrow(result), 2) +}) + + +# ============================================================================== +# Integration tests with scores_quantile +# ============================================================================== +test_that( + "filter_to_include() with real data filters + to model's targets", + { + scores <- scores_quantile + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + # EpiNow2 is missing 9 death targets out of 256 + epinow2_targets <- unique( + scores[ + model == "epiforecasts-EpiNow2", + target_cols, + with = FALSE + ] + ) + n_epinow2 <- nrow(epinow2_targets) + + result <- suppressMessages(filter_scores( + scores, + strategy = filter_to_include( + "epiforecasts-EpiNow2" + ) + )) + + # All remaining targets should be EpiNow2's targets + result_targets <- unique( + result[, target_cols, with = FALSE] + ) + expect_equal(nrow(result_targets), n_epinow2) + + # The 9 death targets EpiNow2 doesn't cover + # should have been dropped + all_targets <- unique( + scores[, target_cols, with = FALSE] + ) + n_dropped_targets <- nrow(all_targets) - n_epinow2 + expect_equal(n_dropped_targets, 9) + + # Every model in the result should only have + # targets that EpiNow2 covers + data.table::setkeyv(result_targets, target_cols) + data.table::setkeyv(epinow2_targets, target_cols) + expect_equal(result_targets, epinow2_targets) + } +) + +test_that( + "filter_to_intersection(min_coverage) boundary with + real data", + { + scores <- scores_quantile + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + # 4 models total. UMass-MechBayes has no case targets + # (128/256), so case targets are covered by 3/4 = 0.75. + # At min_coverage = 0.75 case targets should be kept. + result_relaxed <- suppressMessages(filter_scores( + scores, + strategy = filter_to_intersection( + min_coverage = 0.75 + ) + )) + relaxed_types <- unique(result_relaxed$target_type) + expect_true("Cases" %in% relaxed_types) + expect_true("Deaths" %in% relaxed_types) + + # At min_coverage = 1.0 (default), case targets + # should be dropped because UMass-MechBayes lacks them. + result_strict <- suppressMessages(filter_scores(scores)) + strict_types <- unique(result_strict$target_type) + expect_false("Cases" %in% strict_types) + expect_true("Deaths" %in% strict_types) + + # Relaxed should have strictly more rows + expect_gt(nrow(result_relaxed), nrow(result_strict)) + } +) + +test_that( + "filter_scores then summarise_scores gives equal target + counts per model", + { + scores <- scores_quantile + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + filtered <- suppressMessages(filter_scores(scores)) + + # Count distinct targets per model + targets_per_model <- filtered[, + .(n_targets = data.table::uniqueN( + .SD[, target_cols, with = FALSE] + )), + by = "model" + ] + # All models should have the same number of targets + expect_length(unique(targets_per_model$n_targets), 1) + + # Summarise should work and give a row per model + summary <- summarise_scores(filtered, by = "model") + expect_equal( + nrow(summary), + length(unique(filtered$model)) + ) + } +) diff --git a/tests/testthat/test-get-protected-columns.R b/tests/testthat/test-get-protected-columns.R index 896dfa6f9..729a5b54b 100644 --- a/tests/testthat/test-get-protected-columns.R +++ b/tests/testthat/test-get-protected-columns.R @@ -6,9 +6,11 @@ test_that("get_protected_columns() works as expected", { get_protected_columns(), c( ".mv_group_id", + ".imputed", "predicted", "observed", "sample_id", "quantile_level", "upper", "lower", "pit_value", - "interval_range", "boundary", "predicted_label", "interval_coverage", + "interval_range", "boundary", "predicted_label", + "interval_coverage", "interval_coverage_deviation", "quantile_coverage", "quantile_coverage_deviation" ) diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R new file mode 100644 index 000000000..c0327bc6d --- /dev/null +++ b/tests/testthat/test-impute-missing-scores.R @@ -0,0 +1,547 @@ +# ============================================================================== +# impute_missing_scores() +# ============================================================================== +test_that( + "impute_missing_scores adds .imputed = FALSE when nothing missing", + { + # Use only models with complete targets + scores <- scores_quantile[ + model %in% c( + "EuroCOVIDhub-ensemble", + "EuroCOVIDhub-baseline" + ) + ] + scores <- new_scores( + scores, get_metrics.scores(scores_quantile) + ) + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_na_score() + )) + expect_true(".imputed" %in% names(result)) + expect_false(any(result$.imputed)) + } +) + +test_that( + "impute_missing_scores preserves scores class and metrics", + { + scores <- scores_quantile + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_na_score() + )) + expect_s3_class(result, "scores") + expect_identical( + get_metrics.scores(result), + get_metrics.scores(scores) + ) + } +) + +test_that(".imputed is not in get_metrics.scores output", { + scores <- scores_quantile + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_na_score() + )) + metrics <- get_metrics.scores(result) + expect_false(".imputed" %in% metrics) +}) + +test_that(".imputed is not in get_forecast_unit output", { + # Requires .imputed in get_protected_columns() + # (added by parallel agent) + skip_if_not( + ".imputed" %in% get_protected_columns(), + ".imputed not yet in get_protected_columns" + ) + scores <- scores_quantile + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_na_score() + )) + fu <- get_forecast_unit(result) + expect_false(".imputed" %in% fu) +}) + +# ============================================================================== +# Strategy factories with missing data +# ============================================================================== + +# scores_quantile already has missing combinations: +# UMass-MechBayes has 128 of 256 targets, +# epiforecasts-EpiNow2 has 247 of 256. +# Use it directly for tests needing missing data. + +test_that("impute_worst_score fills with max observed score", { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- scores_quantile + metrics <- get_metrics.scores(scores) + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_worst_score() + )) + # Imputed rows should exist + + imputed <- result[(.imputed)] + if (nrow(imputed) > 0) { + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + # Each imputed metric value should equal the max of + # that metric within the same target combination + for (m in metrics) { + if (!(m %in% names(imputed)) || + !(m %in% names(scores))) next + max_per_target <- scores[, + .(..max = max(get(m), na.rm = TRUE)), + by = target_cols + ] + merged <- merge( + imputed, max_per_target, + by = target_cols, all.x = TRUE + ) + expect_true( + all( + merged[[m]] == merged[["..max"]] | + is.na(merged[[m]]) + ), + info = paste( + "metric", m, + "does not match per-target max" + ) + ) + } + } +}) + +test_that("impute_mean_score fills with mean observed score", { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- scores_quantile + metrics <- get_metrics.scores(scores) + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_mean_score() + )) + imputed <- result[(.imputed)] + expect_gt(nrow(imputed), 0) + + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + for (m in metrics) { + if (!(m %in% names(imputed)) || + !(m %in% names(scores))) next + mean_per_target <- scores[, + .(..mean = mean(get(m), na.rm = TRUE)), + by = target_cols + ] + merged <- merge( + imputed, mean_per_target, + by = target_cols, all.x = TRUE + ) + expect_true( + all( + abs(merged[[m]] - merged[["..mean"]]) < 1e-10 | + is.na(merged[[m]]) + ), + info = paste( + "metric", m, + "does not match per-target mean" + ) + ) + } +}) + +test_that("impute_na_score fills with NA_real_", { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- scores_quantile + metrics <- get_metrics.scores(scores) + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_na_score() + )) + imputed <- result[(.imputed)] + if (nrow(imputed) > 0) { + for (m in metrics) { + if (m %in% names(imputed)) { + expect_true( + all(is.na(imputed[[m]])), + info = paste("metric", m, "should be NA") + ) + } + } + } +}) + +test_that( + "impute_model_score fills with reference model scores", + { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- scores_quantile + # EuroCOVIDhub-baseline has all 256 targets so can + # serve as reference for all missing combinations + result <- suppressMessages(impute_missing_scores( + scores, + strategy = impute_model_score( + "EuroCOVIDhub-baseline" + ) + )) + expect_s3_class(result, "scores") + } +) + +test_that( + "impute_model_score errors when ref model missing target", + { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- scores_quantile + # UMass-MechBayes only has 128 of 256 targets, so + # it cannot serve as reference for imputing all missing + # target combinations + expect_error( + impute_missing_scores( + scores, + strategy = impute_model_score("UMass-MechBayes") + ) + ) + } +) + +test_that("custom strategy function works", { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + custom_strategy <- function(scores, missing_rows, metrics, + compare) { + for (m in metrics) { + data.table::set(missing_rows, j = m, value = 999) + } + return(missing_rows) + } + scores <- scores_quantile + metrics <- get_metrics.scores(scores) + result <- suppressMessages(impute_missing_scores( + scores, strategy = custom_strategy + )) + imputed <- result[(.imputed)] + if (nrow(imputed) > 0) { + for (m in metrics) { + if (m %in% names(imputed)) { + expect_true(all(imputed[[m]] == 999)) + } + } + } +}) + +# ============================================================================== +# Integration tests +# ============================================================================== +test_that( + "impute_missing_scores |> summarise_scores works", + { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + scores <- scores_quantile + imputed <- suppressMessages(impute_missing_scores( + scores, strategy = impute_na_score() + )) + result <- summarise_scores(imputed, by = "model") + expect_s3_class(result, "data.table") + } +) + +test_that( + "filter then impute pipeline works", + { + skip_if_not( + exists("build_missing_grid", + where = asNamespace("scoringutils")), + "build_missing_grid not yet available" + ) + skip_if_not( + exists("filter_scores", + where = asNamespace("scoringutils")), + "filter_scores not yet available" + ) + skip_if_not( + exists("filter_to_include", + where = asNamespace("scoringutils")), + "filter_to_include not yet available" + ) + scores <- scores_quantile + ref_model <- "EuroCOVIDhub-baseline" + + filtered <- suppressMessages(filter_scores( + scores, + strategy = filter_to_include(ref_model) + )) + result <- suppressMessages(impute_missing_scores( + filtered, + strategy = impute_model_score(ref_model) + )) + expect_s3_class(result, "scores") + } +) + +test_that( + "impute_worst_score skips metrics not in columns", + { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + # Attribute claims "wis" and "fake" but "fake" is + # not a column + scores <- new_scores(scores, c("wis", "fake")) + result <- suppressWarnings(suppressMessages( + impute_missing_scores( + scores, strategy = impute_worst_score() + ) + )) + imputed <- result[result$.imputed] + expect_equal(nrow(imputed), 1) + expect_false("fake" %in% names(imputed)) + } +) + +test_that( + "impute_mean_score skips metrics not in columns", + { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, c("wis", "fake")) + result <- suppressWarnings(suppressMessages( + impute_missing_scores( + scores, strategy = impute_mean_score() + ) + )) + imputed <- result[result$.imputed] + expect_equal(nrow(imputed), 1) + expect_false("fake" %in% names(imputed)) + } +) + +test_that( + "impute_model_score errors for nonexistent model", + { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + expect_error( + impute_missing_scores( + scores, + strategy = impute_model_score("nonexistent") + ), + "not found" + ) + } +) + +test_that( + "impute_worst_score returns NA when all scores for a target are NA", + { + scores <- data.table::data.table( + model = c("A", "A", "B", "C"), + location = c("DE", "US", "DE", "DE"), + wis = c(NA_real_, 2, NA_real_, NA_real_) + ) + scores <- new_scores(scores, "wis") + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_worst_score() + )) + # B and C are missing US; all DE wis are NA so the + # imputed DE rows for any model should be NA, not -Inf. + imputed <- result[(.imputed) & location == "DE"] + expect_true(all(is.na(imputed$wis))) + expect_false(any(is.infinite(imputed$wis))) + } +) + +test_that( + "impute_mean_score returns NA when all scores for a target are NA", + { + scores <- data.table::data.table( + model = c("A", "A", "B", "C"), + location = c("DE", "US", "DE", "DE"), + wis = c(NA_real_, 2, NA_real_, NA_real_) + ) + scores <- new_scores(scores, "wis") + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_mean_score() + )) + imputed <- result[(.imputed) & location == "DE"] + expect_true(all(is.na(imputed$wis))) + expect_false(any(is.nan(imputed$wis))) + } +) + +test_that( + "impute_missing_scores errors on strategy with wrong formals", + { + scores <- data.table::data.table( + model = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + bad_strategy <- function(scores, missing_rows) missing_rows + expect_error( + impute_missing_scores(scores, strategy = bad_strategy), + "missing required" + ) + } +) + +test_that( + "impute_missing_scores works with non-default compare", + { + scores <- data.table::data.table( + forecaster = c("A", "A", "B"), + location = c("DE", "US", "DE"), + wis = c(1, 2, 3) + ) + scores <- new_scores(scores, "wis") + result <- suppressMessages(impute_missing_scores( + scores, + strategy = impute_worst_score(), + compare = "forecaster" + )) + expect_true(".imputed" %in% names(result)) + imputed <- result[result$.imputed] + expect_equal(nrow(imputed), 1) + expect_equal(imputed$forecaster, "B") + expect_equal(imputed$location, "US") + } +) + + +# ============================================================================== +# Additional integration tests +# ============================================================================== +test_that( + "impute then summarise gives equal row counts per model", + { + scores <- scores_quantile + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_na_score() + )) + + # After imputation, every model should have the same + # number of rows (one per target combination) + rows_per_model <- result[, .N, by = "model"] + expect_equal( + length(unique(rows_per_model$N)), 1, + info = paste( + "Expected equal rows per model, got:", + paste( + rows_per_model$model, + rows_per_model$N, + sep = "=", collapse = ", " + ) + ) + ) + } +) + +test_that( + "impute_model_score values match reference model scores", + { + scores <- scores_quantile + metrics <- attr(scores, "metrics") + fu <- get_forecast_unit(scores) + target_cols <- setdiff(fu, "model") + ref_name <- "EuroCOVIDhub-baseline" + + result <- suppressMessages(impute_missing_scores( + scores, + strategy = impute_model_score(ref_name) + )) + imputed <- result[(.imputed)] + expect_gt(nrow(imputed), 0) + + # Get the reference model's actual scores + ref_scores <- scores[model == ref_name] + + # For each imputed row, the metric values should equal + # the reference model's scores for the same target + merged <- merge( + imputed, + ref_scores, + by = target_cols, + suffixes = c(".imputed", ".ref") + ) + expect_equal(nrow(merged), nrow(imputed)) + + for (m in metrics) { + col_imp <- paste0(m, ".imputed") + col_ref <- paste0(m, ".ref") + if (col_imp %in% names(merged) && + col_ref %in% names(merged)) { + expect_equal( + merged[[col_imp]], + merged[[col_ref]], + info = paste( + "Imputed", m, + "should match reference model" + ) + ) + } + } + } +) + +test_that( + "imputation does not alter original score values", + { + scores <- scores_quantile + metrics <- get_metrics.scores(scores) + result <- suppressMessages(impute_missing_scores( + scores, strategy = impute_worst_score() + )) + + originals <- result[!(.imputed)] + originals[, .imputed := NULL] + + # Original metric values should be unchanged + for (m in metrics) { + if (!(m %in% names(originals))) next + data.table::setkeyv(originals, names(scores)) + data.table::setkeyv(scores, names(scores)) + expect_equal( + as.numeric(originals[[m]]), + as.numeric(scores[[m]]), + info = paste("metric", m, "changed") + ) + } + } +) diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd new file mode 100644 index 000000000..5658119a9 --- /dev/null +++ b/vignettes/handling-missing-forecasts.Rmd @@ -0,0 +1,202 @@ +--- +title: "Handling missing forecasts" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Handling missing forecasts} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, comment = "#>" +) +data.table::setDTthreads(2) +``` + +When comparing forecast models, not all models will have made predictions for every target. +Naively averaging scores across different sets of targets can produce misleading summaries. +This vignette walks through diagnosing which targets each model covers, scoring the forecasts, and then adjusting the scores using filtering and imputation so that summaries are comparable. + +The approaches here were initially inspired by [Kim, Ray & Reich (2026)](https://doi.org/10.1016/j.ijforecast.2025.12.006), who discuss the importance of handling missing forecasts when evaluating model contributions beyond simple leaderboard rankings. + +## Diagnosing missingness + +Before adjusting scores we should aim to understand the patterns of potential missingness. +Models may have different coverage for legitimate reasons (a model may only forecast deaths, not cases) or because of operational failures (a missed submission deadline). + +`get_forecast_counts()` tabulates how many forecasts each model has, grouped by the columns you choose. + +```{r counts} +library(scoringutils) +fc <- as_forecast_quantile(example_quantile) +get_forecast_counts(fc, by = c("model", "target_type")) +``` + +`UMass-MechBayes` does not forecast cases at all, and `epiforecasts-EpiNow2` has fewer death forecasts than the other models. + +To see exactly which death targets `epiforecasts-EpiNow2` is missing, we can request counts at a finer level and filter to zero-count rows. + +```{r missing-detail} +death_counts <- get_forecast_counts( + fc, + by = c("model", "target_type", "location", "target_end_date") +) +death_counts[ + model == "epiforecasts-EpiNow2" & + target_type == "Deaths" & + count == 0 +] +``` + +Note that `get_pairwise_comparisons()` handles missingness internally, restricting each pair of models to their shared set of targets before comparing. +The functions below give you explicit control over how to handle missingness when computing score summaries. + +## Scoring + +We score all forecasts first, then adjust the resulting scores table. +Both `filter_scores()` and `impute_missing_scores()` accept a `compare` argument (default `"model"`) that specifies which column identifies the units being compared. + +```{r score} +scores <- score(fc) +``` + +A naive summary averages over different numbers of targets per model, which can make direct comparison misleading. + +```{r naive-summary} +summarise_scores(scores, by = "model") +``` + +The sections below show two approaches to addressing this: filtering scores to a common set of targets, and imputing scores for missing targets. + +## Filtering to a common set of targets + +`filter_scores()` removes scores based on the supplied strategy. +This can be appropriate when a model legitimately does not cover certain targets and you want to compare only on shared ground. + +The default strategy, `filter_to_intersection()`, keeps only targets covered by **all** models. + +```{r filter-default} +scores_filtered <- filter_scores(scores) +summarise_scores(scores_filtered, by = "model") +``` + +This drops all case targets (since `UMass-MechBayes` has no case forecasts) and the death targets `epiforecasts-EpiNow2` missed. + +### Requiring partial coverage + +The default requires every model to cover a target for it to be kept. +The `min_coverage` argument relaxes this by keeping targets covered by at least a given proportion of models. +With four models, `min_coverage = 0.75` requires coverage by at least three. + +```{r filter-relaxed} +scores_relaxed <- filter_scores( + scores, + strategy = filter_to_intersection(min_coverage = 0.75) +) +summarise_scores(scores_relaxed, by = "model") +``` + +In this example, case targets are covered by three of four models and death targets by three or four, so `min_coverage = 0.75` retains all targets. +The default (`min_coverage = 1`) is stricter and drops all case targets because `UMass-MechBayes` has no case forecasts. +Between 0.75 and 1.0 no intermediate threshold changes the result here because no target is covered by exactly three out of four models while being missing for one that isn't `UMass-MechBayes`. + +### Filtering to a specific model's targets + +`filter_to_include()` restricts to targets covered by named models. +For example, to evaluate all models only on the targets `epiforecasts-EpiNow2` covered: + +```{r filter-include} +scores_epinow2 <- filter_scores( + scores, + strategy = filter_to_include("epiforecasts-EpiNow2") +) +summarise_scores(scores_epinow2, by = "model") +``` + +This keeps both case and death targets where EpiNow2 submitted forecasts, but `UMass-MechBayes` will still be missing case scores in the result. + +## Imputing missing scores + +Instead of dropping data, `impute_missing_scores()` fills in scores for target combinations a model did not cover. +Imputed rows are marked with `.imputed = TRUE` so they can be identified later. + +### NA + +The simplest option: fill missing scores with `NA`. +This preserves the structure of the data without making assumptions about what the score would have been. +`NA` values propagate through summaries, so this can also serve as a diagnostic check to confirm where missingness exists. + +```{r impute-na} +scores_na <- impute_missing_scores( + scores, + strategy = impute_na_score() +) +summarise_scores(scores_na, by = "model") +``` + +### Worst score + +Fill each missing score with the worst (maximum) observed score for that target across all models. +This penalises models most heavily for missing targets. + +```{r impute-worst} +scores_worst <- impute_missing_scores( + scores, + strategy = impute_worst_score() +) +summarise_scores(scores_worst, by = "model") +``` + +We can check that the imputed rows match the models and targets we identified as missing earlier. + +```{r imputed-check} +scores_worst[ + (.imputed), + .(n_imputed = .N), + by = c("model", "target_type") +] +``` + +### Reference model + +Fill with the scores of a named baseline model, treating a missing forecast as performing no better than that baseline. + +```{r impute-model} +scores_ref <- impute_missing_scores( + scores, + strategy = impute_model_score("EuroCOVIDhub-baseline") +) +summarise_scores(scores_ref, by = "model") +``` + +This is a reasonable default when a suitable baseline exists, though more research is needed on best practice for choosing the reference model and understanding the impact of this choice. + +### Mean score + +Fill with the mean score across models that did forecast each target. +This is the least severe penalty as it assigns the average performance, which may be close to the ensemble performance. + +```{r impute-mean} +scores_mean <- impute_missing_scores( + scores, + strategy = impute_mean_score() +) +summarise_scores(scores_mean, by = "model") +``` + +## Combining filter and impute + +Consider combining filtering and imputation when you want to focus on a specific model's targets but still need complete scores for all models. +For example, to evaluate on the targets `epiforecasts-EpiNow2` covered and then impute scores for models that are missing forecasts within that set: + +```{r pipeline} +result <- scores |> + filter_scores( + strategy = filter_to_include("epiforecasts-EpiNow2") + ) |> + impute_missing_scores( + strategy = impute_worst_score() + ) +summarise_scores(result, by = "model") +```