diff --git a/NAMESPACE b/NAMESPACE
index 93237a640..8dea0e210 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -78,6 +78,9 @@ export(dispersion_quantile)
export(dispersion_sample)
export(dss_sample)
export(energy_score_multivariate)
+export(filter_scores)
+export(filter_to_include)
+export(filter_to_intersection)
export(get_correlations)
export(get_coverage)
export(get_duplicate_forecasts)
@@ -88,6 +91,11 @@ export(get_grouping)
export(get_metrics)
export(get_pairwise_comparisons)
export(get_pit_histogram)
+export(impute_mean_score)
+export(impute_missing_scores)
+export(impute_model_score)
+export(impute_na_score)
+export(impute_worst_score)
export(interval_coverage)
export(is_forecast)
export(is_forecast_binary)
@@ -178,12 +186,14 @@ importFrom(data.table,key)
importFrom(data.table,melt)
importFrom(data.table,nafill)
importFrom(data.table,rbindlist)
+importFrom(data.table,set)
importFrom(data.table,setDT)
importFrom(data.table,setattr)
importFrom(data.table,setcolorder)
importFrom(data.table,setkeyv)
importFrom(data.table,setnames)
importFrom(data.table,setorderv)
+importFrom(data.table,uniqueN)
importFrom(ggplot2,.data)
importFrom(ggplot2,`%+replace%`)
importFrom(ggplot2,aes)
diff --git a/NEWS.md b/NEWS.md
index 52d75f99a..e5eced91f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,6 @@
# scoringutils (development version)
+- Added `filter_scores()` and `impute_missing_scores()` for handling missing forecasts before summarisation. `filter_scores()` removes target combinations with insufficient model coverage, while `impute_missing_scores()` fills in missing scores using configurable strategies (worst, mean, NA, or reference model). Both use a strategy function pattern for extensibility. See `vignette("handling-missing-forecasts")` for details (#1122).
- Added internal S3 generic `get_forecast_type_ids()` so each forecast type declares the columns (beyond the forecast unit) that identify a unique row. `get_duplicate_forecasts()` now uses this instead of hard-coded column names (#888).
- Removed the deprecated vignettes `Deprecated-functions` and `Deprecated-visualisations`. The code for removed functions (`plot_predictions()`, `make_NA()`, `plot_ranges()`, `plot_score_table()`, `merge_pred_and_obs()`) can still be found in the [git history](https://github.com/epiforecasts/scoringutils/tree/d0cd8e2/vignettes) (#1158).
diff --git a/R/filter-scores.R b/R/filter-scores.R
new file mode 100644
index 000000000..eb4c0daec
--- /dev/null
+++ b/R/filter-scores.R
@@ -0,0 +1,228 @@
+#' @title Filter scores
+#'
+#' @description
+#' Filter a `scores` object using a supplied strategy function.
+#' `filter_scores()` is responsible for preserving the `scores`
+#' class and the `metrics` attribute; the strategy is
+#' responsible only for the filtering logic.
+#'
+#' Strategies are constructed by helpers such as
+#' [filter_to_intersection()] and [filter_to_include()] and can
+#' also be user-defined. A strategy is a function with
+#' signature `function(scores, compare)` that returns a
+#' filtered data.table with the same columns as its input.
+#'
+#' @param scores An object of class `scores` (a data.table with
+#' an additional `metrics` attribute as produced by [score()]).
+#' @param strategy A strategy function. See Description for the
+#' expected signature. Default: [filter_to_intersection()].
+#' @param compare Character string (default `"model"`) naming the
+#' column whose values are compared when deciding which
+#' target combinations to keep.
+#'
+#' @return A `scores` object with the same class and `metrics`
+#' attribute as the input, with rows filtered according to
+#' `strategy`.
+#'
+#' @seealso [filter_to_intersection()], [filter_to_include()],
+#' \code{vignette("handling-missing-forecasts")}
+#' @importFrom cli cli_inform
+#' @importFrom checkmate assert_class assert_character
+#' assert_subset
+#' @export
+#' @keywords postprocess-scores
+#' @examples
+#' \dontshow{
+#' data.table::setDTthreads(2)
+#' }
+#' scores <- example_quantile |>
+#' as_forecast_quantile() |>
+#' score()
+#'
+#' # Keep only targets covered by every model (the default)
+#' filter_scores(scores)
+#'
+#' # Keep targets covered by at least 75% of models
+#' filter_scores(
+#' scores,
+#' strategy = filter_to_intersection(min_coverage = 0.75)
+#' )
+#'
+#' # Keep only targets covered by a named model
+#' filter_scores(
+#' scores,
+#' strategy = filter_to_include("EuroCOVIDhub-baseline")
+#' )
+filter_scores <- function(
+ scores,
+ strategy = filter_to_intersection(),
+ compare = "model"
+) {
+ assert_class(scores, "scores")
+ assert_character(compare, len = 1)
+ assert_subset(compare, names(scores))
+ assert_strategy(strategy, required = "compare")
+
+ original_metrics <- attr(scores, "metrics")
+
+ result <- strategy(scores, compare = compare)
+
+ n_before <- nrow(scores)
+ n_after <- nrow(result)
+ n_dropped <- n_before - n_after
+
+ if (n_dropped == 0) {
+ cli_inform(c(
+ i = "No rows filtered. Returning scores unchanged."
+ ))
+ return(scores)
+ }
+
+ cli_inform(c(
+ i = "Filtered out {n_dropped} rows.",
+ i = "{n_after} of {n_before} rows remaining." # nolint: duplicate_argument_linter
+ ))
+
+ return(new_scores(result, original_metrics))
+}
+
+
+#' @title Filter to target combinations meeting a coverage threshold
+#'
+#' @description
+#' Strategy for [filter_scores()] that keeps target combinations
+#' covered by at least `min_coverage` of the values in the
+#' `compare` column. With the default `min_coverage = 1`, only
+#' target combinations present for every compare value are kept
+#' (strict intersection across the full set).
+#'
+#' To restrict to the targets covered by a named subset of
+#' compare values instead of by a proportion, use
+#' [filter_to_include()].
+#'
+#' @param min_coverage Numeric between 0 and 1 (default `1`).
+#' Minimum proportion of compare values that must cover a
+#' target combination for it to be kept.
+#'
+#' @return A strategy function for [filter_scores()]. Intended
+#' to be passed to `filter_scores()` rather than called
+#' directly — `filter_scores()` is where the `scores` class
+#' and `metrics` attribute are preserved.
+#'
+#' @seealso [filter_scores()], [filter_to_include()]
+#' @importFrom data.table as.data.table setkeyv uniqueN
+#' @importFrom checkmate assert_number
+#' @export
+#' @keywords postprocess-scores
+#' @examples
+#' \dontshow{
+#' data.table::setDTthreads(2)
+#' }
+#' scores <- example_quantile |>
+#' as_forecast_quantile() |>
+#' score()
+#' filter_scores(
+#' scores,
+#' strategy = filter_to_intersection(min_coverage = 0.75)
+#' )
+filter_to_intersection <- function(min_coverage = 1) {
+ assert_number(min_coverage, lower = 0, upper = 1)
+
+ function(scores, compare = "model") {
+ scores <- data.table::as.data.table(scores)
+ forecast_unit <- get_forecast_unit(scores)
+ target_cols <- setdiff(forecast_unit, compare)
+
+ n_total <- data.table::uniqueN(scores[[compare]])
+
+ target_coverage <- scores[,
+ .(n_compare = data.table::uniqueN(get(compare))),
+ by = target_cols
+ ]
+
+ keep <- target_coverage$n_compare / n_total >= min_coverage
+ qualifying <- target_coverage[keep, target_cols, with = FALSE]
+
+ data.table::setkeyv(scores, target_cols)
+ data.table::setkeyv(qualifying, target_cols)
+ scores[qualifying, nomatch = NULL]
+ }
+}
+
+
+#' @title Filter to targets covered by named compare values
+#'
+#' @description
+#' Strategy for [filter_scores()] that restricts the kept
+#' target combinations to those covered by every value listed
+#' in `include`. With a single value this keeps only that
+#' value's targets; with several values, the intersection of
+#' their target sets is kept.
+#'
+#' To use a proportion-based threshold over all compare values
+#' instead, use [filter_to_intersection()].
+#'
+#' @param include Character vector of length one or more. Values
+#' from the `compare` column whose target sets should be
+#' intersected.
+#'
+#' @return A strategy function for [filter_scores()]. Intended
+#' to be passed to `filter_scores()` rather than called
+#' directly — `filter_scores()` is where the `scores` class
+#' and `metrics` attribute are preserved.
+#'
+#' @seealso [filter_scores()], [filter_to_intersection()]
+#' @importFrom data.table as.data.table setkeyv
+#' @importFrom checkmate assert_character
+#' @importFrom cli cli_abort
+#' @export
+#' @keywords postprocess-scores
+#' @examples
+#' \dontshow{
+#' data.table::setDTthreads(2)
+#' }
+#' scores <- example_quantile |>
+#' as_forecast_quantile() |>
+#' score()
+#' filter_scores(
+#' scores,
+#' strategy = filter_to_include("EuroCOVIDhub-baseline")
+#' )
+filter_to_include <- function(include) {
+ assert_character(include, min.len = 1)
+
+ function(scores, compare = "model") {
+ scores <- data.table::as.data.table(scores)
+ forecast_unit <- get_forecast_unit(scores)
+ target_cols <- setdiff(forecast_unit, compare)
+
+ unknown <- setdiff(include, unique(scores[[compare]]))
+ if (length(unknown) > 0) {
+ cli_abort(c(
+ "!" = paste0(
+ "{.val {unknown}} not found in ",
+ "{.arg {compare}} column."
+ )
+ ))
+ }
+
+ target_sets <- lapply(include, function(v) {
+ unique(
+ scores[
+ scores[[compare]] == v,
+ target_cols,
+ with = FALSE
+ ]
+ )
+ })
+
+ qualifying <- Reduce(
+ function(a, b) merge(a, b, by = target_cols),
+ target_sets
+ )
+
+ data.table::setkeyv(scores, target_cols)
+ data.table::setkeyv(qualifying, target_cols)
+ scores[qualifying, nomatch = NULL]
+ }
+}
diff --git a/R/get-protected-columns.R b/R/get-protected-columns.R
index 0cf59bb39..839d24861 100644
--- a/R/get-protected-columns.R
+++ b/R/get-protected-columns.R
@@ -15,6 +15,7 @@ get_protected_columns <- function(data = NULL) {
protected_columns <- c(
".mv_group_id",
+ ".imputed",
"predicted", "observed", "sample_id", "quantile_level", "upper", "lower",
"pit_value", "interval_range", "boundary", "predicted_label",
"interval_coverage", "interval_coverage_deviation",
diff --git a/R/impute-missing-scores.R b/R/impute-missing-scores.R
new file mode 100644
index 000000000..86ec46c0d
--- /dev/null
+++ b/R/impute-missing-scores.R
@@ -0,0 +1,316 @@
+#' @title Impute missing scores
+#'
+#' @description
+#' Fills in scores for forecast-target combinations that are
+#' missing from the data, using a user-specified imputation
+#' strategy. This is useful to ensure all models are evaluated
+#' on the same set of targets, which avoids bias when
+#' summarising scores.
+#'
+#' Missing combinations are identified by comparing each value
+#' of the `compare` column against the union of targets observed
+#' across all values. The strategy is then called to fill the
+#' metric columns for those rows.
+#'
+#' An `.imputed` column is added to the output indicating which
+#' rows were imputed (`TRUE`) and which are original (`FALSE`).
+#'
+#' @param scores An object of class `scores` (a data.table with
+#' an additional `metrics` attribute as produced by [score()]).
+#' @param strategy A strategy function with signature
+#' `function(scores, missing_rows, metrics, compare)` that
+#' returns `missing_rows` with the metric columns filled.
+#' Built-in options are [impute_worst_score()],
+#' [impute_mean_score()], [impute_na_score()], and
+#' [impute_model_score()]. Custom strategies are also
+#' supported.
+#' @param compare Character string (default `"model"`) naming the
+#' column whose values are compared against each target to
+#' identify missing combinations.
+#'
+#' @return A `scores` object with an additional `.imputed`
+#' column. Rows that were imputed have `.imputed = TRUE`.
+#'
+#' @seealso [impute_worst_score()], [impute_mean_score()],
+#' [impute_na_score()], [impute_model_score()],
+#' \code{vignette("handling-missing-forecasts")}
+#' @importFrom data.table copy set rbindlist setattr
+#' @importFrom checkmate assert_class assert_character
+#' assert_subset
+#' @importFrom cli cli_abort cli_inform
+#' @export
+#' @keywords postprocess-scores
+#' @examples
+#' \dontshow{
+#' data.table::setDTthreads(2)
+#' }
+#' scores <- example_quantile |>
+#' as_forecast_quantile() |>
+#' score()
+#'
+#' # Impute with NA values
+#' impute_missing_scores(scores, strategy = impute_na_score())
+impute_missing_scores <- function(
+ scores,
+ strategy,
+ compare = "model"
+) {
+ assert_class(scores, "scores")
+ metrics <- get_metrics.scores(scores, error = TRUE)
+ assert_character(compare, len = 1)
+ assert_subset(compare, names(scores))
+ assert_strategy(
+ strategy,
+ required = c("scores", "missing_rows", "metrics", "compare")
+ )
+
+ scores <- copy(scores)
+
+ missing_rows <- build_missing_grid(scores, compare) # nolint: object_usage_linter
+
+ if (nrow(missing_rows) == 0) {
+ cli_inform(c(
+ i = "No missing scores to impute. Returning scores unchanged."
+ ))
+ data.table::set(scores, j = ".imputed", value = FALSE)
+ return(scores[])
+ }
+
+ n_missing <- nrow(missing_rows) # nolint: object_usage_linter
+ n_comparators <- length(unique(missing_rows[[compare]])) # nolint: object_usage_linter, line_length_linter
+ cli_inform(c(
+ i = "Imputing {n_missing} missing score row{?s}.",
+ i = "{n_comparators} {compare} {cli::qty(n_comparators)}value{?s} affected." # nolint: line_length_linter
+ ))
+
+ filled <- strategy(scores, missing_rows, metrics, compare)
+
+ data.table::set(filled, j = ".imputed", value = TRUE)
+ data.table::set(scores, j = ".imputed", value = FALSE)
+
+ out <- rbindlist(
+ list(scores, filled),
+ use.names = TRUE,
+ fill = TRUE
+ )
+
+ return(new_scores(out, metrics))
+}
+
+
+# Shared implementation for simple summary-based imputation
+# (e.g. max, mean). `fn` is a summary function applied to each
+# metric within the same target combination across all compare
+# values. NA values are ignored; target combinations with no
+# non-NA observations produce NA rather than -Inf or NaN.
+impute_summary_score <- function(fn) {
+ safe_fn <- function(x) {
+ x <- x[!is.na(x)]
+ if (length(x) == 0) {
+ return(NA_real_)
+ }
+ fn(x)
+ }
+ function(scores, missing_rows, metrics, compare) {
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, compare)
+
+ for (m in metrics) {
+ if (!(m %in% names(scores))) next
+ agg <- scores[,
+ .(..val = safe_fn(get(m))),
+ by = target_cols
+ ]
+ missing_rows <- merge(
+ missing_rows,
+ agg,
+ by = target_cols,
+ all.x = TRUE
+ )
+ data.table::set(
+ missing_rows,
+ j = m,
+ value = missing_rows[["..val"]]
+ )
+ data.table::set(
+ missing_rows,
+ j = "..val",
+ value = NULL
+ )
+ }
+ return(missing_rows)
+ }
+}
+
+
+#' @title Impute with worst (maximum) observed score
+#'
+#' @description
+#' Strategy for [impute_missing_scores()] that fills each
+#' missing metric with the worst (maximum) observed value for
+#' that metric within the same target combination across all
+#' values of the `compare` column. Target combinations with no
+#' non-NA observations are filled with `NA_real_`.
+#'
+#' @return A strategy function for [impute_missing_scores()].
+#' @seealso [impute_missing_scores()], [impute_mean_score()]
+#' @export
+#' @keywords postprocess-scores
+#' @examples
+#' \dontshow{
+#' data.table::setDTthreads(2)
+#' }
+#' scores <- example_quantile |>
+#' as_forecast_quantile() |>
+#' score()
+#'
+#' impute_missing_scores(scores, strategy = impute_worst_score())
+impute_worst_score <- function() {
+ impute_summary_score(max)
+}
+
+
+#' @title Impute with mean observed score
+#'
+#' @description
+#' Strategy for [impute_missing_scores()] that fills each
+#' missing metric with the mean observed value for that metric
+#' within the same target combination across all values of the
+#' `compare` column. Target combinations with no non-NA
+#' observations are filled with `NA_real_`.
+#'
+#' @return A strategy function for [impute_missing_scores()].
+#' @seealso [impute_missing_scores()], [impute_worst_score()]
+#' @export
+#' @keywords postprocess-scores
+#' @examples
+#' \dontshow{
+#' data.table::setDTthreads(2)
+#' }
+#' scores <- example_quantile |>
+#' as_forecast_quantile() |>
+#' score()
+#'
+#' impute_missing_scores(scores, strategy = impute_mean_score())
+impute_mean_score <- function() {
+ impute_summary_score(mean)
+}
+
+
+#' @title Impute with NA values
+#'
+#' @description
+#' Strategy for [impute_missing_scores()] that fills each
+#' missing metric with `NA_real_`.
+#'
+#' @return A strategy function for [impute_missing_scores()].
+#' @seealso [impute_missing_scores()]
+#' @export
+#' @keywords postprocess-scores
+#' @examples
+#' \dontshow{
+#' data.table::setDTthreads(2)
+#' }
+#' scores <- example_quantile |>
+#' as_forecast_quantile() |>
+#' score()
+#'
+#' impute_missing_scores(scores, strategy = impute_na_score())
+impute_na_score <- function() {
+ function(scores, missing_rows, metrics, compare) {
+ for (m in metrics) {
+ data.table::set(missing_rows, j = m, value = NA_real_)
+ }
+ return(missing_rows)
+ }
+}
+
+
+#' @title Impute with a reference model's scores
+#'
+#' @description
+#' Strategy for [impute_missing_scores()] that fills missing
+#' scores with the scores of a specified reference model for
+#' the same target combination.
+#'
+#' @param model Character string naming the reference model
+#' whose scores should be used for imputation. The reference
+#' model must have scores for all target combinations that
+#' need imputing; otherwise an error is raised.
+#'
+#' @return A strategy function for [impute_missing_scores()].
+#' @seealso [impute_missing_scores()]
+#' @importFrom cli cli_abort
+#' @export
+#' @keywords postprocess-scores
+#' @examples
+#' \dontshow{
+#' data.table::setDTthreads(2)
+#' }
+#' scores <- example_quantile |>
+#' as_forecast_quantile() |>
+#' score()
+#'
+#' impute_missing_scores(
+#' scores,
+#' strategy = impute_model_score("EuroCOVIDhub-baseline")
+#' )
+impute_model_score <- function(model) {
+ assert_character(model, len = 1)
+ # Store under a different name to avoid collision with
+ # the "model" column in data.table expressions below.
+ ref_model_name <- model
+ function(scores, missing_rows, metrics, compare) {
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, compare)
+
+ ref <- scores[
+ get(compare) == ref_model_name
+ ]
+
+ if (nrow(ref) == 0) {
+ cli_abort(c(
+ "!" = paste0(
+ "Reference model {.val {ref_model_name}} ",
+ "not found in scores."
+ )
+ ))
+ }
+
+ needed <- unique(
+ missing_rows[, target_cols, with = FALSE]
+ )
+ available <- unique(
+ ref[, target_cols, with = FALSE]
+ )
+ missing_targets <- needed[!available, on = target_cols]
+ if (nrow(missing_targets) > 0) {
+ cli_abort(c(
+ "!" = paste0(
+ "Reference model {.val {ref_model_name}} ",
+ "is missing scores for ",
+ "{nrow(missing_targets)} target ",
+ "combination{?s} that need imputing."
+ )
+ ))
+ }
+
+ ref_scores <- ref[,
+ c(
+ target_cols,
+ metrics[metrics %in% names(ref)]
+ ),
+ with = FALSE
+ ]
+ missing_rows <- merge(
+ missing_rows[,
+ setdiff(names(missing_rows), metrics),
+ with = FALSE
+ ],
+ ref_scores,
+ by = target_cols,
+ all.x = TRUE
+ )
+ return(missing_rows)
+ }
+}
diff --git a/R/missing-scores-internal.R b/R/missing-scores-internal.R
new file mode 100644
index 000000000..ab54e1210
--- /dev/null
+++ b/R/missing-scores-internal.R
@@ -0,0 +1,88 @@
+#' Assert that a strategy has the expected signature
+#'
+#' @description
+#' Internal helper used by [filter_scores()] and
+#' [impute_missing_scores()] to check that a user-supplied
+#' strategy function has at least the required named formals.
+#' This catches common mistakes early (e.g. forgetting the
+#' `compare` argument) without constraining the strategy
+#' author to a specific internal type.
+#'
+#' @param strategy A function.
+#' @param required Character vector of formal names that
+#' `strategy` must accept.
+#'
+#' @return `invisible(NULL)`. Called for its side effect of
+#' erroring when the check fails.
+#'
+#' @importFrom checkmate assert_function
+#' @importFrom cli cli_abort
+#' @keywords internal
+assert_strategy <- function(strategy, required) {
+ assert_function(strategy)
+ strategy_formals <- names(formals(strategy))
+ missing_args <- setdiff(required, strategy_formals)
+ if (length(missing_args) > 0) {
+ cli_abort(c(
+ "!" = paste0(
+ "Strategy function is missing required ",
+ "argument{?s}: {.arg {missing_args}}."
+ ),
+ i = paste0(
+ "Expected formals including: ",
+ "{.arg {required}}."
+ )
+ ))
+ }
+ return(invisible(NULL))
+}
+
+
+#' Build grid of missing model-target combinations
+#'
+#' @description
+#' Internal function that detects missing model x target
+#' combinations by comparing observed data against the complete
+#' grid of all compare-column values crossed with all observed
+#' target combinations.
+#'
+#' @param scores A `scores` object (data.table with a `metrics`
+#' attribute as produced by [score()]).
+#' @param compare Character string (default `"model"`) naming the
+#' column whose values are compared against each target.
+#'
+#' @return A `data.table` with forecast-unit columns for each
+#' missing combination. Zero rows if nothing is missing.
+#'
+#' @importFrom data.table as.data.table setkeyv rbindlist copy
+#' @keywords internal
+build_missing_grid <- function(scores, compare = "model") {
+ scores <- data.table::as.data.table(scores)
+ forecast_unit <- get_forecast_unit(scores)
+ target_cols <- setdiff(forecast_unit, compare)
+
+ targets <- unique(scores[, target_cols, with = FALSE])
+
+ # All unique compare values
+ compare_vals <- unique(scores[[compare]])
+
+ # Complete grid: every compare value x every observed target
+ complete <- data.table::rbindlist(lapply(
+ compare_vals,
+ function(val) {
+ out <- data.table::copy(targets)
+ out[, (compare) := val]
+ out
+ }
+ ))
+
+ # Observed combinations (forecast unit cols only)
+ observed <- unique(scores[, forecast_unit, with = FALSE])
+
+ # Anti-join: rows in complete but not in observed
+ data.table::setkeyv(complete, forecast_unit)
+ data.table::setkeyv(observed, forecast_unit)
+ missing <- complete[!observed]
+
+ return(missing)
+}
diff --git a/R/z-globalVariables.R b/R/z-globalVariables.R
index 881a69a19..e9f2ffeca 100644
--- a/R/z-globalVariables.R
+++ b/R/z-globalVariables.R
@@ -11,6 +11,7 @@ globalVariables(c(
".BY",
".GRP",
".SD",
+ ".imputed",
".mv_group_id",
".scoringutils_N",
".scoringutils_count",
@@ -51,6 +52,7 @@ globalVariables(c(
"metrics",
"metrics_select",
"model",
+ "n_models",
"n_obs",
"n_obs wis_component_name",
"observed",
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 34853a432..91c4f6038 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -50,6 +50,9 @@ reference:
- title: Handling metrics/scoring functions
contents:
- has_keyword("handle-metrics")
+ - title: Post-process scores
+ contents:
+ - has_keyword("postprocess-scores")
- title: Evaluate forecasts
contents:
- has_keyword("scoring")
diff --git a/man/assert_strategy.Rd b/man/assert_strategy.Rd
new file mode 100644
index 000000000..389ff57b9
--- /dev/null
+++ b/man/assert_strategy.Rd
@@ -0,0 +1,27 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/missing-scores-internal.R
+\name{assert_strategy}
+\alias{assert_strategy}
+\title{Assert that a strategy has the expected signature}
+\usage{
+assert_strategy(strategy, required)
+}
+\arguments{
+\item{strategy}{A function.}
+
+\item{required}{Character vector of formal names that
+\code{strategy} must accept.}
+}
+\value{
+\code{invisible(NULL)}. Called for its side effect of
+erroring when the check fails.
+}
+\description{
+Internal helper used by \code{\link[=filter_scores]{filter_scores()}} and
+\code{\link[=impute_missing_scores]{impute_missing_scores()}} to check that a user-supplied
+strategy function has at least the required named formals.
+This catches common mistakes early (e.g. forgetting the
+\code{compare} argument) without constraining the strategy
+author to a specific internal type.
+}
+\keyword{internal}
diff --git a/man/build_missing_grid.Rd b/man/build_missing_grid.Rd
new file mode 100644
index 000000000..f0e770800
--- /dev/null
+++ b/man/build_missing_grid.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/missing-scores-internal.R
+\name{build_missing_grid}
+\alias{build_missing_grid}
+\title{Build grid of missing model-target combinations}
+\usage{
+build_missing_grid(scores, compare = "model")
+}
+\arguments{
+\item{scores}{A \code{scores} object (data.table with a \code{metrics}
+attribute as produced by \code{\link[=score]{score()}}).}
+
+\item{compare}{Character string (default \code{"model"}) naming the
+column whose values are compared against each target.}
+}
+\value{
+A \code{data.table} with forecast-unit columns for each
+missing combination. Zero rows if nothing is missing.
+}
+\description{
+Internal function that detects missing model x target
+combinations by comparing observed data against the complete
+grid of all compare-column values crossed with all observed
+target combinations.
+}
+\keyword{internal}
diff --git a/man/filter_scores.Rd b/man/filter_scores.Rd
new file mode 100644
index 000000000..76b32b474
--- /dev/null
+++ b/man/filter_scores.Rd
@@ -0,0 +1,64 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filter-scores.R
+\name{filter_scores}
+\alias{filter_scores}
+\title{Filter scores}
+\usage{
+filter_scores(scores, strategy = filter_to_intersection(), compare = "model")
+}
+\arguments{
+\item{scores}{An object of class \code{scores} (a data.table with
+an additional \code{metrics} attribute as produced by \code{\link[=score]{score()}}).}
+
+\item{strategy}{A strategy function. See Description for the
+expected signature. Default: \code{\link[=filter_to_intersection]{filter_to_intersection()}}.}
+
+\item{compare}{Character string (default \code{"model"}) naming the
+column whose values are compared when deciding which
+target combinations to keep.}
+}
+\value{
+A \code{scores} object with the same class and \code{metrics}
+attribute as the input, with rows filtered according to
+\code{strategy}.
+}
+\description{
+Filter a \code{scores} object using a supplied strategy function.
+\code{filter_scores()} is responsible for preserving the \code{scores}
+class and the \code{metrics} attribute; the strategy is
+responsible only for the filtering logic.
+
+Strategies are constructed by helpers such as
+\code{\link[=filter_to_intersection]{filter_to_intersection()}} and \code{\link[=filter_to_include]{filter_to_include()}} and can
+also be user-defined. A strategy is a function with
+signature \verb{function(scores, compare)} that returns a
+filtered data.table with the same columns as its input.
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2)
+}
+scores <- example_quantile |>
+ as_forecast_quantile() |>
+ score()
+
+# Keep only targets covered by every model (the default)
+filter_scores(scores)
+
+# Keep targets covered by at least 75\% of models
+filter_scores(
+ scores,
+ strategy = filter_to_intersection(min_coverage = 0.75)
+)
+
+# Keep only targets covered by a named model
+filter_scores(
+ scores,
+ strategy = filter_to_include("EuroCOVIDhub-baseline")
+)
+}
+\seealso{
+\code{\link[=filter_to_intersection]{filter_to_intersection()}}, \code{\link[=filter_to_include]{filter_to_include()}},
+\code{vignette("handling-missing-forecasts")}
+}
+\keyword{postprocess-scores}
diff --git a/man/filter_to_include.Rd b/man/filter_to_include.Rd
new file mode 100644
index 000000000..983129d9a
--- /dev/null
+++ b/man/filter_to_include.Rd
@@ -0,0 +1,45 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filter-scores.R
+\name{filter_to_include}
+\alias{filter_to_include}
+\title{Filter to targets covered by named compare values}
+\usage{
+filter_to_include(include)
+}
+\arguments{
+\item{include}{Character vector of length one or more. Values
+from the \code{compare} column whose target sets should be
+intersected.}
+}
+\value{
+A strategy function for \code{\link[=filter_scores]{filter_scores()}}. Intended
+to be passed to \code{filter_scores()} rather than called
+directly — \code{filter_scores()} is where the \code{scores} class
+and \code{metrics} attribute are preserved.
+}
+\description{
+Strategy for \code{\link[=filter_scores]{filter_scores()}} that restricts the kept
+target combinations to those covered by every value listed
+in \code{include}. With a single value this keeps only that
+value's targets; with several values, the intersection of
+their target sets is kept.
+
+To use a proportion-based threshold over all compare values
+instead, use \code{\link[=filter_to_intersection]{filter_to_intersection()}}.
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2)
+}
+scores <- example_quantile |>
+ as_forecast_quantile() |>
+ score()
+filter_scores(
+ scores,
+ strategy = filter_to_include("EuroCOVIDhub-baseline")
+)
+}
+\seealso{
+\code{\link[=filter_scores]{filter_scores()}}, \code{\link[=filter_to_intersection]{filter_to_intersection()}}
+}
+\keyword{postprocess-scores}
diff --git a/man/filter_to_intersection.Rd b/man/filter_to_intersection.Rd
new file mode 100644
index 000000000..5f27eaf2c
--- /dev/null
+++ b/man/filter_to_intersection.Rd
@@ -0,0 +1,46 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/filter-scores.R
+\name{filter_to_intersection}
+\alias{filter_to_intersection}
+\title{Filter to target combinations meeting a coverage threshold}
+\usage{
+filter_to_intersection(min_coverage = 1)
+}
+\arguments{
+\item{min_coverage}{Numeric between 0 and 1 (default \code{1}).
+Minimum proportion of compare values that must cover a
+target combination for it to be kept.}
+}
+\value{
+A strategy function for \code{\link[=filter_scores]{filter_scores()}}. Intended
+to be passed to \code{filter_scores()} rather than called
+directly — \code{filter_scores()} is where the \code{scores} class
+and \code{metrics} attribute are preserved.
+}
+\description{
+Strategy for \code{\link[=filter_scores]{filter_scores()}} that keeps target combinations
+covered by at least \code{min_coverage} of the values in the
+\code{compare} column. With the default \code{min_coverage = 1}, only
+target combinations present for every compare value are kept
+(strict intersection across the full set).
+
+To restrict to the targets covered by a named subset of
+compare values instead of by a proportion, use
+\code{\link[=filter_to_include]{filter_to_include()}}.
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2)
+}
+scores <- example_quantile |>
+ as_forecast_quantile() |>
+ score()
+filter_scores(
+ scores,
+ strategy = filter_to_intersection(min_coverage = 0.75)
+)
+}
+\seealso{
+\code{\link[=filter_scores]{filter_scores()}}, \code{\link[=filter_to_include]{filter_to_include()}}
+}
+\keyword{postprocess-scores}
diff --git a/man/get_forecast_type_ids.Rd b/man/get_forecast_type_ids.Rd
index 5bd5b81d7..d624b017b 100644
--- a/man/get_forecast_type_ids.Rd
+++ b/man/get_forecast_type_ids.Rd
@@ -26,7 +26,7 @@ get_forecast_type_ids(data)
}
\arguments{
\item{data}{A data.frame (or similar) with predicted and observed values.
-See the details section of for additional information
+See the "Target format" section in Details for additional information
on the required input format.}
}
\value{
diff --git a/man/impute_mean_score.Rd b/man/impute_mean_score.Rd
new file mode 100644
index 000000000..55b32441f
--- /dev/null
+++ b/man/impute_mean_score.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/impute-missing-scores.R
+\name{impute_mean_score}
+\alias{impute_mean_score}
+\title{Impute with mean observed score}
+\usage{
+impute_mean_score()
+}
+\value{
+A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}.
+}
+\description{
+Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each
+missing metric with the mean observed value for that metric
+within the same target combination across all values of the
+\code{compare} column. Target combinations with no non-NA
+observations are filled with \code{NA_real_}.
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2)
+}
+scores <- example_quantile |>
+ as_forecast_quantile() |>
+ score()
+
+impute_missing_scores(scores, strategy = impute_mean_score())
+}
+\seealso{
+\code{\link[=impute_missing_scores]{impute_missing_scores()}}, \code{\link[=impute_worst_score]{impute_worst_score()}}
+}
+\keyword{postprocess-scores}
diff --git a/man/impute_missing_scores.Rd b/man/impute_missing_scores.Rd
new file mode 100644
index 000000000..16c6902f4
--- /dev/null
+++ b/man/impute_missing_scores.Rd
@@ -0,0 +1,60 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/impute-missing-scores.R
+\name{impute_missing_scores}
+\alias{impute_missing_scores}
+\title{Impute missing scores}
+\usage{
+impute_missing_scores(scores, strategy, compare = "model")
+}
+\arguments{
+\item{scores}{An object of class \code{scores} (a data.table with
+an additional \code{metrics} attribute as produced by \code{\link[=score]{score()}}).}
+
+\item{strategy}{A strategy function with signature
+\verb{function(scores, missing_rows, metrics, compare)} that
+returns \code{missing_rows} with the metric columns filled.
+Built-in options are \code{\link[=impute_worst_score]{impute_worst_score()}},
+\code{\link[=impute_mean_score]{impute_mean_score()}}, \code{\link[=impute_na_score]{impute_na_score()}}, and
+\code{\link[=impute_model_score]{impute_model_score()}}. Custom strategies are also
+supported.}
+
+\item{compare}{Character string (default \code{"model"}) naming the
+column whose values are compared against each target to
+identify missing combinations.}
+}
+\value{
+A \code{scores} object with an additional \code{.imputed}
+column. Rows that were imputed have \code{.imputed = TRUE}.
+}
+\description{
+Fills in scores for forecast-target combinations that are
+missing from the data, using a user-specified imputation
+strategy. This is useful to ensure all models are evaluated
+on the same set of targets, which avoids bias when
+summarising scores.
+
+Missing combinations are identified by comparing each value
+of the \code{compare} column against the union of targets observed
+across all values. The strategy is then called to fill the
+metric columns for those rows.
+
+An \code{.imputed} column is added to the output indicating which
+rows were imputed (\code{TRUE}) and which are original (\code{FALSE}).
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2)
+}
+scores <- example_quantile |>
+ as_forecast_quantile() |>
+ score()
+
+# Impute with NA values
+impute_missing_scores(scores, strategy = impute_na_score())
+}
+\seealso{
+\code{\link[=impute_worst_score]{impute_worst_score()}}, \code{\link[=impute_mean_score]{impute_mean_score()}},
+\code{\link[=impute_na_score]{impute_na_score()}}, \code{\link[=impute_model_score]{impute_model_score()}},
+\code{vignette("handling-missing-forecasts")}
+}
+\keyword{postprocess-scores}
diff --git a/man/impute_model_score.Rd b/man/impute_model_score.Rd
new file mode 100644
index 000000000..ca44b1737
--- /dev/null
+++ b/man/impute_model_score.Rd
@@ -0,0 +1,39 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/impute-missing-scores.R
+\name{impute_model_score}
+\alias{impute_model_score}
+\title{Impute with a reference model's scores}
+\usage{
+impute_model_score(model)
+}
+\arguments{
+\item{model}{Character string naming the reference model
+whose scores should be used for imputation. The reference
+model must have scores for all target combinations that
+need imputing; otherwise an error is raised.}
+}
+\value{
+A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}.
+}
+\description{
+Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills missing
+scores with the scores of a specified reference model for
+the same target combination.
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2)
+}
+scores <- example_quantile |>
+ as_forecast_quantile() |>
+ score()
+
+impute_missing_scores(
+ scores,
+ strategy = impute_model_score("EuroCOVIDhub-baseline")
+)
+}
+\seealso{
+\code{\link[=impute_missing_scores]{impute_missing_scores()}}
+}
+\keyword{postprocess-scores}
diff --git a/man/impute_na_score.Rd b/man/impute_na_score.Rd
new file mode 100644
index 000000000..ad676e703
--- /dev/null
+++ b/man/impute_na_score.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/impute-missing-scores.R
+\name{impute_na_score}
+\alias{impute_na_score}
+\title{Impute with NA values}
+\usage{
+impute_na_score()
+}
+\value{
+A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}.
+}
+\description{
+Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each
+missing metric with \code{NA_real_}.
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2)
+}
+scores <- example_quantile |>
+ as_forecast_quantile() |>
+ score()
+
+impute_missing_scores(scores, strategy = impute_na_score())
+}
+\seealso{
+\code{\link[=impute_missing_scores]{impute_missing_scores()}}
+}
+\keyword{postprocess-scores}
diff --git a/man/impute_worst_score.Rd b/man/impute_worst_score.Rd
new file mode 100644
index 000000000..0868cf041
--- /dev/null
+++ b/man/impute_worst_score.Rd
@@ -0,0 +1,32 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/impute-missing-scores.R
+\name{impute_worst_score}
+\alias{impute_worst_score}
+\title{Impute with worst (maximum) observed score}
+\usage{
+impute_worst_score()
+}
+\value{
+A strategy function for \code{\link[=impute_missing_scores]{impute_missing_scores()}}.
+}
+\description{
+Strategy for \code{\link[=impute_missing_scores]{impute_missing_scores()}} that fills each
+missing metric with the worst (maximum) observed value for
+that metric within the same target combination across all
+values of the \code{compare} column. Target combinations with no
+non-NA observations are filled with \code{NA_real_}.
+}
+\examples{
+\dontshow{
+ data.table::setDTthreads(2)
+}
+scores <- example_quantile |>
+ as_forecast_quantile() |>
+ score()
+
+impute_missing_scores(scores, strategy = impute_worst_score())
+}
+\seealso{
+\code{\link[=impute_missing_scores]{impute_missing_scores()}}, \code{\link[=impute_mean_score]{impute_mean_score()}}
+}
+\keyword{postprocess-scores}
diff --git a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg
index b22701e63..a8d99587f 100644
--- a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg
+++ b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison-pval.svg
@@ -29,8 +29,8 @@
-
+
@@ -38,8 +38,8 @@
< 0.001
< 0.001
1
-< 0.001
0.298
+< 0.001
1
0.298
< 0.001
@@ -57,9 +57,9 @@
-
+
@@ -73,9 +73,9 @@
< 0.001
< 0.001
1
-< 0.001
< 0.001
< 0.001
+< 0.001
1
0.007
< 0.001
diff --git a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg
index 372e416d9..aa58d5f93 100644
--- a/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg
+++ b/tests/testthat/_snaps/pairwise_comparison/plot-pairwise-comparison.svg
@@ -29,8 +29,8 @@
-
+
@@ -38,8 +38,8 @@
1.37
1.59
1
-0.63
0.86
+0.63
1
1.16
0.73
@@ -57,9 +57,9 @@
-
+
@@ -73,9 +73,9 @@
3.03
3.85
1
-0.26
0.62
0.79
+0.26
1
0.74
1.27
diff --git a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg
index 74ca3f682..fb64844df 100644
--- a/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg
+++ b/tests/testthat/_snaps/plot_heatmap/plot-heatmap.svg
@@ -26,18 +26,18 @@
-
+
-
+
--0.06
0.1
+-0.06
-0.08
-0.07
0.34
+0.07
-0.02
-0.01
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg
index aae13f7d0..00103e3ab 100644
--- a/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg
+++ b/tests/testthat/_snaps/plot_wis/plot-wis-flip.svg
@@ -26,14 +26,14 @@
-
+
-
+
-
+
@@ -44,16 +44,16 @@
-
+
-
+
-
+
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg
index 2c21e6aca..a271f2373 100644
--- a/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg
+++ b/tests/testthat/_snaps/plot_wis/plot-wis-no-relative.svg
@@ -26,14 +26,14 @@
-
+
-
+
-
+
@@ -44,16 +44,16 @@
-
+
-
+
-
+
diff --git a/tests/testthat/_snaps/plot_wis/plot-wis.svg b/tests/testthat/_snaps/plot_wis/plot-wis.svg
index 6d842785e..c03f3893d 100644
--- a/tests/testthat/_snaps/plot_wis/plot-wis.svg
+++ b/tests/testthat/_snaps/plot_wis/plot-wis.svg
@@ -26,14 +26,14 @@
-
+
-
+
-
+
@@ -44,16 +44,16 @@
-
+
-
+
-
+
diff --git a/tests/testthat/test-filter-scores.R b/tests/testthat/test-filter-scores.R
new file mode 100644
index 000000000..54015a291
--- /dev/null
+++ b/tests/testthat/test-filter-scores.R
@@ -0,0 +1,333 @@
+# ==============================================================================
+# build_missing_grid()
+# ==============================================================================
+test_that("build_missing_grid() returns correct missing rows", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ missing <- build_missing_grid(scores, compare = "model")
+ expect_s3_class(missing, "data.table")
+ expect_equal(nrow(missing), 1)
+ expect_equal(missing$model, "B")
+ expect_equal(missing$location, "US")
+})
+
+test_that("build_missing_grid() returns zero rows when nothing missing", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B", "B"),
+ location = c("DE", "US", "DE", "US"),
+ wis = c(1, 2, 3, 4)
+ )
+ scores <- new_scores(scores, "wis")
+ missing <- build_missing_grid(scores, compare = "model")
+ expect_equal(nrow(missing), 0)
+})
+
+test_that("build_missing_grid() uses observed target combos", {
+ # If we had locations DE, US and dates Mon, Tue but only
+ # (DE, Mon) and (US, Tue) are observed, the grid should NOT
+ # include (DE, Tue) or (US, Mon)
+ scores <- data.table::data.table(
+ model = c("A", "A", "B", "B"),
+ location = c("DE", "US", "DE", "US"),
+ date = c("Mon", "Tue", "Mon", "Tue"),
+ wis = c(1, 2, 3, 4)
+ )
+ scores <- new_scores(scores, "wis")
+ missing <- build_missing_grid(scores, compare = "model")
+ expect_equal(nrow(missing), 0)
+
+ # Now remove one combo - B should be missing (US, Tue)
+ scores2 <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ date = c("Mon", "Tue", "Mon"),
+ wis = c(1, 2, 3)
+ )
+ scores2 <- new_scores(scores2, "wis")
+ missing2 <- build_missing_grid(scores2, compare = "model")
+ expect_equal(nrow(missing2), 1)
+ expect_equal(missing2$model, "B")
+ expect_equal(missing2$location, "US")
+ expect_equal(missing2$date, "Tue")
+})
+
+
+# ==============================================================================
+# filter_scores()
+# ==============================================================================
+test_that("filter_scores() drops incomplete targets", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ result <- suppressMessages(filter_scores(scores))
+ # Only DE should remain (both include have it)
+ expect_equal(nrow(result), 2)
+ expect_true(all(result$location == "DE"))
+})
+
+test_that("filter_scores() preserves class and metrics", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ result <- suppressMessages(filter_scores(scores))
+ expect_s3_class(result, "scores")
+ expect_equal(attr(result, "metrics"), "wis")
+})
+
+test_that("filter_scores() unchanged when nothing missing", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B", "B"),
+ location = c("DE", "US", "DE", "US"),
+ wis = c(1, 2, 3, 4)
+ )
+ scores <- new_scores(scores, "wis")
+ expect_message(
+ result <- filter_scores(scores),
+ "No rows filtered"
+ )
+ expect_equal(nrow(result), 4)
+})
+
+test_that("filter_scores() reports rows dropped", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ expect_message(
+ filter_scores(scores),
+ "Filtered out 1 row"
+ )
+})
+
+test_that("filter_scores() errors on strategy with wrong formals", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ bad_strategy <- function(scores) scores
+ expect_error(
+ filter_scores(scores, strategy = bad_strategy),
+ "missing required"
+ )
+})
+
+test_that("filter_scores() errors on invalid compare column", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ expect_error(
+ filter_scores(scores, compare = "nonexistent"),
+ "nonexistent"
+ )
+})
+
+
+# ==============================================================================
+# filter_to_intersection()
+# ==============================================================================
+test_that("filter_to_intersection(min_coverage=0.5) works", {
+ scores <- data.table::data.table(
+ model = c("A", "A", "A", "B", "C"),
+ location = c("DE", "US", "FR", "DE", "DE"),
+ wis = c(1, 2, 3, 4, 5)
+ )
+ scores <- new_scores(scores, "wis")
+ strategy <- filter_to_intersection(min_coverage = 0.5)
+ result <- strategy(scores, compare = "model")
+ # DE covered by 3/3, US by 1/3, FR by 1/3
+ # At min_coverage = 0.5, only DE qualifies
+ expect_true(all(result$location == "DE"))
+
+ strategy2 <- filter_to_intersection(min_coverage = 1 / 3)
+ result2 <- strategy2(scores, compare = "model")
+ # All locations have coverage >= 1/3
+ expect_equal(nrow(result2), 5)
+})
+
+test_that("filter_to_include() keeps targets of single model", {
+ scores <- data.table::data.table(
+ model = c("m1", "m1", "m2", "m2", "m3"),
+ location = c("DE", "US", "DE", "FR", "DE"),
+ wis = c(1, 2, 3, 4, 5)
+ )
+ scores <- new_scores(scores, "wis")
+ strategy <- filter_to_include("m1")
+ result <- strategy(scores, compare = "model")
+ # m1 covers DE and US, so keep all rows with DE or US
+ expect_true(all(result$location %in% c("DE", "US")))
+ # FR should be dropped
+ expect_false("FR" %in% result$location)
+})
+
+test_that("filter_to_include() intersects multiple models", {
+ scores <- data.table::data.table(
+ model = c("m1", "m1", "m2", "m2", "m3"),
+ location = c("DE", "US", "DE", "FR", "DE"),
+ wis = c(1, 2, 3, 4, 5)
+ )
+ scores <- new_scores(scores, "wis")
+ strategy <- filter_to_include(c("m1", "m2"))
+ result <- strategy(scores, compare = "model")
+ # m1 covers DE, US; m2 covers DE, FR; intersection = DE
+ expect_true(all(result$location == "DE"))
+})
+
+test_that("filter_to_include() errors on unknown compare value", {
+ scores <- data.table::data.table(
+ model = c("A", "B"),
+ location = c("DE", "DE"),
+ wis = c(1, 2)
+ )
+ scores <- new_scores(scores, "wis")
+ expect_error(
+ filter_scores(
+ scores,
+ strategy = filter_to_include("Z")
+ ),
+ "not found"
+ )
+})
+
+test_that("filter_scores() works with non-default compare", {
+ scores <- data.table::data.table(
+ forecaster = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ result <- suppressMessages(filter_scores(
+ scores, compare = "forecaster"
+ ))
+ expect_true(all(result$location == "DE"))
+ expect_equal(nrow(result), 2)
+})
+
+
+# ==============================================================================
+# Integration tests with scores_quantile
+# ==============================================================================
+test_that(
+ "filter_to_include() with real data filters
+ to model's targets",
+ {
+ scores <- scores_quantile
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, "model")
+
+ # EpiNow2 is missing 9 death targets out of 256
+ epinow2_targets <- unique(
+ scores[
+ model == "epiforecasts-EpiNow2",
+ target_cols,
+ with = FALSE
+ ]
+ )
+ n_epinow2 <- nrow(epinow2_targets)
+
+ result <- suppressMessages(filter_scores(
+ scores,
+ strategy = filter_to_include(
+ "epiforecasts-EpiNow2"
+ )
+ ))
+
+ # All remaining targets should be EpiNow2's targets
+ result_targets <- unique(
+ result[, target_cols, with = FALSE]
+ )
+ expect_equal(nrow(result_targets), n_epinow2)
+
+ # The 9 death targets EpiNow2 doesn't cover
+ # should have been dropped
+ all_targets <- unique(
+ scores[, target_cols, with = FALSE]
+ )
+ n_dropped_targets <- nrow(all_targets) - n_epinow2
+ expect_equal(n_dropped_targets, 9)
+
+ # Every model in the result should only have
+ # targets that EpiNow2 covers
+ data.table::setkeyv(result_targets, target_cols)
+ data.table::setkeyv(epinow2_targets, target_cols)
+ expect_equal(result_targets, epinow2_targets)
+ }
+)
+
+test_that(
+ "filter_to_intersection(min_coverage) boundary with
+ real data",
+ {
+ scores <- scores_quantile
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, "model")
+
+ # 4 models total. UMass-MechBayes has no case targets
+ # (128/256), so case targets are covered by 3/4 = 0.75.
+ # At min_coverage = 0.75 case targets should be kept.
+ result_relaxed <- suppressMessages(filter_scores(
+ scores,
+ strategy = filter_to_intersection(
+ min_coverage = 0.75
+ )
+ ))
+ relaxed_types <- unique(result_relaxed$target_type)
+ expect_true("Cases" %in% relaxed_types)
+ expect_true("Deaths" %in% relaxed_types)
+
+ # At min_coverage = 1.0 (default), case targets
+ # should be dropped because UMass-MechBayes lacks them.
+ result_strict <- suppressMessages(filter_scores(scores))
+ strict_types <- unique(result_strict$target_type)
+ expect_false("Cases" %in% strict_types)
+ expect_true("Deaths" %in% strict_types)
+
+ # Relaxed should have strictly more rows
+ expect_gt(nrow(result_relaxed), nrow(result_strict))
+ }
+)
+
+test_that(
+ "filter_scores then summarise_scores gives equal target
+ counts per model",
+ {
+ scores <- scores_quantile
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, "model")
+
+ filtered <- suppressMessages(filter_scores(scores))
+
+ # Count distinct targets per model
+ targets_per_model <- filtered[,
+ .(n_targets = data.table::uniqueN(
+ .SD[, target_cols, with = FALSE]
+ )),
+ by = "model"
+ ]
+ # All models should have the same number of targets
+ expect_length(unique(targets_per_model$n_targets), 1)
+
+ # Summarise should work and give a row per model
+ summary <- summarise_scores(filtered, by = "model")
+ expect_equal(
+ nrow(summary),
+ length(unique(filtered$model))
+ )
+ }
+)
diff --git a/tests/testthat/test-get-protected-columns.R b/tests/testthat/test-get-protected-columns.R
index 896dfa6f9..729a5b54b 100644
--- a/tests/testthat/test-get-protected-columns.R
+++ b/tests/testthat/test-get-protected-columns.R
@@ -6,9 +6,11 @@ test_that("get_protected_columns() works as expected", {
get_protected_columns(),
c(
".mv_group_id",
+ ".imputed",
"predicted", "observed", "sample_id",
"quantile_level", "upper", "lower", "pit_value",
- "interval_range", "boundary", "predicted_label", "interval_coverage",
+ "interval_range", "boundary", "predicted_label",
+ "interval_coverage",
"interval_coverage_deviation", "quantile_coverage",
"quantile_coverage_deviation"
)
diff --git a/tests/testthat/test-impute-missing-scores.R b/tests/testthat/test-impute-missing-scores.R
new file mode 100644
index 000000000..c0327bc6d
--- /dev/null
+++ b/tests/testthat/test-impute-missing-scores.R
@@ -0,0 +1,547 @@
+# ==============================================================================
+# impute_missing_scores()
+# ==============================================================================
+test_that(
+ "impute_missing_scores adds .imputed = FALSE when nothing missing",
+ {
+ # Use only models with complete targets
+ scores <- scores_quantile[
+ model %in% c(
+ "EuroCOVIDhub-ensemble",
+ "EuroCOVIDhub-baseline"
+ )
+ ]
+ scores <- new_scores(
+ scores, get_metrics.scores(scores_quantile)
+ )
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_na_score()
+ ))
+ expect_true(".imputed" %in% names(result))
+ expect_false(any(result$.imputed))
+ }
+)
+
+test_that(
+ "impute_missing_scores preserves scores class and metrics",
+ {
+ scores <- scores_quantile
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_na_score()
+ ))
+ expect_s3_class(result, "scores")
+ expect_identical(
+ get_metrics.scores(result),
+ get_metrics.scores(scores)
+ )
+ }
+)
+
+test_that(".imputed is not in get_metrics.scores output", {
+ scores <- scores_quantile
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_na_score()
+ ))
+ metrics <- get_metrics.scores(result)
+ expect_false(".imputed" %in% metrics)
+})
+
+test_that(".imputed is not in get_forecast_unit output", {
+ # Requires .imputed in get_protected_columns()
+ # (added by parallel agent)
+ skip_if_not(
+ ".imputed" %in% get_protected_columns(),
+ ".imputed not yet in get_protected_columns"
+ )
+ scores <- scores_quantile
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_na_score()
+ ))
+ fu <- get_forecast_unit(result)
+ expect_false(".imputed" %in% fu)
+})
+
+# ==============================================================================
+# Strategy factories with missing data
+# ==============================================================================
+
+# scores_quantile already has missing combinations:
+# UMass-MechBayes has 128 of 256 targets,
+# epiforecasts-EpiNow2 has 247 of 256.
+# Use it directly for tests needing missing data.
+
+test_that("impute_worst_score fills with max observed score", {
+ skip_if_not(
+ exists("build_missing_grid",
+ where = asNamespace("scoringutils")),
+ "build_missing_grid not yet available"
+ )
+ scores <- scores_quantile
+ metrics <- get_metrics.scores(scores)
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_worst_score()
+ ))
+ # Imputed rows should exist
+
+ imputed <- result[(.imputed)]
+ if (nrow(imputed) > 0) {
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, "model")
+
+ # Each imputed metric value should equal the max of
+ # that metric within the same target combination
+ for (m in metrics) {
+ if (!(m %in% names(imputed)) ||
+ !(m %in% names(scores))) next
+ max_per_target <- scores[,
+ .(..max = max(get(m), na.rm = TRUE)),
+ by = target_cols
+ ]
+ merged <- merge(
+ imputed, max_per_target,
+ by = target_cols, all.x = TRUE
+ )
+ expect_true(
+ all(
+ merged[[m]] == merged[["..max"]] |
+ is.na(merged[[m]])
+ ),
+ info = paste(
+ "metric", m,
+ "does not match per-target max"
+ )
+ )
+ }
+ }
+})
+
+test_that("impute_mean_score fills with mean observed score", {
+ skip_if_not(
+ exists("build_missing_grid",
+ where = asNamespace("scoringutils")),
+ "build_missing_grid not yet available"
+ )
+ scores <- scores_quantile
+ metrics <- get_metrics.scores(scores)
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_mean_score()
+ ))
+ imputed <- result[(.imputed)]
+ expect_gt(nrow(imputed), 0)
+
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, "model")
+
+ for (m in metrics) {
+ if (!(m %in% names(imputed)) ||
+ !(m %in% names(scores))) next
+ mean_per_target <- scores[,
+ .(..mean = mean(get(m), na.rm = TRUE)),
+ by = target_cols
+ ]
+ merged <- merge(
+ imputed, mean_per_target,
+ by = target_cols, all.x = TRUE
+ )
+ expect_true(
+ all(
+ abs(merged[[m]] - merged[["..mean"]]) < 1e-10 |
+ is.na(merged[[m]])
+ ),
+ info = paste(
+ "metric", m,
+ "does not match per-target mean"
+ )
+ )
+ }
+})
+
+test_that("impute_na_score fills with NA_real_", {
+ skip_if_not(
+ exists("build_missing_grid",
+ where = asNamespace("scoringutils")),
+ "build_missing_grid not yet available"
+ )
+ scores <- scores_quantile
+ metrics <- get_metrics.scores(scores)
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_na_score()
+ ))
+ imputed <- result[(.imputed)]
+ if (nrow(imputed) > 0) {
+ for (m in metrics) {
+ if (m %in% names(imputed)) {
+ expect_true(
+ all(is.na(imputed[[m]])),
+ info = paste("metric", m, "should be NA")
+ )
+ }
+ }
+ }
+})
+
+test_that(
+ "impute_model_score fills with reference model scores",
+ {
+ skip_if_not(
+ exists("build_missing_grid",
+ where = asNamespace("scoringutils")),
+ "build_missing_grid not yet available"
+ )
+ scores <- scores_quantile
+ # EuroCOVIDhub-baseline has all 256 targets so can
+ # serve as reference for all missing combinations
+ result <- suppressMessages(impute_missing_scores(
+ scores,
+ strategy = impute_model_score(
+ "EuroCOVIDhub-baseline"
+ )
+ ))
+ expect_s3_class(result, "scores")
+ }
+)
+
+test_that(
+ "impute_model_score errors when ref model missing target",
+ {
+ skip_if_not(
+ exists("build_missing_grid",
+ where = asNamespace("scoringutils")),
+ "build_missing_grid not yet available"
+ )
+ scores <- scores_quantile
+ # UMass-MechBayes only has 128 of 256 targets, so
+ # it cannot serve as reference for imputing all missing
+ # target combinations
+ expect_error(
+ impute_missing_scores(
+ scores,
+ strategy = impute_model_score("UMass-MechBayes")
+ )
+ )
+ }
+)
+
+test_that("custom strategy function works", {
+ skip_if_not(
+ exists("build_missing_grid",
+ where = asNamespace("scoringutils")),
+ "build_missing_grid not yet available"
+ )
+ custom_strategy <- function(scores, missing_rows, metrics,
+ compare) {
+ for (m in metrics) {
+ data.table::set(missing_rows, j = m, value = 999)
+ }
+ return(missing_rows)
+ }
+ scores <- scores_quantile
+ metrics <- get_metrics.scores(scores)
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = custom_strategy
+ ))
+ imputed <- result[(.imputed)]
+ if (nrow(imputed) > 0) {
+ for (m in metrics) {
+ if (m %in% names(imputed)) {
+ expect_true(all(imputed[[m]] == 999))
+ }
+ }
+ }
+})
+
+# ==============================================================================
+# Integration tests
+# ==============================================================================
+test_that(
+ "impute_missing_scores |> summarise_scores works",
+ {
+ skip_if_not(
+ exists("build_missing_grid",
+ where = asNamespace("scoringutils")),
+ "build_missing_grid not yet available"
+ )
+ scores <- scores_quantile
+ imputed <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_na_score()
+ ))
+ result <- summarise_scores(imputed, by = "model")
+ expect_s3_class(result, "data.table")
+ }
+)
+
+test_that(
+ "filter then impute pipeline works",
+ {
+ skip_if_not(
+ exists("build_missing_grid",
+ where = asNamespace("scoringutils")),
+ "build_missing_grid not yet available"
+ )
+ skip_if_not(
+ exists("filter_scores",
+ where = asNamespace("scoringutils")),
+ "filter_scores not yet available"
+ )
+ skip_if_not(
+ exists("filter_to_include",
+ where = asNamespace("scoringutils")),
+ "filter_to_include not yet available"
+ )
+ scores <- scores_quantile
+ ref_model <- "EuroCOVIDhub-baseline"
+
+ filtered <- suppressMessages(filter_scores(
+ scores,
+ strategy = filter_to_include(ref_model)
+ ))
+ result <- suppressMessages(impute_missing_scores(
+ filtered,
+ strategy = impute_model_score(ref_model)
+ ))
+ expect_s3_class(result, "scores")
+ }
+)
+
+test_that(
+ "impute_worst_score skips metrics not in columns",
+ {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ # Attribute claims "wis" and "fake" but "fake" is
+ # not a column
+ scores <- new_scores(scores, c("wis", "fake"))
+ result <- suppressWarnings(suppressMessages(
+ impute_missing_scores(
+ scores, strategy = impute_worst_score()
+ )
+ ))
+ imputed <- result[result$.imputed]
+ expect_equal(nrow(imputed), 1)
+ expect_false("fake" %in% names(imputed))
+ }
+)
+
+test_that(
+ "impute_mean_score skips metrics not in columns",
+ {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, c("wis", "fake"))
+ result <- suppressWarnings(suppressMessages(
+ impute_missing_scores(
+ scores, strategy = impute_mean_score()
+ )
+ ))
+ imputed <- result[result$.imputed]
+ expect_equal(nrow(imputed), 1)
+ expect_false("fake" %in% names(imputed))
+ }
+)
+
+test_that(
+ "impute_model_score errors for nonexistent model",
+ {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ expect_error(
+ impute_missing_scores(
+ scores,
+ strategy = impute_model_score("nonexistent")
+ ),
+ "not found"
+ )
+ }
+)
+
+test_that(
+ "impute_worst_score returns NA when all scores for a target are NA",
+ {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B", "C"),
+ location = c("DE", "US", "DE", "DE"),
+ wis = c(NA_real_, 2, NA_real_, NA_real_)
+ )
+ scores <- new_scores(scores, "wis")
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_worst_score()
+ ))
+ # B and C are missing US; all DE wis are NA so the
+ # imputed DE rows for any model should be NA, not -Inf.
+ imputed <- result[(.imputed) & location == "DE"]
+ expect_true(all(is.na(imputed$wis)))
+ expect_false(any(is.infinite(imputed$wis)))
+ }
+)
+
+test_that(
+ "impute_mean_score returns NA when all scores for a target are NA",
+ {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B", "C"),
+ location = c("DE", "US", "DE", "DE"),
+ wis = c(NA_real_, 2, NA_real_, NA_real_)
+ )
+ scores <- new_scores(scores, "wis")
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_mean_score()
+ ))
+ imputed <- result[(.imputed) & location == "DE"]
+ expect_true(all(is.na(imputed$wis)))
+ expect_false(any(is.nan(imputed$wis)))
+ }
+)
+
+test_that(
+ "impute_missing_scores errors on strategy with wrong formals",
+ {
+ scores <- data.table::data.table(
+ model = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ bad_strategy <- function(scores, missing_rows) missing_rows
+ expect_error(
+ impute_missing_scores(scores, strategy = bad_strategy),
+ "missing required"
+ )
+ }
+)
+
+test_that(
+ "impute_missing_scores works with non-default compare",
+ {
+ scores <- data.table::data.table(
+ forecaster = c("A", "A", "B"),
+ location = c("DE", "US", "DE"),
+ wis = c(1, 2, 3)
+ )
+ scores <- new_scores(scores, "wis")
+ result <- suppressMessages(impute_missing_scores(
+ scores,
+ strategy = impute_worst_score(),
+ compare = "forecaster"
+ ))
+ expect_true(".imputed" %in% names(result))
+ imputed <- result[result$.imputed]
+ expect_equal(nrow(imputed), 1)
+ expect_equal(imputed$forecaster, "B")
+ expect_equal(imputed$location, "US")
+ }
+)
+
+
+# ==============================================================================
+# Additional integration tests
+# ==============================================================================
+test_that(
+ "impute then summarise gives equal row counts per model",
+ {
+ scores <- scores_quantile
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, "model")
+
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_na_score()
+ ))
+
+ # After imputation, every model should have the same
+ # number of rows (one per target combination)
+ rows_per_model <- result[, .N, by = "model"]
+ expect_equal(
+ length(unique(rows_per_model$N)), 1,
+ info = paste(
+ "Expected equal rows per model, got:",
+ paste(
+ rows_per_model$model,
+ rows_per_model$N,
+ sep = "=", collapse = ", "
+ )
+ )
+ )
+ }
+)
+
+test_that(
+ "impute_model_score values match reference model scores",
+ {
+ scores <- scores_quantile
+ metrics <- attr(scores, "metrics")
+ fu <- get_forecast_unit(scores)
+ target_cols <- setdiff(fu, "model")
+ ref_name <- "EuroCOVIDhub-baseline"
+
+ result <- suppressMessages(impute_missing_scores(
+ scores,
+ strategy = impute_model_score(ref_name)
+ ))
+ imputed <- result[(.imputed)]
+ expect_gt(nrow(imputed), 0)
+
+ # Get the reference model's actual scores
+ ref_scores <- scores[model == ref_name]
+
+ # For each imputed row, the metric values should equal
+ # the reference model's scores for the same target
+ merged <- merge(
+ imputed,
+ ref_scores,
+ by = target_cols,
+ suffixes = c(".imputed", ".ref")
+ )
+ expect_equal(nrow(merged), nrow(imputed))
+
+ for (m in metrics) {
+ col_imp <- paste0(m, ".imputed")
+ col_ref <- paste0(m, ".ref")
+ if (col_imp %in% names(merged) &&
+ col_ref %in% names(merged)) {
+ expect_equal(
+ merged[[col_imp]],
+ merged[[col_ref]],
+ info = paste(
+ "Imputed", m,
+ "should match reference model"
+ )
+ )
+ }
+ }
+ }
+)
+
+test_that(
+ "imputation does not alter original score values",
+ {
+ scores <- scores_quantile
+ metrics <- get_metrics.scores(scores)
+ result <- suppressMessages(impute_missing_scores(
+ scores, strategy = impute_worst_score()
+ ))
+
+ originals <- result[!(.imputed)]
+ originals[, .imputed := NULL]
+
+ # Original metric values should be unchanged
+ for (m in metrics) {
+ if (!(m %in% names(originals))) next
+ data.table::setkeyv(originals, names(scores))
+ data.table::setkeyv(scores, names(scores))
+ expect_equal(
+ as.numeric(originals[[m]]),
+ as.numeric(scores[[m]]),
+ info = paste("metric", m, "changed")
+ )
+ }
+ }
+)
diff --git a/vignettes/handling-missing-forecasts.Rmd b/vignettes/handling-missing-forecasts.Rmd
new file mode 100644
index 000000000..5658119a9
--- /dev/null
+++ b/vignettes/handling-missing-forecasts.Rmd
@@ -0,0 +1,202 @@
+---
+title: "Handling missing forecasts"
+output: rmarkdown::html_vignette
+vignette: >
+ %\VignetteIndexEntry{Handling missing forecasts}
+ %\VignetteEngine{knitr::rmarkdown}
+ %\VignetteEncoding{UTF-8}
+---
+
+```{r setup, include = FALSE}
+knitr::opts_chunk$set(
+ collapse = TRUE, comment = "#>"
+)
+data.table::setDTthreads(2)
+```
+
+When comparing forecast models, not all models will have made predictions for every target.
+Naively averaging scores across different sets of targets can produce misleading summaries.
+This vignette walks through diagnosing which targets each model covers, scoring the forecasts, and then adjusting the scores using filtering and imputation so that summaries are comparable.
+
+The approaches here were initially inspired by [Kim, Ray & Reich (2026)](https://doi.org/10.1016/j.ijforecast.2025.12.006), who discuss the importance of handling missing forecasts when evaluating model contributions beyond simple leaderboard rankings.
+
+## Diagnosing missingness
+
+Before adjusting scores we should aim to understand the patterns of potential missingness.
+Models may have different coverage for legitimate reasons (a model may only forecast deaths, not cases) or because of operational failures (a missed submission deadline).
+
+`get_forecast_counts()` tabulates how many forecasts each model has, grouped by the columns you choose.
+
+```{r counts}
+library(scoringutils)
+fc <- as_forecast_quantile(example_quantile)
+get_forecast_counts(fc, by = c("model", "target_type"))
+```
+
+`UMass-MechBayes` does not forecast cases at all, and `epiforecasts-EpiNow2` has fewer death forecasts than the other models.
+
+To see exactly which death targets `epiforecasts-EpiNow2` is missing, we can request counts at a finer level and filter to zero-count rows.
+
+```{r missing-detail}
+death_counts <- get_forecast_counts(
+ fc,
+ by = c("model", "target_type", "location", "target_end_date")
+)
+death_counts[
+ model == "epiforecasts-EpiNow2" &
+ target_type == "Deaths" &
+ count == 0
+]
+```
+
+Note that `get_pairwise_comparisons()` handles missingness internally, restricting each pair of models to their shared set of targets before comparing.
+The functions below give you explicit control over how to handle missingness when computing score summaries.
+
+## Scoring
+
+We score all forecasts first, then adjust the resulting scores table.
+Both `filter_scores()` and `impute_missing_scores()` accept a `compare` argument (default `"model"`) that specifies which column identifies the units being compared.
+
+```{r score}
+scores <- score(fc)
+```
+
+A naive summary averages over different numbers of targets per model, which can make direct comparison misleading.
+
+```{r naive-summary}
+summarise_scores(scores, by = "model")
+```
+
+The sections below show two approaches to addressing this: filtering scores to a common set of targets, and imputing scores for missing targets.
+
+## Filtering to a common set of targets
+
+`filter_scores()` removes scores based on the supplied strategy.
+This can be appropriate when a model legitimately does not cover certain targets and you want to compare only on shared ground.
+
+The default strategy, `filter_to_intersection()`, keeps only targets covered by **all** models.
+
+```{r filter-default}
+scores_filtered <- filter_scores(scores)
+summarise_scores(scores_filtered, by = "model")
+```
+
+This drops all case targets (since `UMass-MechBayes` has no case forecasts) and the death targets `epiforecasts-EpiNow2` missed.
+
+### Requiring partial coverage
+
+The default requires every model to cover a target for it to be kept.
+The `min_coverage` argument relaxes this by keeping targets covered by at least a given proportion of models.
+With four models, `min_coverage = 0.75` requires coverage by at least three.
+
+```{r filter-relaxed}
+scores_relaxed <- filter_scores(
+ scores,
+ strategy = filter_to_intersection(min_coverage = 0.75)
+)
+summarise_scores(scores_relaxed, by = "model")
+```
+
+In this example, case targets are covered by three of four models and death targets by three or four, so `min_coverage = 0.75` retains all targets.
+The default (`min_coverage = 1`) is stricter and drops all case targets because `UMass-MechBayes` has no case forecasts.
+Between 0.75 and 1.0 no intermediate threshold changes the result here because no target is covered by exactly three out of four models while being missing for one that isn't `UMass-MechBayes`.
+
+### Filtering to a specific model's targets
+
+`filter_to_include()` restricts to targets covered by named models.
+For example, to evaluate all models only on the targets `epiforecasts-EpiNow2` covered:
+
+```{r filter-include}
+scores_epinow2 <- filter_scores(
+ scores,
+ strategy = filter_to_include("epiforecasts-EpiNow2")
+)
+summarise_scores(scores_epinow2, by = "model")
+```
+
+This keeps both case and death targets where EpiNow2 submitted forecasts, but `UMass-MechBayes` will still be missing case scores in the result.
+
+## Imputing missing scores
+
+Instead of dropping data, `impute_missing_scores()` fills in scores for target combinations a model did not cover.
+Imputed rows are marked with `.imputed = TRUE` so they can be identified later.
+
+### NA
+
+The simplest option: fill missing scores with `NA`.
+This preserves the structure of the data without making assumptions about what the score would have been.
+`NA` values propagate through summaries, so this can also serve as a diagnostic check to confirm where missingness exists.
+
+```{r impute-na}
+scores_na <- impute_missing_scores(
+ scores,
+ strategy = impute_na_score()
+)
+summarise_scores(scores_na, by = "model")
+```
+
+### Worst score
+
+Fill each missing score with the worst (maximum) observed score for that target across all models.
+This penalises models most heavily for missing targets.
+
+```{r impute-worst}
+scores_worst <- impute_missing_scores(
+ scores,
+ strategy = impute_worst_score()
+)
+summarise_scores(scores_worst, by = "model")
+```
+
+We can check that the imputed rows match the models and targets we identified as missing earlier.
+
+```{r imputed-check}
+scores_worst[
+ (.imputed),
+ .(n_imputed = .N),
+ by = c("model", "target_type")
+]
+```
+
+### Reference model
+
+Fill with the scores of a named baseline model, treating a missing forecast as performing no better than that baseline.
+
+```{r impute-model}
+scores_ref <- impute_missing_scores(
+ scores,
+ strategy = impute_model_score("EuroCOVIDhub-baseline")
+)
+summarise_scores(scores_ref, by = "model")
+```
+
+This is a reasonable default when a suitable baseline exists, though more research is needed on best practice for choosing the reference model and understanding the impact of this choice.
+
+### Mean score
+
+Fill with the mean score across models that did forecast each target.
+This is the least severe penalty as it assigns the average performance, which may be close to the ensemble performance.
+
+```{r impute-mean}
+scores_mean <- impute_missing_scores(
+ scores,
+ strategy = impute_mean_score()
+)
+summarise_scores(scores_mean, by = "model")
+```
+
+## Combining filter and impute
+
+Consider combining filtering and imputation when you want to focus on a specific model's targets but still need complete scores for all models.
+For example, to evaluate on the targets `epiforecasts-EpiNow2` covered and then impute scores for models that are missing forecasts within that set:
+
+```{r pipeline}
+result <- scores |>
+ filter_scores(
+ strategy = filter_to_include("epiforecasts-EpiNow2")
+ ) |>
+ impute_missing_scores(
+ strategy = impute_worst_score()
+ )
+summarise_scores(result, by = "model")
+```