MeteoSwiss · Louis-Frey · May 11, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
diff --git a/config/multipanel_example.yaml b/config/multipanel_example.yaml
@@ -0,0 +1,117 @@
+# yaml-language-server: $schema=../workflow/tools/config.schema.json
+description: |
+  Evaluate skill of Stage E with/without cutoff edges trained with and without subgrid orography.
+
+dates:
+  start: 2025-01-01T06:00
+  end: 2025-12-26T00:00
+  frequency: 30h
+
+runs:
+
+  - forecaster:
+      inference_resources:
+          slurm_partition: normal-shared
+      checkpoint: https://service.meteoswiss.ch/mlstore#/experiments/602/runs/fd63e17043014af59170c7beca516b95
+      label: stage_E_realch1
+      steps: 0/120/6
+      config: resources/inference/configs/sgm-multidataset-forecaster-global-ich1-oper.yaml
+      extra_requirements:
+        - git+https://github.com/ecmwf/anemoi-inference.git@0.10.0
+
+  - forecaster:
+      inference_resources:
+          slurm_partition: normal-shared
+      checkpoint: https://service.meteoswiss.ch/mlstore#/experiments/602/runs/c30490b6ba064e4db03b430f3a2595ad
+      label: stage_E_icon_1km_cutoff_edges_subgrid_orography
+      steps: 0/120/6
+      config: resources/inference/configs/sgm-multidataset-forecaster-global-ich1-oper.yaml
+      extra_requirements:
+        - git+https://github.com/ecmwf/anemoi-inference.git@b9aaee5df86614cad9d8d08b76876a4be4e980db
+
+  # - forecaster:
+  #     inference_resources:
+  #         slurm_partition: normal-shared
+  #     checkpoint: https://service.meteoswiss.ch/mlstore#/experiments/602/runs/57684b20f64f414b937cce10e5ceeb68
+  #     label: stage_E_realch1_new
+  #     steps: 0/120/6
+  #     config: resources/inference/configs/sgm-multidataset-forecaster-global-ich1-oper.yaml
+  #     extra_requirements:
+  #       - git+https://github.com/ecmwf/anemoi-inference.git@b9aaee5df86614cad9d8d08b76876a4be4e980db
+
+  # - forecaster:
+  #     inference_resources:
+  #         slurm_partition: normal-shared
+  #     checkpoint: https://service.meteoswiss.ch/mlstore#/experiments/602/runs/2265ae18b04e4470ab89314a85a822ae
+  #     label: stage_E_icon_1km_cutoff_edges_KNN_5_dec
+  #     steps: 0/120/6
+  #     config: resources/inference/configs/sgm-multidataset-forecaster-global-ich1-oper.yaml
+  #     extra_requirements:
+  #       - git+https://github.com/ecmwf/anemoi-inference.git@b9aaee5df86614cad9d8d08b76876a4be4e980db
+
+
+baselines:
+  - baseline:
+      baseline_id: ICON-CH1-EPS
+      label: ICON-CH1-ctrl
+      root: /scratch/mch/cmerker/ICON-CH1-EPS
+      steps: 0/33/6
+
+  - baseline:
+      baseline_id: ICON-CH2-EPS
+      label: ICON-CH2-ctrl
+      root: /scratch/mch/cmerker/ICON-CH2-EPS
+      steps: 0/120/6
+
+
+truth:
+  label: KENDA-CH1
+  root: /store_new/mch/msopr/ml/datasets/mch-ich1-1km-2024-2025-1h-pl13-v1.0.zarr
+
+stratification:
+  regions:
+    - jura
+  root: /scratch/mch/bhendj/regions/Prognoseregionen_LV95_20220517
+
+dashboard:
+  stratification:
+    - season
+
+locations:
+  output_root: ./output
+
+profile:
+  executor: slurm
+  global_resources:
+    gpus: 16
+  default_resources:
+    slurm_partition: "postproc"
+    cpus_per_task: 1
+    mem_mb_per_cpu: 1800
+    runtime: "1h"
+    gpus: 0
+  jobs: 50
+  batch_rules:
+    plot_frame: 32
+
+multipanel_plots:
+  bias_overview:
+    rows: 2
+    cols: 2
+    figsize: [12, 8]
+    title: "BIAS vs lead time"
+    panels:
+      - {metric: BIAS, param: T_2M,     season: all, title: "T_2M — all"}
+      - {metric: BIAS, param: T_2M,     season: JJA, title: "T_2M — JJA"}
+      - {metric: BIAS, param: PMSL, season: all, title: "PMSL — all"}
+      - {metric: BIAS, param: PMSL, season: JJA, title: "PMSL — JJA"}
+  rmse_overview:
+    rows: 2
+    cols: 2
+    figsize: [12, 8]
+    title: "RMSE vs lead time"
+    panels:
+      - {metric: RMSE, param: T_2M,     init_hour: 0,  title: "T_2M — 00 UTC"}
+      - {metric: RMSE, param: T_2M,     init_hour: 12, title: "T_2M — 12 UTC"}
+      - {metric: RMSE, param: PMSL, init_hour: 0,  title: "PMSL — 00 UTC"}
+      - {metric: RMSE, param: PMSL, init_hour: 12, title: "PMSL — 12 UTC"}
diff --git a/resources/report/dashboard/script.js b/resources/report/dashboard/script.js
@@ -62,6 +62,12 @@ document.getElementById("param-select").addEventListener("change", updateChart);
 data = JSON.parse(document.getElementById("verif-data").textContent)
 header = document.getElementById("header-text").textContent.trim()
 
+// Pin the source -> color mapping to the full, alphabetically-sorted source
+// list so each source keeps its color even when sources are toggled in the
+// UI. The mapping is one-to-one only while there are at most 10 sources;
+// beyond that the tableau10 scheme wraps around and colors repeat. Must
+// match src/plotting/source_colors.py to keep the dashboard and the static
+// matplotlib figures consistent.
+// NOTE(review): sort() without a comparator orders by UTF-16 code units;
+// confirm this agrees with Python's sorted() for any non-ASCII source names.
+const allSources = [...new Set(data.map(d => d.source))].sort();
+
 // Define base spec
 var spec = {
   "data": { "values": data },
@@ -106,6 +112,7 @@ var spec = {
       "color": {
         "field": "source",
         "type": "nominal",
+        "scale": { "scheme": "tableau10", "domain": allSources },
         "legend": { "orient": "top", "title": "Data Source", "offset": 0, "padding": 10 }
       },
       "shape": {

diff --git a/src/evalml/config.py b/src/evalml/config.py
@@ -1,7 +1,7 @@
 from pathlib import Path
 from typing import Dict, List, Any, ClassVar, FrozenSet
 
-from pydantic import BaseModel, Field, RootModel, field_validator
+from pydantic import BaseModel, Field, RootModel, field_validator, model_validator
 
 PROJECT_ROOT = Path(__file__).parents[2]
 
@@ -227,6 +227,66 @@ class Stratification(BaseModel):
     )
 
 
+class MultipanelPanelSpec(BaseModel):
+    """One panel inside a multi-panel metric-vs-lead-time figure."""
+
+    metric: str = Field(..., description="Metric name (e.g. 'rmse').")
+    param: str = Field(..., description="Parameter name (e.g. 'T_2M').")
+    region: str = Field(
+        "all",
+        description="Region to subset to. 'all' uses the unstratified aggregate.",
+    )
+    season: str = Field(
+        "all",
+        description="Season to subset to. 'all' uses the unstratified aggregate.",
+    )
+    init_hour: int = Field(
+        -999,
+        description="Init hour to subset to. -999 (sentinel) uses the unstratified aggregate.",
+    )
+    title: str | None = Field(
+        None,
+        description="Panel title. Defaults to '<metric> - <param>'.",
+    )
+    ylim: List[float] | None = Field(
+        None,
+        description="Optional [ymin, ymax] for this panel's y-axis.",
+        min_length=2,
+        max_length=2,
+    )
+
+    model_config = {"extra": "forbid"}
+
+
+class MultipanelPlotSpec(BaseModel):
+    """Layout for a single multi-panel metric-vs-lead-time figure."""
+
+    rows: int = Field(..., ge=1, description="Number of subplot rows.")
+    cols: int = Field(..., ge=1, description="Number of subplot columns.")
+    figsize: List[float] | None = Field(
+        None,
+        description="Optional [width, height] in inches. Defaults to (4.5*cols, 3.5*rows).",
+        min_length=2,
+        max_length=2,
+    )
+    title: str | None = Field(None, description="Optional figure-level title.")
+    panels: List[MultipanelPanelSpec] = Field(
+        ...,
+        description="Per-panel specs in row-major order. Length must equal rows*cols.",
+    )
+
+    model_config = {"extra": "forbid"}
+
+    @model_validator(mode="after")
+    def _check_panel_count(self) -> "MultipanelPlotSpec":
+        expected = self.rows * self.cols
+        if len(self.panels) != expected:
+            raise ValueError(
+                f"panels has length {len(self.panels)}, expected rows*cols = {expected}"
+            )
+        return self
+
+
 class Dashboard(BaseModel):
     """Settings for the dashboard"""
 
@@ -351,6 +411,14 @@ def validate_threshold_operators(
     dashboard: Dashboard
     locations: Locations
     profile: Profile
+    multipanel_plots: Dict[str, MultipanelPlotSpec] = Field(
+        default_factory=dict,
+        description=(
+            "Optional named multi-panel metric-vs-lead-time figures. "
+            "Each entry produces one PNG under results/<experiment>/multipanel/<name>.png "
+            "when the verification_metrics_multipanel_plot_all target is built."
+        ),
+    )
 
     model_config = {
         "extra": "forbid",  # fail on misspelled keys

diff --git a/src/plotting/metric_lead_time_panel.py b/src/plotting/metric_lead_time_panel.py
@@ -0,0 +1,64 @@
+"""Per-axes plotting helper for verification metrics vs. lead time."""
+
+import pandas as pd
+from matplotlib.axes import Axes
+
+from verification import decode_metric
+
+from .units import metric_units
+
+
+def _default_ylabel(metric: str, param: str | None) -> str:
+    label = decode_metric(metric)
+    units = metric_units(metric, param) if param is not None else ""
+    return f"{label} [{units}]" if units else label
+
+
+def plot_panel(
+    ax: Axes,
+    sub_df: pd.DataFrame,
+    *,
+    metric: str,
+    param: str | None = None,
+    title: str | None = None,
+    panel_label: str | None = None,
+    xlabel: str | None = "Lead Time [h]",
+    ylabel: str | None = None,
+    show_legend: bool = True,
+    color_map: dict[str, str] | None = None,
+) -> None:
+    """Plot one metric-vs-lead-time panel onto `ax`.
+
+    `sub_df` must already be filtered to a single (metric, param, region, season,
+    init_hour) combo and contain at least the columns: source, lead_time, value.
+    One line per source is drawn.
+
+    If `ylabel` is None and `param` is provided, the y-axis label is built as
+    "<decoded metric> [<units>]" via plotting.units.metric_units.
+
+    `panel_label` (e.g. "a)") is rendered left-aligned at the same height as
+    the centred title.
+
+    If `color_map` is given, each source's line is drawn in
+    ``color_map[source]``; sources missing from the map fall back to
+    matplotlib's default color cycle. Use ``plotting.source_colors.source_color_map``
+    to build a map that matches the dashboard.
+    """
+    if ylabel is None:
+        ylabel = _default_ylabel(metric, param)
+    for source, df in sub_df.groupby("source"):
+        df.plot(
+            x="lead_time",
+            y="value",
+            kind="line",
+            marker="o",
+            title=title,
+            xlabel=xlabel or "",
+            ylabel=ylabel or "",
+            label=source,
+            color=(color_map or {}).get(source),
+            ax=ax,
+            legend=show_legend,
+        )
+    if panel_label:
+        ax.set_title(panel_label, loc="left", fontweight="bold")
diff --git a/src/plotting/source_colors.py b/src/plotting/source_colors.py
@@ -0,0 +1,38 @@
+"""Stable source -> color mapping shared with the dashboard.
+
+The dashboard uses Vega-Lite's ``tableau10`` categorical scheme and pins its
+``color.scale.domain`` to the alphabetically-sorted full source list so the
+mapping stays bijective regardless of dashboard filters. The matplotlib plots
+use the same palette and ordering so a given source has the same color in
+every figure produced from a verification run.
+
+Both the dashboard and the matplotlib side wrap around when there are more
+than ``len(TABLEAU10)`` sources, at which point two sources will share a
+color. Switch palettes (e.g. to ``tableau20`` or a deterministic HSV ramp)
+if that becomes a problem.
+"""
+
+# Vega-Lite "tableau10" scheme:
+# https://vega.github.io/vega/docs/schemes/#tableau10
+# Order is significant: source_color_map assigns entry i to the i-th source
+# in alphabetical order, so reordering this list changes every source's color.
+TABLEAU10: list[str] = [
+    "#4c78a8",
+    "#f58518",
+    "#e45756",
+    "#72b7b2",
+    "#54a24b",
+    "#eeca3b",
+    "#b279a2",
+    "#ff9da6",
+    "#9d755d",
+    "#bab0ac",
+]
+
+
+def source_color_map(sources) -> dict[str, str]:
+    """Return ``{source: color}`` over unique sources, ordered alphabetically.
+
+    Wraps around for more than ``len(TABLEAU10)`` sources, matching Vega-Lite's
+    behaviour for a categorical scale whose domain exceeds the scheme.
+    """
+    ordered = sorted(set(sources))
+    return {s: TABLEAU10[i % len(TABLEAU10)] for i, s in enumerate(ordered)}
diff --git a/src/plotting/units.py b/src/plotting/units.py
@@ -0,0 +1,26 @@
+"""Canonical units for verification parameters and metrics.
+
+These are the storage units used in the verification netCDFs; metrics such
+as BIAS, RMSE, MAE and STDE inherit the units of their parameter. Update
+PARAM_UNITS if a parameter's internal representation changes.
+"""
+
+# Parameter name -> unit string. Looked up by metric_units, which returns ''
+# for any parameter not listed here.
+PARAM_UNITS: dict[str, str] = {
+    "T_2M": "K",
+    "TD_2M": "K",
+    "PMSL": "Pa",
+    "PS": "Pa",
+    "TOT_PREC": "mm",
+    "U_10M": "m/s",
+    "V_10M": "m/s",
+    "SP_10M": "m/s",
+}
+
+# Metrics that metric_units reports as unitless regardless of parameter.
+# Entries must be upper-case: metric_units compares against metric.upper().
+UNITLESS_METRICS: set[str] = {"CORR", "R2"}
+
+
+def metric_units(metric: str, param: str) -> str:
+    """Return the canonical units of (metric, param), or '' if unitless/unknown."""
+    if metric.upper() in UNITLESS_METRICS:
+        return ""
+    return PARAM_UNITS.get(param, "")