
Commit 855ab81

Add market correlation analysis for Synthetic Control (G8) (#760)
* Add market correlation analysis for Synthetic Control (G8, #759)

  Add a plot_correlations() utility for pre-experiment diagnostics, and report
  pre-treatment correlations in SyntheticControl.summary(). Document the
  utility in the geolift notebooks and replace the manual heatmap code in the
  Brexit notebook.

  Made-with: Cursor

* Add donor pool curation with factor model DGP and literature guidance

  - Rewrite generate_geolift_data() with a latent factor model (K=3 GP
    factors, unit-specific loadings) producing realistic positive and
    negative correlations across 10 control countries
  - Add donor pool selection sections to all 5 SC notebooks, with
    correlation-based filtering and citations to Abadie (2010, 2021) and
    Abadie & L'Hour (2021)
  - Add a "Donor pool selection" glossary entry with {term} links
  - Add 2 references to references.bib
  - Regenerate geolift1.csv from the new factor model DGP

  Made-with: Cursor

* Fix notebook output metadata for docs build compatibility

  Add the missing 'name' field to stream outputs in sc_pymc.ipynb and
  sc_skl.ipynb, and the missing 'metadata' field on display_data outputs.
  These fields are required by nbformat validation and myst-nb rendering.

  Made-with: Cursor

* Fix missing execution_count in notebook execute_result outputs

  Add the required execution_count field to execute_result outputs in
  sc_pymc.ipynb and sc_skl.ipynb to pass the strict nbformat validation
  used in CI.

  Made-with: Cursor
Parent commit: 6d6e076

13 files changed: 9,446 additions & 8,544 deletions


causalpy/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -30,13 +30,14 @@
 from .experiments.regression_kink import RegressionKink
 from .experiments.staggered_did import StaggeredDifferenceInDifferences
 from .experiments.synthetic_control import SyntheticControl
-from .utils import extract_lift_for_mmm
+from .utils import extract_lift_for_mmm, plot_correlations

 __all__ = [
     "__version__",
     "create_causalpy_compatible_class",
     "DifferenceInDifferences",
     "extract_lift_for_mmm",
+    "plot_correlations",
     "InstrumentalVariable",
     "InterruptedTimeSeries",
     "InversePropensityWeighting",

causalpy/data/geolift1.csv

Lines changed: 209 additions & 209 deletions
Large diffs are not rendered by default.

causalpy/data/simulate_data.py

Lines changed: 45 additions & 32 deletions
@@ -333,53 +333,66 @@ def generate_ancova_data(
     return df


-def generate_geolift_data() -> pd.DataFrame:
-    """Generate synthetic data for a geolift example. This will consists of 6 untreated
-    countries. The treated unit `Denmark` is a weighted combination of the untreated
-    units. We additionally specify a treatment effect which takes effect after the
-    `treatment_time`. The timeseries data is observed at weekly resolution and has
-    annual seasonality, with this seasonality being a drawn from a Gaussian Process with
-    a periodic kernel."""
+def generate_geolift_data(seed: int | None = None) -> pd.DataFrame:
+    """Generate synthetic geolift data using a latent factor model.
+
+    Each unit's time series is a linear combination of K=3 shared seasonal
+    factors (GP draws) with unit-specific loadings, plus observation noise.
+    Most countries share positive loadings and are therefore positively
+    correlated, while 2 "contrarian" countries carry a negative loading on
+    one factor, making them negatively correlated with the majority. The
+    treated unit (Denmark) is a Dirichlet-weighted combination of the
+    positively-loaded countries only, so it is well-reconstructed by good
+    donors but poorly correlated with the contrarian ones.
+
+    This mirrors the latent factor DGP used to motivate synthetic control
+    methods in Abadie (2010, 2021).
+    """
+    rng = np.random.default_rng(seed)
     n_years = 4
     treatment_time = pd.to_datetime("2022-01-01")
     causal_impact = 0.2
-
     time = pd.date_range(start="2019-01-01", periods=52 * n_years, freq="W")
+    n_obs = len(time)

-    untreated = [
+    K = 3
+    factors = np.column_stack(
+        [create_series(n_years=n_years, intercept=0) for _ in range(K)]
+    )  # (n_obs, K)
+
+    similar = [
         "Austria",
         "Belgium",
         "Bulgaria",
         "Croatia",
         "Cyprus",
         "Czech_Republic",
+        "Estonia",
+        "Finland",
     ]
-
-    df = (
-        pd.DataFrame(
-            {
-                country: create_series(n_years=n_years, intercept=3)
-                for country in untreated
-            }
-        )
-        .assign(time=time)
-        .set_index("time")
-    )
-
-    # create treated unit as a weighted sum of the untreated units
-    weights = np.random.dirichlet(np.ones(len(untreated)), size=1)[0]
-    df = df.assign(Denmark=np.dot(df[untreated].values, weights))
-
-    # add observation noise
-    for col in untreated + ["Denmark"]:
-        df[col] += np.random.normal(size=len(df), scale=0.1)
-
-    # add treatment effect
+    contrarian = ["Greece", "Hungary"]
+    untreated = similar + contrarian
+
+    # Positive loadings for similar countries, one negative loading for contrarians
+    loadings: dict[str, np.ndarray] = {}
+    for country in similar:
+        loadings[country] = rng.uniform(0.3, 1.0, size=K)
+    loadings["Greece"] = np.array([-0.6, -0.3, 0.8])
+    loadings["Hungary"] = np.array([0.3, -0.7, -0.5])
+
+    df = pd.DataFrame(index=time)
+    df.index.name = "time"
+    for country in untreated:
+        df[country] = factors @ loadings[country] + 3 + rng.normal(0, 0.1, size=n_obs)
+
+    # Denmark as a weighted sum of similar countries only
+    w = rng.dirichlet(np.ones(len(similar)))
+    df["Denmark"] = df[similar].values @ w + rng.normal(0, 0.1, size=n_obs)
+
+    # treatment effect
     df["Denmark"] += np.where(df.index < treatment_time, 0, causal_impact)

-    # ensure we never see any negative sales
     df = df.clip(lower=0)
-
     return df
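
A quick way to sanity-check the correlation structure the new factor-model
DGP produces (a sketch; the seed and the printed slice are illustrative):

    from causalpy.data.simulate_data import generate_geolift_data

    df = generate_geolift_data(seed=42)
    corr = df.corr()

    # Greece and Hungary carry negative factor loadings, so they should
    # correlate weakly or negatively with Denmark and the similar donors
    print(corr.loc[["Greece", "Hungary"], "Denmark"])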

causalpy/experiments/synthetic_control.py

Lines changed: 27 additions & 0 deletions
@@ -267,6 +267,30 @@ def input_validation(
                 "If data.index is not DatetimeIndex, treatment_time must be pd.Timestamp."  # noqa: E501
             )

+    def _pre_treatment_correlations(self) -> dict[str, float]:
+        """Compute Pearson correlation between each treated unit and its
+        synthetic control prediction in the pre-treatment period.
+
+        Returns
+        -------
+        dict[str, float]
+            Mapping from treated unit name to correlation coefficient.
+        """
+        correlations: dict[str, float] = {}
+        for unit in self.treated_units:
+            observed = self.datapre_treated.sel(treated_units=unit).values.flatten()
+            if isinstance(self.model, PyMCModel):
+                predicted = (
+                    self.pre_pred["posterior_predictive"]["mu"]
+                    .sel(treated_units=unit)
+                    .mean(dim=["chain", "draw"])
+                    .values.flatten()
+                )
+            else:
+                predicted = np.asarray(self.pre_pred).flatten()
+            correlations[unit] = float(np.corrcoef(observed, predicted)[0, 1])
+        return correlations
+
     def summary(self, round_to: int | None = None) -> None:
         """Print summary of main results and model coefficients.

@@ -280,6 +304,9 @@ def summary(self, round_to: int | None = None) -> None:
         else:
             print(f"Treated unit: {self.treated_units[0]}")
         self.print_coefficients(round_to)
+        corrs = self._pre_treatment_correlations()
+        for unit, r in corrs.items():
+            print(f"Pre-treatment correlation ({unit}): {r:.4f}")

     @staticmethod
     def _convert_treatment_time_for_axis(
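
The statistic printed per treated unit is just the Pearson correlation
between the observed pre-period outcome and the model's mean prediction.
A standalone sketch of that computation on toy data (not the CausalPy API):

    import numpy as np

    rng = np.random.default_rng(0)
    t = np.linspace(0, 4 * np.pi, 100)
    observed = np.sin(t)
    predicted = observed + rng.normal(0, 0.05, size=100)  # a good reconstruction

    # same statistic that summary() reports for each treated unit
    r = float(np.corrcoef(observed, predicted)[0, 1])
    print(f"Pre-treatment correlation (toy): {r:.4f}")  # close to 1.0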

causalpy/tests/test_utils.py

Lines changed: 87 additions & 0 deletions
@@ -15,6 +15,8 @@
 Tests for utility functions
 """

+import matplotlib
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import pytest
@@ -26,6 +28,7 @@
     check_convex_hull_violation,
     extract_lift_for_mmm,
     get_interaction_terms,
+    plot_correlations,
     round_num,
 )

@@ -369,3 +372,87 @@ def test_extract_lift_for_mmm_raises_for_ols():
         x=0.0,
         delta_x=1000,
     )
+
+
+# ============================================================================
+# Tests for plot_correlations
+# ============================================================================
+
+
+@pytest.fixture
+def panel_data():
+    """Simple wide-format panel data for correlation tests."""
+    rng = np.random.default_rng(0)
+    n = 50
+    base = np.sin(np.linspace(0, 4 * np.pi, n))
+    return pd.DataFrame(
+        {
+            "A": base + rng.normal(0, 0.1, n),
+            "B": base + rng.normal(0, 0.1, n),
+            "C": -base + rng.normal(0, 0.1, n),
+        }
+    )
+
+
+def test_plot_correlations_returns_matrix_and_axes(panel_data):
+    corr, ax = plot_correlations(panel_data)
+    assert isinstance(corr, pd.DataFrame)
+    assert corr.shape == (3, 3)
+    assert isinstance(ax, matplotlib.axes.Axes)
+    plt.close("all")
+
+
+def test_plot_correlations_diagonal_is_one(panel_data):
+    corr, _ = plot_correlations(panel_data)
+    np.testing.assert_allclose(np.diag(corr.values), 1.0)
+    plt.close("all")
+
+
+def test_plot_correlations_symmetric(panel_data):
+    corr, _ = plot_correlations(panel_data)
+    np.testing.assert_allclose(corr.values, corr.values.T)
+    plt.close("all")
+
+
+def test_plot_correlations_column_subset(panel_data):
+    corr, _ = plot_correlations(panel_data, columns=["A", "B"])
+    assert corr.shape == (2, 2)
+    assert list(corr.columns) == ["A", "B"]
+    plt.close("all")
+
+
+def test_plot_correlations_custom_ax(panel_data):
+    fig, provided_ax = plt.subplots()
+    _, returned_ax = plot_correlations(panel_data, ax=provided_ax)
+    assert returned_ax is provided_ax
+    plt.close("all")
+
+
+def test_plot_correlations_kwargs_forwarded(panel_data):
+    corr, _ = plot_correlations(panel_data, annot=False, vmin=0)
+    assert isinstance(corr, pd.DataFrame)
+    plt.close("all")
+
+
+# ============================================================================
+# Tests for SyntheticControl._pre_treatment_correlations
+# ============================================================================
+
+
+def test_pre_treatment_correlations_single_unit(sc_result_single_unit):
+    corrs = sc_result_single_unit._pre_treatment_correlations()
+    assert "actual" in corrs
+    assert 0 < corrs["actual"] <= 1.0
+
+
+def test_pre_treatment_correlations_multi_unit(sc_result_multi_unit):
+    corrs = sc_result_multi_unit._pre_treatment_correlations()
+    assert set(corrs.keys()) == {"t1", "t2"}
+    for r in corrs.values():
+        assert -1 <= r <= 1
+
+
+def test_summary_prints_correlation(sc_result_single_unit, capsys):
+    sc_result_single_unit.summary()
+    captured = capsys.readouterr()
+    assert "Pre-treatment correlation" in captured.out

causalpy/utils.py

Lines changed: 81 additions & 1 deletion
@@ -18,10 +18,12 @@
 from __future__ import annotations

 import re
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any, Literal

+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import seaborn as sns
 import xarray as xr

 if TYPE_CHECKING:
@@ -220,6 +222,84 @@ def check_convex_hull_violation(
     }


+def plot_correlations(
+    data: pd.DataFrame,
+    columns: list[str] | None = None,
+    method: Literal["pearson", "kendall", "spearman"] = "pearson",
+    figsize: tuple[float, float] | None = None,
+    ax: plt.Axes | None = None,
+    **kwargs: Any,
+) -> tuple[pd.DataFrame, plt.Axes]:
+    """Plot a pairwise correlation heatmap for panel data columns.
+
+    Computes the pairwise correlation matrix between the specified columns
+    (typically geographic units or time series) and displays it as a
+    lower-triangle heatmap. This is a pre-experiment diagnostic for
+    synthetic control analyses: markets that are highly correlated in the
+    pre-treatment period are more likely to produce reliable counterfactuals.
+
+    Parameters
+    ----------
+    data : pd.DataFrame
+        Wide-format panel data with time as the index and locations/units
+        as columns.
+    columns : list[str], optional
+        Subset of columns to include. If ``None``, all numeric columns
+        are used.
+    method : {"pearson", "kendall", "spearman"}, default "pearson"
+        Correlation method passed to :meth:`pandas.DataFrame.corr`.
+    figsize : tuple[float, float], optional
+        Width and height in inches for the figure. Only used when ``ax``
+        is not provided. If ``None``, matplotlib's default is used.
+    ax : matplotlib.axes.Axes, optional
+        Axes on which to draw the heatmap. If ``None``, a new figure and
+        axes are created (sized according to ``figsize``).
+    **kwargs
+        Additional keyword arguments forwarded to :func:`seaborn.heatmap`
+        (e.g., ``vmin``, ``vmax``, ``annot``, ``annot_kws``).
+
+    Returns
+    -------
+    tuple[pd.DataFrame, matplotlib.axes.Axes]
+        The correlation matrix and the axes containing the heatmap.
+
+    Examples
+    --------
+    .. code-block:: python
+
+        import causalpy as cp
+
+        df = cp.load_data("geolift1")
+        corr, ax = cp.plot_correlations(df)
+
+        # Larger figure with smaller annotation text
+        corr, ax = cp.plot_correlations(df, figsize=(10, 8), annot_kws={"size": 7})
+    """
+    subset = data[columns] if columns is not None else data.select_dtypes("number")
+    corr = subset.corr(method=method)
+    mask = np.triu(np.ones_like(corr, dtype=bool))
+
+    if ax is None:
+        _, ax = plt.subplots(figsize=figsize)
+
+    defaults: dict[str, Any] = {
+        "mask": mask,
+        "cmap": sns.diverging_palette(230, 20, as_cmap=True),
+        "vmin": -1,
+        "vmax": 1,
+        "center": 0,
+        "square": True,
+        "linewidths": 0.5,
+        "cbar_kws": {"shrink": 0.8},
+        "annot": True,
+        "fmt": ".2f",
+    }
+    defaults.update(kwargs)
+
+    sns.heatmap(corr, ax=ax, **defaults)
+    return corr, ax
+
+
 def extract_lift_for_mmm(
     sc_result: SyntheticControl,
     channel: str,
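
Because plot_correlations() returns the correlation matrix alongside the
axes, it can drive donor pool filtering directly. A sketch (the 0.5
threshold is illustrative, not a recommendation made by this commit):

    import causalpy as cp

    df = cp.load_data("geolift1")
    corr, ax = cp.plot_correlations(df)

    # keep only donors positively correlated with the treated unit
    treated = "Denmark"
    donor_corrs = corr[treated].drop(treated)
    donor_pool = donor_corrs[donor_corrs > 0.5].index.tolist()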

docs/source/knowledgebase/glossary.rst

Lines changed: 4 additions & 0 deletions
@@ -45,6 +45,10 @@ Glossary
    DiD
       Analysis where the treatment effect is estimated as a difference between treatment conditions in the differences between pre-treatment to post treatment observations.

+   Donor pool
+   Donor pool selection
+      In synthetic control methods, the donor pool is the set of untreated units available to construct the synthetic control. Donor pool selection (or curation) is the process of choosing which untreated units to include. Units that are structurally dissimilar to the treated unit -- for example, those with negative pre-treatment correlations -- should be excluded because they can introduce interpolation bias and degrade the synthetic control fit. This is especially important in Bayesian implementations where priors (e.g. Dirichlet) assign non-zero weight to every donor by construction :footcite:p:`abadie2021using,abadie2010synthetic`.
+
    Donut regression discontinuity
    Donut RDD
       A robustness approach for regression discontinuity designs where observations within a specified distance from the treatment threshold are excluded from model fitting. This technique is used when observations closest to the cutoff may be problematic due to manipulation, sorting, or heaping/rounding of the running variable. By excluding the "donut hole" around the threshold, the analysis relies on observations that are less likely to be affected by such issues. See :footcite:t:`noack2024donut` for formal discussion of donut RDD properties.
