Source code for causaldag.utils.invariance_tests.gauss_invariance

import numpy as np
from typing import Union, List, Optional
from sklearn.linear_model import LinearRegression
from causaldag.utils.core_utils import to_list
from scipy.special import stdtr, ncfdtr
from numpy.linalg import pinv

lr = LinearRegression()


[docs]def gauss_invariance_suffstat(
        obs_samples,
        context_samples_list
):
    """
    Helper function to compute the sufficient statistics for the gauss_invariance_test from data.

    Parameters
    ----------
    obs_samples:
        (n x p) matrix, where n is the number of samples and p is the number of variables.
    context_samples_list:
        list of (n x p) matrices, one for each context besides observational

    Return
    ------
    dict
        dictionary of sufficient statistics
    """
    obs_samples = np.hstack((obs_samples, np.ones([obs_samples.shape[0], 1])))
    obs_cov = np.cov(obs_samples, rowvar=False)
    obs_suffstat = dict(samples=obs_samples, G=obs_samples.T @ obs_samples, S=obs_cov)
    context_suffstats = []
    for context_samples in context_samples_list:
        context_samples = np.hstack((context_samples, np.ones([context_samples.shape[0], 1])))
        context_cov = np.cov(context_samples, rowvar=False)
        context_suffstats.append(dict(samples=context_samples, G=context_samples.T @ context_samples, S=context_cov))

    return dict(obs=obs_suffstat, contexts=context_suffstats)


[docs]def gauss_invariance_test(
        suffstat,
        context,
        i: int,
        cond_set: Optional[Union[List[int], int]] = None,
        alpha: float = 0.05,
        zero_mean=False,
        same_coeffs=False
):
    """
    Test the null hypothesis that two Gaussian distributions are equal.

    Parameters
    ----------
    suffstat:
        dictionary containing:

        * ``obs`` -- number of samples
        * ``G`` -- Gram matrix
        * ``contexts``
    context:
        which context to test.
    i:
        position of marginal distribution.
    cond_set:
        positions of conditioning set in correlation matrix.
    alpha:
        Significance level.
    zero_mean:
        If True, assume that the regression residual has zero mean.
    same_coeffs:
        If True, assume that the regression coefficients have not changed.

    Return
    ------
    dict
        dictionary containing ttest_stat, ftest_stat, f_pvalue, t_pvalue, and reject.
    """
    cond_set = to_list(cond_set)
    obs_samples = suffstat['obs']['samples']
    iv_samples = suffstat['contexts'][context]['samples']
    n1, p = obs_samples.shape
    n2 = iv_samples.shape[0]

    # === FIND REGRESSION COEFFICIENTS AND RESIDUALS
    if len(cond_set) != 0:
        cond_ix = cond_set if zero_mean else [*cond_set, -1]
        gram1 = suffstat['obs']['G'][np.ix_(cond_ix, cond_ix)]
        gram2 = suffstat['contexts'][context]['G'][np.ix_(cond_ix, cond_ix)]
        coefs1 = pinv(gram1) @ obs_samples[:, cond_ix].T @ obs_samples[:, i]
        coefs2 = pinv(gram2) @ iv_samples[:, cond_ix].T @ iv_samples[:, i]

        residuals1 = obs_samples[:, i] - obs_samples[:, cond_ix] @ coefs1
        residuals2 = iv_samples[:, i] - iv_samples[:, cond_ix] @ coefs2
    elif not zero_mean:
        gram1 = n1 * np.ones([1, 1])
        gram2 = n2 * np.ones([1, 1])
        cond_ix = [-1]
        coefs1 = np.array([np.mean(obs_samples[:, i])]) if not zero_mean else 0
        coefs2 = np.array([np.mean(iv_samples[:, i])]) if not zero_mean else 0
        residuals1 = obs_samples[:, i] - coefs1
        residuals2 = iv_samples[:, i] - coefs2
    else:
        residuals1 = obs_samples[:, i]
        residuals2 = iv_samples[:, i]

    # means and variances of residuals
    var1, var2 = np.var(residuals1, ddof=len(cond_ix)), np.var(residuals2, ddof=len(cond_ix))

    # calculate regression coefficient invariance statistic
    if len(cond_ix) != 0 and not same_coeffs:
        p = len(cond_ix)
        rc_stat = (coefs1 - coefs2) @ pinv(var1 * pinv(gram1) + var2 * pinv(gram2)) @ (coefs1 - coefs2).T / p
        rc_pvalue = ncfdtr(p, n1 + n2 - p, 0, rc_stat)
        rc_pvalue = 2 * min(rc_pvalue, 1 - rc_pvalue)

    # calculate statistic for F-Test
    ftest_stat = var1 / var2
    f_pvalue = ncfdtr(n1 - 1, n2 - 1, 0, ftest_stat)
    f_pvalue = 2 * min(f_pvalue, 1 - f_pvalue)

    # === ACCEPT/REJECT INVARIANCE HYPOTHESIS BASED ON P-VALUES WITH BONFERRONI CORRECTION
    if len(cond_ix) != 0 and not same_coeffs:
        reject = f_pvalue < alpha / 2 or rc_pvalue < alpha / 2
    else:
        reject = f_pvalue < alpha

    # === FORM RESULT DICT AND RETURN
    result_dict = dict(
        ftest_stat=ftest_stat,
        f_pvalue=f_pvalue,
        reject=reject
    )
    if len(cond_ix) != 0 and not same_coeffs:
        result_dict['rc_stat'] = rc_stat
        result_dict['rc_pvalue'] = rc_pvalue

    # print(result_dict)
    return result_dict