# Source code for statsmodels.stats.multivariate

# -*- coding: utf-8 -*-
"""
Created on Sun Nov  5 14:48:19 2017

Author: Josef Perktold
"""

import numpy as np
from scipy import stats

from statsmodels.stats.moment_helpers import cov2corr
from statsmodels.stats.base import HolderTuple
from statsmodels.tools.validation import array_like

# shortcut function
def _logdet(x):
return np.linalg.slogdet(x)

def test_mvmean(data, mean_null=0, return_results=True):
    """Hotellings test for multivariate mean in one sample

    Parameters
    ----------
    data : array_like
        data with observations in rows and variables in columns
    mean_null : array_like
        mean of the multivariate data under the null hypothesis
    return_results : bool
        If true, then a results instance is returned. If False, then only
        the test statistic and pvalue are returned.

    Returns
    -------
    results : instance of a results class with attributes
        statistic, pvalue, t2 and df
    (statistic, pvalue) : tuple
        If return_results is false, then only the test statistic and the
        pvalue are returned.

    """
    x = np.asarray(data)
    nobs, k_vars = x.shape
    mean = x.mean(0)
    # np.cov returns a 0-dim array for a single column; keep it 2-dim so
    # that the linear solve below also works for k_vars == 1
    cov = np.atleast_2d(np.cov(x, rowvar=False, ddof=1))
    diff = mean - mean_null
    # Hotelling's T^2 = nobs * diff' cov^{-1} diff
    t2 = nobs * diff.dot(np.linalg.solve(cov, diff))
    # rescaling of T^2 so that the statistic is compared to an F distribution
    factor = (nobs - 1) * k_vars / (nobs - k_vars)
    statistic = t2 / factor
    df = (k_vars, nobs - k_vars)
    # numerator and denominator degrees of freedom are passed separately
    pvalue = stats.f.sf(statistic, df[0], df[1])
    if return_results:
        res = HolderTuple(statistic=statistic,
                          pvalue=pvalue,
                          df=df,
                          t2=t2,
                          distr="F")
        return res
    else:
        return statistic, pvalue


# The name matches pytest's ``test_*`` pattern; opt out of collection so that
# importing this function into a test module does not register it as a test.
test_mvmean.__test__ = False

def test_mvmean_2indep(data1, data2):
    """Hotellings test for multivariate mean in two independent samples

    The null hypothesis is that both samples have the same mean.
    The alternative hypothesis is that means differ.

    Parameters
    ----------
    data1 : array_like
        first sample data with observations in rows and variables in columns
    data2 : array_like
        second sample data with observations in rows and variables in columns

    Returns
    -------
    results : instance of a results class with attributes
        statistic, pvalue, t2 and df

    Raises
    ------
    ValueError
        If the two samples do not have the same number of columns.
    """
    x1 = array_like(data1, "x1", ndim=2)
    x2 = array_like(data2, "x2", ndim=2)
    nobs1, k_vars = x1.shape
    nobs2, k_vars2 = x2.shape
    if k_vars2 != k_vars:
        msg = "both samples need to have the same number of columns"
        raise ValueError(msg)
    mean1 = x1.mean(0)
    mean2 = x2.mean(0)
    cov1 = np.cov(x1, rowvar=False, ddof=1)
    cov2 = np.cov(x2, rowvar=False, ddof=1)
    nobs_t = nobs1 + nobs2
    # pooled covariance, weighting each sample by its degrees of freedom;
    # kept 2-dim so that the linear solve also works for a single column
    combined_cov = np.atleast_2d(
        ((nobs1 - 1) * cov1 + (nobs2 - 1) * cov2) / (nobs_t - 2))
    diff = mean1 - mean2
    t2 = (nobs1 * nobs2) / nobs_t * diff @ np.linalg.solve(combined_cov, diff)
    # rescaling of T^2 so that the statistic is compared to an F distribution
    factor = ((nobs_t - 2) * k_vars) / (nobs_t - k_vars - 1)
    statistic = t2 / factor
    df = (k_vars, nobs_t - 1 - k_vars)
    # numerator and denominator degrees of freedom are passed separately
    pvalue = stats.f.sf(statistic, df[0], df[1])
    return HolderTuple(statistic=statistic,
                       pvalue=pvalue,
                       df=df,
                       t2=t2,
                       distr="F")


# The name matches pytest's ``test_*`` pattern; opt out of collection so that
# importing this function into a test module does not register it as a test.
test_mvmean_2indep.__test__ = False

def confint_mvmean(data, lin_transf=None, alpha=0.5, simult=False):
    """Confidence interval for linear transformation of a multivariate mean

    Computes the sample mean and covariance of ``data`` and delegates to
    ``confint_mvmean_fromstats``. Either pointwise or simultaneous
    confidence intervals are returned.

    Parameters
    ----------
    data : array_like
        data with observations in rows and variables in columns
    lin_transf : array_like or None
        The linear transformation or contrast matrix for transforming the
        vector of means. If this is None, then the identity matrix is used
        which specifies the means themselves.
    alpha : float in (0, 1)
        Passed unchanged to ``confint_mvmean_fromstats``. A commonly used
        value is alpha=0.05. Note that the default of this function is 0.5,
        so pass ``alpha`` explicitly if another level is wanted.
    simult : bool
        If simult is False (default), then the pointwise confidence
        interval is returned.
        Otherwise, a simultaneous confidence interval is returned.

    Returns
    -------
    low : ndarray
        lower confidence bound on the linear transformed
    upp : ndarray
        upper confidence bound on the linear transformed
    values : ndarray
        mean or their linear transformation, center of the confidence region

    Notes
    -----
    Pointwise confidence interval is based on Johnson and Wichern
    equation (5-21) page 224.

    Simultaneous confidence interval is based on Johnson and Wichern
    Result 5.3 page 225.
    This looks like Scheffe simultaneous confidence intervals.

    Bonferroni corrected simultaneous confidence interval might be added in
    future

    References
    ----------
    Johnson, Richard A., and Dean W. Wichern. 2007. Applied Multivariate
    Statistical Analysis. 6th ed. Upper Saddle River, N.J: Pearson Prentice
    Hall.
    """
    sample = np.asarray(data)
    n_rows, n_cols = sample.shape
    # default contrast: the identity, i.e. intervals for the means themselves
    contrast = np.eye(n_cols) if lin_transf is None else lin_transf
    center = sample.mean(0)
    # covariance is computed with ddof=0 before it is handed on
    scatter = np.cov(sample, rowvar=False, ddof=0)

    return confint_mvmean_fromstats(center, scatter, n_rows,
                                    lin_transf=contrast,
                                    alpha=alpha, simult=simult)

def confint_mvmean_fromstats(mean, cov, nobs, lin_transf=None, alpha=0.05,
                             simult=False):
    """Confidence interval for linear transformation of a multivariate mean

    Either pointwise or simultaneous confidence intervals are returned.
    Data is provided in the form of summary statistics, mean, cov, nobs.

    Parameters
    ----------
    mean : ndarray
        Vector of sample means.
    cov : ndarray
        Covariance matrix of the sample.
    nobs : int
        Number of observations.
    lin_transf : array_like or None
        The linear transformation or contrast matrix for transforming the
        vector of means. If this is None, then the identity matrix is used
        which specifies the means themselves.
    alpha : float in (0, 1)
        Tail probability used for the critical value, commonly used is
        alpha=0.05.
    simult : bool
        If simult is False (default), then pointwise confidence interval is
        returned.
        Otherwise, a simultaneous confidence interval is returned.

    Returns
    -------
    low : ndarray
        lower confidence bound on the linear transformed
    upp : ndarray
        upper confidence bound on the linear transformed
    values : ndarray
        mean or their linear transformation, center of the confidence region

    Notes
    -----
    Pointwise confidence interval is based on Johnson and Wichern
    equation (5-21) page 224.

    Simultaneous confidence interval is based on Johnson and Wichern
    Result 5.3 page 225.
    This looks like Scheffe simultaneous confidence intervals.

    Bonferroni corrected simultaneous confidence interval might be added in
    future

    References
    ----------
    Johnson, Richard A., and Dean W. Wichern. 2007. Applied Multivariate
    Statistical Analysis. 6th ed. Upper Saddle River, N.J: Pearson Prentice
    Hall.

    """
    mean = np.asarray(mean)
    cov = np.asarray(cov)
    k_vars = len(mean)
    # documented default: identity transformation, i.e. the means themselves
    if lin_transf is None:
        c = np.eye(k_vars)
    else:
        c = np.atleast_2d(lin_transf)

    values = c.dot(mean)
    # row-wise quadratic forms c_i' cov c_i, i.e. the diagonal of c cov c'
    quad_form = (c.dot(cov) * c).sum(1)

    if not simult:
        df = nobs - 1
        # two-sided critical value of the t distribution
        t_critval = stats.t.isf(alpha / 2, df)
        ci_diff = np.sqrt(quad_form / df) * t_critval
    else:
        factor = (nobs - 1) * k_vars / (nobs - k_vars) / nobs
        df = (k_vars, nobs - k_vars)
        # numerator and denominator degrees of freedom are passed separately
        f_critval = stats.f.isf(alpha, df[0], df[1])
        ci_diff = np.sqrt(factor * quad_form * f_critval)

    low = values - ci_diff
    upp = values + ci_diff

    return low, upp, values  # , (f_critval, factor, quad_form, df)

"""
Created on Tue Nov  7 13:22:44 2017

Author: Josef Perktold

References
----------
Stata manual for mvtest covariances
Rencher and Christensen 2012
Bartlett 1954

Stata refers to Rencher and Christensen for the formulas. Those correspond
to the formula collection in Bartlett 1954 for several of them.

"""  # pylint: disable=W0105

def test_cov(cov, nobs, cov_null):
    """One sample hypothesis test for covariance equal to null covariance

    The Null hypothesis is that cov = cov_null, against the alternative that
    it is not equal to cov_null

    Parameters
    ----------
    cov : array_like
        Covariance matrix of the data, estimated with denominator (N - 1),
        i.e. ddof=1.
    nobs : int
        number of observations used in the estimation of the covariance
    cov_null : nd_array
        covariance under the null hypothesis

    Returns
    -------
    res : instance of HolderTuple
        results with statistic, pvalue and other attributes like df

    References
    ----------
    Bartlett, M. S. 1954. “A Note on the Multiplying Factors for Various Χ2
    Approximations.” Journal of the Royal Statistical Society. Series B
    (Methodological) 16 (2): 296–98.

    Rencher, Alvin C., and William F. Christensen. 2012. Methods of
    Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and
    Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc.
    https://doi.org/10.1002/9781118391686.

    StataCorp, L. P. Stata Multivariate Statistics: Reference Manual.
    Stata Press Publication.

    """
    # using Stata formulas where cov_sample use nobs in denominator
    # Bartlett 1954 has fewer terms

    # convert first so that ``.shape`` is also available for list input
    cov = np.asarray(cov)
    S = cov * (nobs - 1) / nobs
    S0 = np.asarray(cov_null)
    # number of variables, a scalar and not the shape tuple
    k = cov.shape[0]
    n = nobs

    # multiplying (correction) factor for the chi-square approximation
    fact = nobs - 1.
    fact *= 1 - (2 * k + 1 - 2 / (k + 1)) / (6 * (n - 1) - 1)
    fact2 = _logdet(S0) - _logdet(n / (n - 1) * S)
    fact2 += np.trace(n / (n - 1) * np.linalg.solve(S0, S)) - k
    statistic = fact * fact2
    df = k * (k + 1) / 2
    pvalue = stats.chi2.sf(statistic, df)
    return HolderTuple(statistic=statistic,
                       pvalue=pvalue,
                       df=df,
                       distr="chi2",
                       null="equal value",
                       cov_null=cov_null
                       )


# The name matches pytest's ``test_*`` pattern; opt out of collection so that
# importing this function into a test module does not register it as a test.
test_cov.__test__ = False

def test_cov_spherical(cov, nobs):
    r"""One sample hypothesis test that covariance matrix is spherical

    The Null and alternative hypotheses are

    .. math::

       H0 &: \Sigma = \sigma I \\
       H1 &: \Sigma \neq \sigma I

    where :math:`\sigma` is the common variance with unspecified value.

    Parameters
    ----------
    cov : array_like
        Covariance matrix of the data, estimated with denominator (N - 1),
        i.e. ddof=1.
    nobs : int
        number of observations used in the estimation of the covariance

    Returns
    -------
    res : instance of HolderTuple
        results with statistic, pvalue and other attributes like df

    References
    ----------
    Bartlett, M. S. 1954. “A Note on the Multiplying Factors for Various Χ2
    Approximations.” Journal of the Royal Statistical Society. Series B
    (Methodological) 16 (2): 296–98.

    Rencher, Alvin C., and William F. Christensen. 2012. Methods of
    Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and
    Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc.
    https://doi.org/10.1002/9781118391686.

    StataCorp, L. P. Stata Multivariate Statistics: Reference Manual.
    Stata Press Publication.
    """

    # unchanged Stata formula, but denom is cov cancels, AFAICS
    # Bartlett 1954 correction factor in IIIc
    cov = np.asarray(cov)
    # number of variables, a scalar and not the shape tuple
    k = cov.shape[0]

    # correction factor times the log likelihood ratio type term
    statistic = nobs - 1 - (2 * k**2 + k + 2) / (6 * k)
    statistic *= k * np.log(np.trace(cov)) - _logdet(cov) - k * np.log(k)
    df = k * (k + 1) / 2 - 1
    pvalue = stats.chi2.sf(statistic, df)
    return HolderTuple(statistic=statistic,
                       pvalue=pvalue,
                       df=df,
                       distr="chi2",
                       null="spherical"
                       )


# The name matches pytest's ``test_*`` pattern; opt out of collection so that
# importing this function into a test module does not register it as a test.
test_cov_spherical.__test__ = False

def test_cov_diagonal(cov, nobs):
    r"""One sample hypothesis test that covariance matrix is diagonal matrix.

    The Null and alternative hypotheses are

    .. math::

       H0 &: \Sigma = diag(\sigma_i) \\
       H1 &: \Sigma \neq diag(\sigma_i)

    where :math:`\sigma_i` are the variances with unspecified values.

    Parameters
    ----------
    cov : array_like
        Covariance matrix of the data, estimated with denominator (N - 1),
        i.e. ddof=1.
    nobs : int
        number of observations used in the estimation of the covariance

    Returns
    -------
    res : instance of HolderTuple
        results with statistic, pvalue and other attributes like df

    References
    ----------
    Rencher, Alvin C., and William F. Christensen. 2012. Methods of
    Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and
    Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc.
    https://doi.org/10.1002/9781118391686.

    StataCorp, L. P. Stata Multivariate Statistics: Reference Manual.
    Stata Press Publication.
    """
    cov = np.asarray(cov)
    # number of variables, a scalar and not the shape tuple
    k = cov.shape[0]
    # the statistic is based on the determinant of the correlation matrix
    R = cov2corr(cov)

    statistic = -(nobs - 1 - (2 * k + 5) / 6) * _logdet(R)
    df = k * (k - 1) / 2
    pvalue = stats.chi2.sf(statistic, df)
    return HolderTuple(statistic=statistic,
                       pvalue=pvalue,
                       df=df,
                       distr="chi2",
                       null="diagonal"
                       )


# The name matches pytest's ``test_*`` pattern; opt out of collection so that
# importing this function into a test module does not register it as a test.
test_cov_diagonal.__test__ = False

def _get_blocks(mat, block_len):
"""get diagonal blocks from matrix
"""
k = len(mat)
idx = np.cumsum(block_len)
if idx[-1] == k:
idx = idx[:-1]
elif idx[-1] > k:
raise ValueError("sum of block_len larger than shape of mat")
else:
# allow one missing block that is the remainder
pass
idx_blocks = np.split(np.arange(k), idx)
blocks = []
for ii in idx_blocks:
blocks.append(mat[ii[:, None], ii])
return blocks, idx_blocks

def test_cov_blockdiagonal(cov, nobs, block_len):
    r"""One sample hypothesis test that covariance is block diagonal.

    The Null and alternative hypotheses are

    .. math::

       H0 &: \Sigma = diag(\Sigma_i) \\
       H1 &: \Sigma \neq diag(\Sigma_i)

    where :math:`\Sigma_i` are covariance blocks with unspecified values.

    Parameters
    ----------
    cov : array_like
        Covariance matrix of the data, estimated with denominator (N - 1),
        i.e. ddof=1.
    nobs : int
        number of observations used in the estimation of the covariance
    block_len : list
        list of length of each square block

    Returns
    -------
    res : instance of HolderTuple
        results with statistic, pvalue and other attributes like df

    Raises
    ------
    ValueError
        If the block sizes do not add up to the size of ``cov``.

    References
    ----------
    Rencher, Alvin C., and William F. Christensen. 2012. Methods of
    Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and
    Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc.
    https://doi.org/10.1002/9781118391686.

    StataCorp, L. P. Stata Multivariate Statistics: Reference Manual.
    Stata Press Publication.
    """
    cov = np.asarray(cov)
    # _get_blocks returns (blocks, idx_blocks); only the blocks are needed
    cov_blocks = _get_blocks(cov, block_len)[0]
    # sizes as scalars, not shape tuples
    k = cov.shape[0]
    k_blocks = [c.shape[0] for c in cov_blocks]
    if k != sum(k_blocks):
        msg = "sample covariances and blocks do not have matching shape"
        raise ValueError(msg)
    logdet_blocks = sum(_logdet(c) for c in cov_blocks)
    a2 = k**2 - sum(ki**2 for ki in k_blocks)
    a3 = k**3 - sum(ki**3 for ki in k_blocks)

    # correction factor times the difference in log determinants
    statistic = (nobs - 1 - (2 * a3 + 3 * a2) / (6. * a2))
    statistic *= logdet_blocks - _logdet(cov)

    df = a2 / 2
    pvalue = stats.chi2.sf(statistic, df)
    return HolderTuple(statistic=statistic,
                       pvalue=pvalue,
                       df=df,
                       distr="chi2",
                       null="block-diagonal"
                       )


# The name matches pytest's ``test_*`` pattern; opt out of collection so that
# importing this function into a test module does not register it as a test.
test_cov_blockdiagonal.__test__ = False

def test_cov_oneway(cov_list, nobs_list):
    r"""Multiple sample hypothesis test that covariance matrices are equal.

    This is commonly known as Box-M test.

    The Null and alternative hypotheses are

    .. math::

       H0 &: \Sigma_i = \Sigma_j  \text{ for all i and j} \\
       H1 &: \Sigma_i \neq \Sigma_j \text{ for at least one i and j}

    where :math:`\Sigma_i` is the covariance of sample `i`.

    Parameters
    ----------
    cov_list : list of array_like
        Covariance matrices of the sample, estimated with denominator
        (N - 1), i.e. ddof=1.
    nobs_list : list
        List of the number of observations used in the estimation of the
        covariance for each sample.

    Returns
    -------
    res : instance of HolderTuple
        Results contains test statistic and pvalues for both chisquare and F
        distribution based tests, identified by the name ending "_chi2" and
        "_f".
        Attributes ``statistic``, ``pvalue`` refer to the F-test version.

    Notes
    -----
    approximations to distribution of test statistic is by Box

    References
    ----------
    Rencher, Alvin C., and William F. Christensen. 2012. Methods of
    Multivariate Analysis: Rencher/Methods. Wiley Series in Probability and
    Statistics. Hoboken, NJ, USA: John Wiley & Sons, Inc.
    https://doi.org/10.1002/9781118391686.

    StataCorp, L. P. Stata Multivariate Statistics: Reference Manual.
    Stata Press Publication.
    """
    # Note stata uses nobs in cov, this uses nobs - 1
    cov_list = list(map(np.asarray, cov_list))
    m = len(cov_list)
    nobs = sum(nobs_list)  # total number of observations
    # number of variables, taken from the first covariance matrix
    k = cov_list[0].shape[0]

    cov_pooled = sum((n - 1) * c for (n, c) in zip(nobs_list, cov_list))
    # not in-place: in-place true division fails for integer arrays
    cov_pooled = cov_pooled / (nobs - m)
    stat0 = (nobs - m) * _logdet(cov_pooled)
    stat0 -= sum((n - 1) * _logdet(c) for (n, c) in zip(nobs_list, cov_list))

    # Box's chi2
    c1 = sum(1 / (n - 1) for n in nobs_list) - 1 / (nobs - m)
    c1 *= (2 * k*k + 3 * k - 1) / (6 * (k + 1) * (m - 1))
    df_chi2 = (m - 1) * k * (k + 1) / 2
    statistic_chi2 = (1 - c1) * stat0
    pvalue_chi2 = stats.chi2.sf(statistic_chi2, df_chi2)

    # Box's F, the form of the statistic depends on the sign of c2 - c1**2
    c2 = sum(1 / (n - 1)**2 for n in nobs_list) - 1 / (nobs - m)**2
    c2 *= (k - 1) * (k + 2) / (6 * (m - 1))
    a1 = df_chi2
    a2 = (a1 + 2) / abs(c2 - c1**2)
    b1 = (1 - c1 - a1 / a2) / a1
    b2 = (1 - c1 + 2 / a2) / a2
    if c2 > c1**2:
        statistic_f = b1 * stat0
    else:
        tmp = b2 * stat0
        statistic_f = a2 / a1 * tmp / (1 + tmp)
    df_f = (a1, a2)
    pvalue_f = stats.f.sf(statistic_f, *df_f)
    return HolderTuple(statistic=statistic_f,  # name convention, using F here
                       pvalue=pvalue_f,   # name convention, using F here
                       statistic_base=stat0,
                       statistic_chi2=statistic_chi2,
                       pvalue_chi2=pvalue_chi2,
                       df_chi2=df_chi2,
                       distr_chi2='chi2',
                       statistic_f=statistic_f,
                       pvalue_f=pvalue_f,
                       df_f=df_f,
                       distr_f='F')


# The name matches pytest's ``test_*`` pattern; opt out of collection so that
# importing this function into a test module does not register it as a test.
test_cov_oneway.__test__ = False