Source code for statsmodels.stats.correlation_tools

# -*- coding: utf-8 -*-
"""

Created on Fri Aug 17 13:10:52 2012

Author: Josef Perktold
"""

import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import svds
from scipy.optimize import fminbound

from statsmodels.tools.tools import Bunch
from statsmodels.tools.sm_exceptions import (IterationLimitWarning,
iteration_limit_doc)

def clip_evals(x, value=0): #threshold=0, value=0):
evals, evecs = np.linalg.eigh(x)
clipped = np.any(evals < 0)
x_new = np.dot(evecs * np.maximum(evals, value), evecs.T)
return x_new, clipped

[docs]def corr_nearest(corr, threshold=1e-15, n_fact=100):
'''
Find the nearest correlation matrix that is positive semi-definite.

The function iteratively adjust the correlation matrix by clipping the
eigenvalues of a difference matrix. The diagonal elements are set to one.

Parameters
----------
corr : ndarray, (k, k)
initial correlation matrix
threshold : float
clipping threshold for smallest eigenvalue, see Notes
n_fact : int or float
factor to determine the maximum number of iterations. The maximum
number of iterations is the integer part of the number of columns in
the correlation matrix times n_fact.

Returns
-------
corr_new : ndarray, (optional)
corrected correlation matrix

Notes
-----
The smallest eigenvalue of the corrected correlation matrix is
approximately equal to the threshold.
If the threshold=0, then the smallest eigenvalue of the correlation matrix
might be negative, but zero within a numerical error, for example in the
range of -1e-16.

Assumes input correlation matrix is symmetric.

Stops after the first step if correlation matrix is already positive
semi-definite or positive definite, so that smallest eigenvalue is above
threshold. In this case, the returned array is not the original, but
is equal to it within numerical precision.

--------
corr_clipped
cov_nearest

'''
k_vars = corr.shape[0]
if k_vars != corr.shape[1]:
raise ValueError("matrix is not square")

diff = np.zeros(corr.shape)
x_new = corr.copy()
diag_idx = np.arange(k_vars)

for ii in range(int(len(corr) * n_fact)):
if not clipped:
x_new = x_psd
break
x_new = x_psd.copy()
x_new[diag_idx, diag_idx] = 1
else:
import warnings
warnings.warn(iteration_limit_doc, IterationLimitWarning)

return x_new

[docs]def corr_clipped(corr, threshold=1e-15):
'''
Find a near correlation matrix that is positive semi-definite

This function clips the eigenvalues, replacing eigenvalues smaller than
the threshold by the threshold. The new matrix is normalized, so that the
diagonal elements are one.
Compared to corr_nearest, the distance between the original correlation
matrix and the positive definite correlation matrix is larger, however,
it is much faster since it only computes eigenvalues once.

Parameters
----------
corr : ndarray, (k, k)
initial correlation matrix
threshold : float
clipping threshold for smallest eigenvalue, see Notes

Returns
-------
corr_new : ndarray, (optional)
corrected correlation matrix

Notes
-----
The smallest eigenvalue of the corrected correlation matrix is
approximately equal to the threshold. In examples, the
smallest eigenvalue can be by a factor of 10 smaller than the threshold,
e.g. threshold 1e-8 can result in smallest eigenvalue in the range
between 1e-9 and 1e-8.
If the threshold=0, then the smallest eigenvalue of the correlation matrix
might be negative, but zero within a numerical error, for example in the
range of -1e-16.

Assumes input correlation matrix is symmetric. The diagonal elements of
returned correlation matrix is set to ones.

If the correlation matrix is already positive semi-definite given the
threshold, then the original correlation matrix is returned.

cov_clipped is 40 or more times faster than cov_nearest in simple
example, but has a slightly larger approximation error.

--------
corr_nearest
cov_nearest

'''
x_new, clipped = clip_evals(corr, value=threshold)
if not clipped:
return corr

#cov2corr
x_std = np.sqrt(np.diag(x_new))
x_new = x_new / x_std / x_std[:,None]
return x_new

[docs]def cov_nearest(cov, method='clipped', threshold=1e-15, n_fact=100,
return_all=False):

'''
Find the nearest covariance matrix that is postive (semi-) definite

This leaves the diagonal, i.e. the variance, unchanged

Parameters
----------
cov : ndarray, (k,k)
initial covariance matrix
method : string
if "clipped", then the faster but less accurate corr_clipped is used.
if "nearest", then corr_nearest is used
threshold : float
clipping threshold for smallest eigen value, see Notes
nfact : int or float
factor to determine the maximum number of iterations in
corr_nearest. See its doc string
return_all : bool
if False (default), then only the covariance matrix is returned.
If True, then correlation matrix and standard deviation are

Returns
-------
cov_ : ndarray
corrected covariance matrix
corr_ : ndarray, (optional)
corrected correlation matrix
std_ : ndarray, (optional)
standard deviation

Notes
-----
This converts the covariance matrix to a correlation matrix. Then, finds
the nearest correlation matrix that is positive semidefinite and converts
it back to a covariance matrix using the initial standard deviation.

The smallest eigenvalue of the intermediate correlation matrix is
approximately equal to the threshold.
If the threshold=0, then the smallest eigenvalue of the correlation matrix
might be negative, but zero within a numerical error, for example in the
range of -1e-16.

Assumes input covariance matrix is symmetric.

--------
corr_nearest
corr_clipped

'''

from statsmodels.stats.moment_helpers import cov2corr, corr2cov
cov_, std_ = cov2corr(cov, return_std=True)
if method == 'clipped':
corr_ = corr_clipped(cov_, threshold=threshold)
elif method == 'nearest':
corr_ = corr_nearest(cov_, threshold=threshold, n_fact=n_fact)

cov_ = corr2cov(corr_, std_)

if return_all:
return cov_, corr_, std_
else:
return cov_

def _nmono_linesearch(obj, grad, x, d, obj_hist, M=10, sig1=0.1,
sig2=0.9, gam=1e-4, maxiter=100):
"""
Implements the non-monotone line search of Grippo et al. (1986),
as described in Birgin, Martinez and Raydan (2013).

Parameters
----------
obj : real-valued function
The objective function, to be minimized
The gradient of the objective function
x : array_like
The starting point for the line search
d : array_like
The search direction
obj_hist : array_like
Objective function history (must contain at least one value)
M : positive integer
Number of previous function points to consider (see references
for details).
sig1 : real
Tuning parameter, see references for details.
sig2 : real
Tuning parameter, see references for details.
gam : real
Tuning parameter, see references for details.
maxiter : positive integer
The maximum number of iterations; returns Nones if convergence
does not occur by this point

Returns
-------
alpha : real
The step value
x : Array_like
The function argument at the final step
obval : Real
The function value at the final step
g : Array_like
The gradient at the final step

Notes
-----
The basic idea is to take a big step in the direction of the
gradient, even if the function value is not decreased (but there
is a maximum allowed increase in terms of the recent history of
the iterates).

References
----------
Grippo L, Lampariello F, Lucidi S (1986). A Nonmonotone Line
Search Technique for Newton's Method. SIAM Journal on Numerical
Analysis, 23, 707-716.

E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected
gradient methods: Review and perspectives. Journal of Statistical
Software (preprint).
"""

alpha = 1.
last_obval = obj(x)
obj_max = max(obj_hist[-M:])

for iter in range(maxiter):

obval = obj(x + alpha*d)
gtd = (g * d).sum()

if obval <= obj_max + gam*alpha*gtd:
return alpha, x + alpha*d, obval, g

a1 = -0.5*alpha**2*gtd / (obval - last_obval - alpha*gtd)

if (sig1 <= a1) and (a1 <= sig2*alpha):
alpha = a1
else:
alpha /= 2.

last_obval = obval

return None, None, None, None

def _spg_optim(func, grad, start, project, maxiter=1e4, M=10,
ctol=1e-3, maxiter_nmls=200, lam_min=1e-30,
lam_max=1e30, sig1=0.1, sig2=0.9, gam=1e-4):
"""
Implements the spectral projected gradient method for minimizing a
differentiable function on a convex domain.

Parameters
----------
func : real valued function
The objective function to be minimized.
The gradient of the objective function
start : array_like
The starting point
project : function
In-place projection of the argument to the domain
of func.
... See notes regarding additional arguments

Returns
-------
rslt : Bunch
rslt.params is the final iterate, other fields describe
convergence status.

Notes
-----
This can be an effective heuristic algorithm for problems where no
gauranteed algorithm for computing a global minimizer is known.

There are a number of tuning parameters, but these generally
should not be changed except for maxiter (positive integer) and
ctol (small positive real).  See the Birgin et al reference for

Reference
---------
E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected
gradient methods: Review and perspectives. Journal of Statistical
Software (preprint).  Available at:
http://www.ime.usp.br/~egbirgin/publications/bmr5.pdf
"""

lam = min(10*lam_min, lam_max)

params = start.copy()

obj_hist = [func(params),]

for itr in range(int(maxiter)):

# Check convergence
df = params - gval
project(df)
df -= params
if np.max(np.abs(df)) < ctol:
return Bunch(**{"Converged": True, "params": params,
"objective_values": obj_hist,
"Message": "Converged successfully"})

# The line search direction
d = params - lam*gval
project(d)
d -= params

# Carry out the nonmonotone line search
alpha, params1, fval, gval1 = _nmono_linesearch(func, grad, params, d,
obj_hist, M=M,
sig1=sig1,
sig2=sig2,
gam=gam,
maxiter=maxiter_nmls)
if alpha is None:
return Bunch(**{"Converged": False, "params": params,
"objective_values": obj_hist,
"Message": "Failed in nmono_linesearch"})

obj_hist.append(fval)
s = params1 - params
y = gval1 - gval

sy = (s*y).sum()
if sy <= 0:
lam = lam_max
else:
ss = (s*s).sum()
lam = max(lam_min, min(ss/sy, lam_max))

params = params1
gval = gval1

return Bunch(**{"Converged": False, "params": params,
"objective_values": obj_hist,
"Message": "spg_optim did not converge"})

def _project_correlation_factors(X):
"""
Project a matrix into the domain of matrices whose row-wise sums
of squares are less than or equal to 1.

The input matrix is modified in-place.
"""
nm = np.sqrt((X*X).sum(1))
ii = np.flatnonzero(nm > 1)
if len(ii) > 0:
X[ii,:] /= nm[ii][:, None]

[docs]class FactoredPSDMatrix:
"""
Representation of a positive semidefinite matrix in factored form.

The representation is constructed based on a vector diag and
rectangular matrix root, such that the PSD matrix represented by
the class instance is Diag + root * root', where Diag is the
square diagonal matrix with diag on its main diagonal.

Parameters
----------
diag : 1d array-like
See above
root : 2d array-like
See above

Notes
-----
The matrix is represented internally in the form Diag^{1/2}(I +
factor * scales * factor')Diag^{1/2}, where Diag and scales
are diagonal matrices, and factor is an orthogonal matrix.
"""

def __init__(self, diag, root):
self.diag = diag
self.root = root
root = root / np.sqrt(diag)[:, None]
u, s, vt = np.linalg.svd(root, 0)
self.factor = u
self.scales = s**2

[docs]    def to_matrix(self):
"""
Returns the PSD matrix represented by this instance as a full
(square) matrix.
"""
return np.diag(self.diag) + np.dot(self.root, self.root.T)

[docs]    def decorrelate(self, rhs):
"""
Decorrelate the columns of rhs.

Parameters
----------
rhs : array-like
A 2 dimensional array with the same number of rows as the
PSD matrix represented by the class instance.

Returns
-------
C^{-1/2} * rhs, where C is the covariance matrix represented
by this class instance.

Notes
-----
The returned matrix has the identity matrix as its row-wise
population covariance matrix.

This function exploits the factor structure for efficiency.
"""

# I + factor * qval * factor' is the inverse square root of
# the covariance matrix in the homogeneous case where diag =
# 1.
qval = -1 + 1 / np.sqrt(1 + self.scales)

# Decorrelate in the general case.
rhs = rhs / np.sqrt(self.diag)[:, None]
rhs1 = np.dot(self.factor.T, rhs)
rhs1 *= qval[:, None]
rhs1 = np.dot(self.factor, rhs1)
rhs += rhs1

return rhs

[docs]    def solve(self, rhs):
"""
Solve a linear system of equations with factor-structured
coefficients.

Parameters
----------
rhs : array-like
A 2 dimensional array with the same number of rows as the
PSD matrix represented by the class instance.

Returns
-------
C^{-1} * rhs, where C is the covariance matrix represented
by this class instance.

Notes
-----
This function exploits the factor structure for efficiency.
"""

qval = -self.scales / (1 + self.scales)
dr = np.sqrt(self.diag)
rhs = rhs / dr[:, None]
mat = qval[:, None] * np.dot(self.factor.T, rhs)
rhs = rhs + np.dot(self.factor, mat)
return rhs / dr[:, None]

[docs]    def logdet(self):
"""
Returns the logarithm of the determinant of a
factor-structured matrix.
"""

logdet = np.sum(np.log(self.diag))
logdet += np.sum(np.log(self.scales))
logdet += np.sum(np.log(1 + 1 / self.scales))

return logdet

[docs]def corr_nearest_factor(corr, rank, ctol=1e-6, lam_min=1e-30,
lam_max=1e30, maxiter=1000):
"""
Find the nearest correlation matrix with factor structure to a
given square matrix.

Parameters
----------
corr : square array
The target matrix (to which the nearest correlation matrix is
sought).  Must be square, but need not be positive
semidefinite.
rank : positive integer
The rank of the factor structure of the solution, i.e., the
number of linearly independent columns of X.
ctol : positive real
Convergence criterion.
lam_min : float
Tuning parameter for spectral projected gradient optimization
(smallest allowed step in the search direction).
lam_max : float
Tuning parameter for spectral projected gradient optimization
(largest allowed step in the search direction).
maxiter : integer
Maximum number of iterations in spectral projected gradient
optimization.

Returns
-------
rslt : Bunch
rslt.corr is a FactoredPSDMatrix defining the estimated
correlation structure.  Other fields of rslt contain
returned values from spg_optim.

Notes
-----
A correlation matrix has factor structure if it can be written in
the form I + XX' - diag(XX'), where X is n x k with linearly
independent columns, and with each row having sum of squares at
most equal to 1.  The approximation is made in terms of the
Frobenius norm.

This routine is useful when one has an approximate correlation
matrix that is not positive semidefinite, and there is need to
estimate the inverse, square root, or inverse square root of the
population correlation matrix.  The factor structure allows these
tasks to be done without constructing any n x n matrices.

This is a non-convex problem with no known gauranteed globally
convergent algorithm for computing the solution.  Borsdof, Higham
and Raydan (2010) compared several methods for this problem and
found the spectral projected gradient (SPG) method (used here) to
perform best.

The input matrix corr can be a dense numpy array or any scipy
sparse matrix.  The latter is useful if the input matrix is
obtained by thresholding a very large sample correlation matrix.
If corr is sparse, the calculations are optimized to save
memory, so no working matrix with more than 10^6 elements is
constructed.

References
----------
R Borsdof, N Higham, M Raydan (2010).  Computing a nearest
correlation matrix with factor structure. SIAM J Matrix Anal
Appl, 31:5, 2603-2622.
http://eprints.ma.man.ac.uk/1523/01/covered/MIMS_ep2009_87.pdf

Examples
--------
Hard thresholding a correlation matrix may result in a matrix that
is not positive semidefinite.  We can approximate a hard
thresholded correlation matrix with a PSD matrix as follows, where
corr is the input correlation matrix.

>>> import numpy as np
>>> from statsmodels.stats.correlation_tools import corr_nearest_factor
>>> np.random.seed(1234)
>>> b = 1.5 - np.random.rand(10, 1)
>>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10)
>>> corr = np.corrcoef(x.T)
>>> corr = corr * (np.abs(corr) >= 0.3)
>>> rslt = corr_nearest_factor(corr, 3)
"""

p, _ = corr.shape

# Starting values (following the PCA method in BHR).
u,s,vt = svds(corr, rank)
X = u * np.sqrt(s)
nm = np.sqrt((X**2).sum(1))
ii = np.flatnonzero(nm > 1e-5)
X[ii,:] /= nm[ii][:, None]

# Zero the diagonal
corr1 = corr.copy()
if type(corr1) == np.ndarray:
np.fill_diagonal(corr1, 0)
elif sparse.issparse(corr1):
corr1.setdiag(np.zeros(corr1.shape[0]))
corr1.eliminate_zeros()
corr1.sort_indices()
else:
raise ValueError("Matrix type not supported")

# The gradient, from lemma 4.1 of BHR.
gr = np.dot(X, np.dot(X.T, X))
if type(corr1) == np.ndarray:
gr -= np.dot(corr1, X)
else:
gr -= corr1.dot(X)
gr -= (X*X).sum(1)[:, None] * X
return 4*gr

# The objective function (sum of squared deviations between fitted
# and observed arrays).
def func(X):
if type(corr1) == np.ndarray:
M = np.dot(X, X.T)
np.fill_diagonal(M, 0)
M -= corr1
fval = (M*M).sum()
return fval
else:
fval = 0.
# Control the size of intermediates
max_ws = 1e6
bs = int(max_ws / X.shape[0])
ir = 0
while ir < X.shape[0]:
ir2 = min(ir+bs, X.shape[0])
u = np.dot(X[ir:ir2, :], X.T)
ii = np.arange(u.shape[0])
u[ii, ir+ii] = 0
u -= np.asarray(corr1[ir:ir2, :].todense())
fval += (u*u).sum()
ir += bs
return fval

rslt = _spg_optim(func, grad, X, _project_correlation_factors)
root = rslt.params
diag = 1 - (root**2).sum(1)
soln = FactoredPSDMatrix(diag, root)
rslt.corr = soln
del rslt.params
return rslt

[docs]def cov_nearest_factor_homog(cov, rank):
"""
Approximate an arbitrary square matrix with a factor-structured
matrix of the form k*I + XX'.

Parameters
----------
cov : array-like
The input array, must be square but need not be positive
semidefinite
rank : positive integer
The rank of the fitted factor structure

Returns
-------
A FactoredPSDMatrix instance containing the fitted matrix

Notes
-----
This routine is useful if one has an estimated covariance matrix
that is not SPD, and the ultimate goal is to estimate the inverse,
square root, or inverse square root of the true covariance
matrix. The factor structure allows these tasks to be performed
without constructing any n x n matrices.

The calculations use the fact that if k is known, then X can be
determined from the eigen-decomposition of cov - k*I, which can
in turn be easily obtained form the eigen-decomposition of cov.
Thus the problem can be reduced to a 1-dimensional search for k
that does not require repeated eigen-decompositions.

If the input matrix is sparse, then cov - k*I is also sparse, so
the eigen-decomposition can be done effciciently using sparse
routines.

The one-dimensional search for the optimal value of k is not
convex, so a local minimum could be obtained.

Examples
--------
Hard thresholding a covariance matrix may result in a matrix that
is not positive semidefinite.  We can approximate a hard
thresholded covariance matrix with a PSD matrix as follows:

>>> import numpy as np
>>> np.random.seed(1234)
>>> b = 1.5 - np.random.rand(10, 1)
>>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10)
>>> cov = np.cov(x)
>>> cov = cov * (np.abs(cov) >= 0.3)
>>> rslt = cov_nearest_factor_homog(cov, 3)
"""

m, n = cov.shape

Q, Lambda, _ = svds(cov, rank)

if sparse.issparse(cov):
QSQ = np.dot(Q.T, cov.dot(Q))
ts = cov.diagonal().sum()
tss = cov.dot(cov).diagonal().sum()
else:
QSQ = np.dot(Q.T, np.dot(cov, Q))
ts = np.trace(cov)
tss = np.trace(np.dot(cov, cov))

def fun(k):
Lambda_t = Lambda - k
v = tss + m*(k**2) + np.sum(Lambda_t**2) - 2*k*ts
v += 2*k*np.sum(Lambda_t) - 2*np.sum(np.diag(QSQ) * Lambda_t)
return v

# Get the optimal decomposition
k_opt = fminbound(fun, 0, 1e5)
Lambda_opt = Lambda - k_opt
fac_opt = Q * np.sqrt(Lambda_opt)

diag = k_opt * np.ones(m, dtype=np.float64) #- (fac_opt**2).sum(1)
return FactoredPSDMatrix(diag, fac_opt)

[docs]def corr_thresholded(data, minabs=None, max_elt=1e7):
"""
Construct a sparse matrix containing the thresholded row-wise
correlation matrix from a data array.

Parameters
----------
data : array_like
The data from which the row-wise thresholded correlation
matrix is to be computed.
minabs : non-negative real
The threshold value; correlation coefficients smaller in
magnitude than minabs are set to zero.  If None, defaults

Returns
-------
cormat : sparse.coo_matrix
The thresholded correlation matrix, in COO format.

Notes
-----
This is an alternative to C = np.corrcoef(data); C \*= (np.abs(C)
>= absmin), suitable for very tall data matrices.

If the data are jointly Gaussian, the marginal sampling
distributions of the elements of the sample correlation matrix are
approximately Gaussian with standard deviation 1 / sqrt(n).  The
default value of minabs is thus equal to 1 standard error, which
will set to zero approximately 68% of the estimated correlation
coefficients for which the population value is zero.

No intermediate matrix with more than max_elt values will be
constructed.  However memory use could still be high if a large
number of correlation values exceed minabs in magnitude.

The thresholded matrix is returned in COO format, which can easily
be converted to other sparse formats.

Examples
--------
Here X is a tall data matrix (e.g. with 100,000 rows and 50
columns).  The row-wise correlation matrix of X is calculated
and stored in sparse form, with all entries smaller than 0.3
treated as 0.

>>> import numpy as np
>>> np.random.seed(1234)
>>> b = 1.5 - np.random.rand(10, 1)
>>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10)
>>> cmat = corr_thresholded(x, 0.3)
"""

nrow, ncol = data.shape

if minabs is None:
minabs = 1. / float(ncol)

# Row-standardize the data
data = data.copy()
data -= data.mean(1)[:, None]
sd = data.std(1, ddof=1)
ii = np.flatnonzero(sd > 1e-5)
data[ii, :] /= sd[ii][:, None]
ii = np.flatnonzero(sd <= 1e-5)
data[ii, :] = 0

# Number of rows to process in one pass
bs = int(np.floor(max_elt / nrow))

ipos_all, jpos_all, cor_values = [], [], []

ir = 0
while ir < nrow:
ir2 = min(data.shape[0], ir + bs)
cm = np.dot(data[ir:ir2,:], data.T) / (ncol - 1)
cma = np.abs(cm)
ipos, jpos = np.nonzero(cma >= minabs)
ipos_all.append(ipos + ir)
jpos_all.append(jpos)
cor_values.append(cm[ipos, jpos])
ir += bs

ipos = np.concatenate(ipos_all)
jpos = np.concatenate(jpos_all)
cor_values = np.concatenate(cor_values)

cmat = sparse.coo_matrix((cor_values, (ipos, jpos)), (nrow, nrow))

return cmat

if __name__ == '__main__':
pass