# Source code for statsmodels.stats.correlation_tools

```
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 17 13:10:52 2012
Author: Josef Perktold
License: BSD-3
"""
from statsmodels.tools.sm_exceptions import (IterationLimitWarning,
iteration_limit_doc)
import numpy as np
import scipy.sparse as sparse
from scipy.sparse.linalg import svds
from scipy.optimize import fminbound
def clip_evals(x, value=0): #threshold=0, value=0):
evals, evecs = np.linalg.eigh(x)
clipped = np.any(evals < 0)
x_new = np.dot(evecs * np.maximum(evals, value), evecs.T)
return x_new, clipped
[docs]def corr_nearest(corr, threshold=1e-15, n_fact=100):
'''
Find the nearest correlation matrix that is positive semi-definite.
The function iteratively adjust the correlation matrix by clipping the
eigenvalues of a difference matrix. The diagonal elements are set to one.
Parameters
----------
corr : ndarray, (k, k)
initial correlation matrix
threshold : float
clipping threshold for smallest eigenvalue, see Notes
n_fact : int or float
factor to determine the maximum number of iterations. The maximum
number of iterations is the integer part of the number of columns in
the correlation matrix times n_fact.
Returns
-------
corr_new : ndarray, (optional)
corrected correlation matrix
Notes
-----
The smallest eigenvalue of the corrected correlation matrix is
approximately equal to the ``threshold``.
If the threshold=0, then the smallest eigenvalue of the correlation matrix
might be negative, but zero within a numerical error, for example in the
range of -1e-16.
Assumes input correlation matrix is symmetric.
Stops after the first step if correlation matrix is already positive
semi-definite or positive definite, so that smallest eigenvalue is above
threshold. In this case, the returned array is not the original, but
is equal to it within numerical precision.
See Also
--------
corr_clipped
cov_nearest
'''
k_vars = corr.shape[0]
if k_vars != corr.shape[1]:
raise ValueError("matrix is not square")
diff = np.zeros(corr.shape)
x_new = corr.copy()
diag_idx = np.arange(k_vars)
for ii in range(int(len(corr) * n_fact)):
x_adj = x_new - diff
x_psd, clipped = clip_evals(x_adj, value=threshold)
if not clipped:
x_new = x_psd
break
diff = x_psd - x_adj
x_new = x_psd.copy()
x_new[diag_idx, diag_idx] = 1
else:
import warnings
warnings.warn(iteration_limit_doc, IterationLimitWarning)
return x_new
[docs]def corr_clipped(corr, threshold=1e-15):
'''
Find a near correlation matrix that is positive semi-definite
This function clips the eigenvalues, replacing eigenvalues smaller than
the threshold by the threshold. The new matrix is normalized, so that the
diagonal elements are one.
Compared to corr_nearest, the distance between the original correlation
matrix and the positive definite correlation matrix is larger, however,
it is much faster since it only computes eigenvalues once.
Parameters
----------
corr : ndarray, (k, k)
initial correlation matrix
threshold : float
clipping threshold for smallest eigenvalue, see Notes
Returns
-------
corr_new : ndarray, (optional)
corrected correlation matrix
Notes
-----
The smallest eigenvalue of the corrected correlation matrix is
approximately equal to the ``threshold``. In examples, the
smallest eigenvalue can be by a factor of 10 smaller than the threshold,
e.g. threshold 1e-8 can result in smallest eigenvalue in the range
between 1e-9 and 1e-8.
If the threshold=0, then the smallest eigenvalue of the correlation matrix
might be negative, but zero within a numerical error, for example in the
range of -1e-16.
Assumes input correlation matrix is symmetric. The diagonal elements of
returned correlation matrix is set to ones.
If the correlation matrix is already positive semi-definite given the
threshold, then the original correlation matrix is returned.
``cov_clipped`` is 40 or more times faster than ``cov_nearest`` in simple
example, but has a slightly larger approximation error.
See Also
--------
corr_nearest
cov_nearest
'''
x_new, clipped = clip_evals(corr, value=threshold)
if not clipped:
return corr
#cov2corr
x_std = np.sqrt(np.diag(x_new))
x_new = x_new / x_std / x_std[:,None]
return x_new
[docs]def cov_nearest(cov, method='clipped', threshold=1e-15, n_fact=100,
return_all=False):
'''
Find the nearest covariance matrix that is postive (semi-) definite
This leaves the diagonal, i.e. the variance, unchanged
Parameters
----------
cov : ndarray, (k,k)
initial covariance matrix
method : string
if "clipped", then the faster but less accurate ``corr_clipped`` is used.
if "nearest", then ``corr_nearest`` is used
threshold : float
clipping threshold for smallest eigen value, see Notes
nfact : int or float
factor to determine the maximum number of iterations in
``corr_nearest``. See its doc string
return_all : bool
if False (default), then only the covariance matrix is returned.
If True, then correlation matrix and standard deviation are
additionally returned.
Returns
-------
cov_ : ndarray
corrected covariance matrix
corr_ : ndarray, (optional)
corrected correlation matrix
std_ : ndarray, (optional)
standard deviation
Notes
-----
This converts the covariance matrix to a correlation matrix. Then, finds
the nearest correlation matrix that is positive semidefinite and converts
it back to a covariance matrix using the initial standard deviation.
The smallest eigenvalue of the intermediate correlation matrix is
approximately equal to the ``threshold``.
If the threshold=0, then the smallest eigenvalue of the correlation matrix
might be negative, but zero within a numerical error, for example in the
range of -1e-16.
Assumes input covariance matrix is symmetric.
See Also
--------
corr_nearest
corr_clipped
'''
from statsmodels.stats.moment_helpers import cov2corr, corr2cov
cov_, std_ = cov2corr(cov, return_std=True)
if method == 'clipped':
corr_ = corr_clipped(cov_, threshold=threshold)
elif method == 'nearest':
corr_ = corr_nearest(cov_, threshold=threshold, n_fact=n_fact)
cov_ = corr2cov(corr_, std_)
if return_all:
return cov_, corr_, std_
else:
return cov_
def _nmono_linesearch(obj, grad, x, d, obj_hist, M=10, sig1=0.1,
sig2=0.9, gam=1e-4, maxiter=100):
"""
Implements the non-monotone line search of Grippo et al. (1986),
as described in Birgin, Martinez and Raydan (2013).
Parameters
----------
obj : real-valued function
The objective function, to be minimized
grad : vector-valued function
The gradient of the objective function
x : array_like
The starting point for the line search
d : array_like
The search direction
obj_hist : array_like
Objective function history (must contain at least one value)
M : positive integer
Number of previous function points to consider (see references
for details).
sig1 : real
Tuning parameter, see references for details.
sig2 : real
Tuning parameter, see references for details.
gam : real
Tuning parameter, see references for details.
maxiter : positive integer
The maximum number of iterations; returns Nones if convergence
does not occur by this point
Returns
-------
alpha : real
The step value
x : Array_like
The function argument at the final step
obval : Real
The function value at the final step
g : Array_like
The gradient at the final step
Notes
-----
The basic idea is to take a big step in the direction of the
gradient, even if the function value is not decreased (but there
is a maximum allowed increase in terms of the recent history of
the iterates).
References
----------
Grippo L, Lampariello F, Lucidi S (1986). A Nonmonotone Line
Search Technique for Newton's Method. SIAM Journal on Numerical
Analysis, 23, 707-716.
E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected
gradient methods: Review and perspectives. Journal of Statistical
Software (preprint).
"""
alpha = 1.
last_obval = obj(x)
obj_max = max(obj_hist[-M:])
for iter in range(maxiter):
obval = obj(x + alpha*d)
g = grad(x)
gtd = (g * d).sum()
if obval <= obj_max + gam*alpha*gtd:
return alpha, x + alpha*d, obval, g
a1 = -0.5*alpha**2*gtd / (obval - last_obval - alpha*gtd)
if (sig1 <= a1) and (a1 <= sig2*alpha):
alpha = a1
else:
alpha /= 2.
last_obval = obval
return None, None, None, None
def _spg_optim(func, grad, start, project, maxiter=1e4, M=10,
ctol=1e-3, maxiter_nmls=200, lam_min=1e-30,
lam_max=1e30, sig1=0.1, sig2=0.9, gam=1e-4):
"""
Implements the spectral projected gradient method for minimizing a
differentiable function on a convex domain.
Parameters
----------
func : real valued function
The objective function to be minimized.
grad : real array-valued function
The gradient of the objective function
start : array_like
The starting point
project : function
In-place projection of the argument to the domain
of func.
... See notes regarding additional arguments
Returns
-------
rslt : Bunch
rslt.params is the final iterate, other fields describe
convergence status.
Notes
-----
This can be an effective heuristic algorithm for problems where no
gauranteed algorithm for computing a global minimizer is known.
There are a number of tuning parameters, but these generally
should not be changed except for `maxiter` (positive integer) and
`ctol` (small positive real). See the Birgin et al reference for
more information about the tuning parameters.
Reference
---------
E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected
gradient methods: Review and perspectives. Journal of Statistical
Software (preprint). Available at:
http://www.ime.usp.br/~egbirgin/publications/bmr5.pdf
"""
lam = min(10*lam_min, lam_max)
params = start.copy()
gval = grad(params)
obj_hist = [func(params),]
for itr in range(int(maxiter)):
# Check convergence
df = params - gval
project(df)
df -= params
if np.max(np.abs(df)) < ctol:
return Bunch(**{"Converged": True, "params": params,
"objective_values": obj_hist,
"Message": "Converged successfully"})
# The line search direction
d = params - lam*gval
project(d)
d -= params
# Carry out the nonmonotone line search
alpha, params1, fval, gval1 = _nmono_linesearch(func, grad, params, d,
obj_hist, M=M,
sig1=sig1,
sig2=sig2,
gam=gam,
maxiter=maxiter_nmls)
if alpha is None:
return Bunch(**{"Converged": False, "params": params,
"objective_values": obj_hist,
"Message": "Failed in nmono_linesearch"})
obj_hist.append(fval)
s = params1 - params
y = gval1 - gval
sy = (s*y).sum()
if sy <= 0:
lam = lam_max
else:
ss = (s*s).sum()
lam = max(lam_min, min(ss/sy, lam_max))
params = params1
gval = gval1
return Bunch(**{"Converged": False, "params": params,
"objective_values": obj_hist,
"Message": "spg_optim did not converge"})
def _project_correlation_factors(X):
"""
Project a matrix into the domain of matrices whose row-wise sums
of squares are less than or equal to 1.
The input matrix is modified in-place.
"""
nm = np.sqrt((X*X).sum(1))
ii = np.flatnonzero(nm > 1)
if len(ii) > 0:
X[ii,:] /= nm[ii][:, None]
#TODO does this belong in a tools module somewhere?
class Bunch(object):
def __init__(self, **kwargs):
self.__dict__.update(kwargs)
class FactoredPSDMatrix:
"""
Representation of a positive semidefinite matrix in factored form.
The representation is constructed based on a vector `diag` and
rectangular matrix `root`, such that the PSD matrix represented by
the class instance is Diag + root * root', where Diag is the
square diagonal matrix with `diag` on its main diagonal.
Parameters
----------
diag : 1d array-like
See above
root : 2d array-like
See above
Notes
-----
The matrix is represented internally in the form Diag^{1/2}(I +
factor * scales * factor')Diag^{1/2}, where `Diag` and `scales`
are diagonal matrices, and `factor` is an orthogonal matrix.
"""
def __init__(self, diag, root):
self.diag = diag
self.root = root
root = root / np.sqrt(diag)[:, None]
u, s, vt = np.linalg.svd(root, 0)
self.factor = u
self.scales = s**2
def to_matrix(self):
"""
Returns the PSD matrix represented by this instance as a full
(square) matrix.
"""
return np.diag(self.diag) + np.dot(self.root, self.root.T)
def decorrelate(self, rhs):
"""
Decorrelate the columns of `rhs`.
Parameters
----------
rhs : array-like
A 2 dimensional array with the same number of rows as the
PSD matrix represented by the class instance.
Returns
-------
C^{-1/2} * rhs, where C is the covariance matrix represented
by this class instance.
Notes
-----
The returned matrix has the identity matrix as its row-wise
population covariance matrix.
This function exploits the factor structure for efficiency.
"""
# I + factor * qval * factor' is the inverse square root of
# the covariance matrix in the homogeneous case where diag =
# 1.
qval = -1 + 1 / np.sqrt(1 + self.scales)
# Decorrelate in the general case.
rhs = rhs / np.sqrt(self.diag)[:, None]
rhs1 = np.dot(self.factor.T, rhs)
rhs1 *= qval[:, None]
rhs1 = np.dot(self.factor, rhs1)
rhs += rhs1
return rhs
def solve(self, rhs):
"""
Solve a linear system of equations with factor-structured
coefficients.
Parameters
----------
rhs : array-like
A 2 dimensional array with the same number of rows as the
PSD matrix represented by the class instance.
Returns
-------
C^{-1} * rhs, where C is the covariance matrix represented
by this class instance.
Notes
-----
This function exploits the factor structure for efficiency.
"""
qval = -self.scales / (1 + self.scales)
dr = np.sqrt(self.diag)
rhs = rhs / dr[:, None]
mat = qval[:, None] * np.dot(self.factor.T, rhs)
rhs = rhs + np.dot(self.factor, mat)
return rhs / dr[:, None]
def logdet(self):
"""
Returns the logarithm of the determinant of a
factor-structured matrix.
"""
logdet = np.sum(np.log(self.diag))
logdet += np.sum(np.log(self.scales))
logdet += np.sum(np.log(1 + 1 / self.scales))
return logdet
def corr_nearest_factor(corr, rank, ctol=1e-6, lam_min=1e-30,
lam_max=1e30, maxiter=1000):
"""
Find the nearest correlation matrix with factor structure to a
given square matrix.
Parameters
----------
corr : square array
The target matrix (to which the nearest correlation matrix is
sought). Must be square, but need not be positive
semidefinite.
rank : positive integer
The rank of the factor structure of the solution, i.e., the
number of linearly independent columns of X.
ctol : positive real
Convergence criterion.
lam_min : float
Tuning parameter for spectral projected gradient optimization
(smallest allowed step in the search direction).
lam_max : float
Tuning parameter for spectral projected gradient optimization
(largest allowed step in the search direction).
maxiter : integer
Maximum number of iterations in spectral projected gradient
optimization.
Returns
-------
rslt : Bunch
rslt.corr is a FactoredPSDMatrix defining the estimated
correlation structure. Other fields of `rslt` contain
returned values from spg_optim.
Notes
-----
A correlation matrix has factor structure if it can be written in
the form I + XX' - diag(XX'), where X is n x k with linearly
independent columns, and with each row having sum of squares at
most equal to 1. The approximation is made in terms of the
Frobenius norm.
This routine is useful when one has an approximate correlation
matrix that is not positive semidefinite, and there is need to
estimate the inverse, square root, or inverse square root of the
population correlation matrix. The factor structure allows these
tasks to be done without constructing any n x n matrices.
This is a non-convex problem with no known gauranteed globally
convergent algorithm for computing the solution. Borsdof, Higham
and Raydan (2010) compared several methods for this problem and
found the spectral projected gradient (SPG) method (used here) to
perform best.
The input matrix `corr` can be a dense numpy array or any scipy
sparse matrix. The latter is useful if the input matrix is
obtained by thresholding a very large sample correlation matrix.
If `corr` is sparse, the calculations are optimized to save
memory, so no working matrix with more than 10^6 elements is
constructed.
References
----------
R Borsdof, N Higham, M Raydan (2010). Computing a nearest
correlation matrix with factor structure. SIAM J Matrix Anal
Appl, 31:5, 2603-2622.
http://eprints.ma.man.ac.uk/1523/01/covered/MIMS_ep2009_87.pdf
Examples
--------
Hard thresholding a correlation matrix may result in a matrix that
is not positive semidefinite. We can approximate a hard
thresholded correlation matrix with a PSD matrix as follows, where
`corr` is the input correlation matrix.
>>> corr = corr * (np.abs(corr) >= 0.3)
>>> rslt = corr_nearest_factor(corr, 3)
"""
p, _ = corr.shape
# Starting values (following the PCA method in BHR).
u,s,vt = svds(corr, rank)
X = u * np.sqrt(s)
nm = np.sqrt((X**2).sum(1))
ii = np.flatnonzero(nm > 1e-5)
X[ii,:] /= nm[ii][:, None]
# Zero the diagonal
corr1 = corr.copy()
if type(corr1) == np.ndarray:
np.fill_diagonal(corr1, 0)
elif sparse.issparse(corr1):
corr1.setdiag(np.zeros(corr1.shape[0]))
corr1.eliminate_zeros()
corr1.sort_indices()
else:
raise ValueError("Matrix type not supported")
# The gradient, from lemma 4.1 of BHR.
def grad(X):
gr = np.dot(X, np.dot(X.T, X))
if type(corr1) == np.ndarray:
gr -= np.dot(corr1, X)
else:
gr -= corr1.dot(X)
gr -= (X*X).sum(1)[:, None] * X
return 4*gr
# The objective function (sum of squared deviations between fitted
# and observed arrays).
def func(X):
if type(corr1) == np.ndarray:
M = np.dot(X, X.T)
np.fill_diagonal(M, 0)
M -= corr1
fval = (M*M).sum()
return fval
else:
fval = 0.
# Control the size of intermediates
max_ws = 1e6
bs = int(max_ws / X.shape[0])
ir = 0
while ir < X.shape[0]:
ir2 = min(ir+bs, X.shape[0])
u = np.dot(X[ir:ir2, :], X.T)
ii = np.arange(u.shape[0])
u[ii, ir+ii] = 0
u -= np.asarray(corr1[ir:ir2, :].todense())
fval += (u*u).sum()
ir += bs
return fval
rslt = _spg_optim(func, grad, X, _project_correlation_factors)
root = rslt.params
diag = 1 - (root**2).sum(1)
soln = FactoredPSDMatrix(diag, root)
rslt.corr = soln
del rslt.params
return rslt
def cov_nearest_factor_homog(cov, rank):
"""
Approximate an arbitrary square matrix with a factor-structured
matrix of the form k*I + XX'.
Parameters
----------
cov : array-like
The input array, must be square but need not be positive
semidefinite
rank : positive integer
The rank of the fitted factor structure
Returns
-------
A FactoredPSDMatrix instance containing the fitted matrix
Notes
-----
This routine is useful if one has an estimated covariance matrix
that is not SPD, and the ultimate goal is to estimate the inverse,
square root, or inverse square root of the true covariance
matrix. The factor structure allows these tasks to be performed
without constructing any n x n matrices.
The calculations use the fact that if k is known, then X can be
determined from the eigen-decomposition of cov - k*I, which can
in turn be easily obtained form the eigen-decomposition of `cov`.
Thus the problem can be reduced to a 1-dimensional search for k
that does not require repeated eigen-decompositions.
If the input matrix is sparse, then cov - k*I is also sparse, so
the eigen-decomposition can be done effciciently using sparse
routines.
The one-dimensional search for the optimal value of k is not
convex, so a local minimum could be obtained.
Examples
--------
Hard thresholding a covariance matrix may result in a matrix that
is not positive semidefinite. We can approximate a hard
thresholded covariance matrix with a PSD matrix as follows:
>>> cov = cov * (np.abs(cov) >= 0.3)
>>> rslt = cov_nearest_factor_homog(cov, 3)
"""
m, n = cov.shape
Q, Lambda, _ = svds(cov, rank)
if sparse.issparse(cov):
QSQ = np.dot(Q.T, cov.dot(Q))
ts = cov.diagonal().sum()
tss = cov.dot(cov).diagonal().sum()
else:
QSQ = np.dot(Q.T, np.dot(cov, Q))
ts = np.trace(cov)
tss = np.trace(np.dot(cov, cov))
def fun(k):
Lambda_t = Lambda - k
v = tss + m*(k**2) + np.sum(Lambda_t**2) - 2*k*ts
v += 2*k*np.sum(Lambda_t) - 2*np.sum(np.diag(QSQ) * Lambda_t)
return v
# Get the optimal decomposition
k_opt = fminbound(fun, 0, 1e5)
Lambda_opt = Lambda - k_opt
fac_opt = Q * np.sqrt(Lambda_opt)
diag = k_opt * np.ones(m, dtype=np.float64) #- (fac_opt**2).sum(1)
return FactoredPSDMatrix(diag, fac_opt)
def corr_thresholded(data, minabs=None, max_elt=1e7):
"""
Construct a sparse matrix containing the thresholded row-wise
correlation matrix from a data array.
Parameters
----------
data : array_like
The data from which the row-wise thresholded correlation
matrix is to be computed.
minabs : non-negative real
The threshold value; correlation coefficients smaller in
magnitude than minabs are set to zero. If None, defaults
to 1 / sqrt(n), see Notes for more information.
Returns
-------
cormat : sparse.coo_matrix
The thresholded correlation matrix, in COO format.
Notes
-----
This is an alternative to C = np.corrcoef(data); C *= (np.abs(C)
>= absmin), suitable for very tall data matrices.
If the data are jointly Gaussian, the marginal sampling
distributions of the elements of the sample correlation matrix are
approximately Gaussian with standard deviation 1 / sqrt(n). The
default value of `minabs` is thus equal to 1 standard error, which
will set to zero approximately 68% of the estimated correlation
coefficients for which the population value is zero.
No intermediate matrix with more than `max_elt` values will be
constructed. However memory use could still be high if a large
number of correlation values exceed `minabs` in magnitude.
The thresholded matrix is returned in COO format, which can easily
be converted to other sparse formats.
Examples
--------
Here X is a tall data matrix (e.g. with 100,000 rows and 50
columns). The row-wise correlation matrix of X is calculated
and stored in sparse form, with all entries smaller than 0.3
treated as 0.
>>> cmat = corr_thresholded(X, 0.3)
"""
nrow, ncol = data.shape
if minabs is None:
minabs = 1. / float(ncol)
# Row-standardize the data
data = data.copy()
data -= data.mean(1)[:, None]
sd = data.std(1, ddof=1)
ii = np.flatnonzero(sd > 1e-5)
data[ii, :] /= sd[ii][:, None]
ii = np.flatnonzero(sd <= 1e-5)
data[ii, :] = 0
# Number of rows to process in one pass
bs = int(np.floor(max_elt / nrow))
ipos_all, jpos_all, cor_values = [], [], []
ir = 0
while ir < nrow:
ir2 = min(data.shape[0], ir + bs)
cm = np.dot(data[ir:ir2,:], data.T) / (ncol - 1)
cma = np.abs(cm)
ipos, jpos = np.nonzero(cma >= minabs)
ipos_all.append(ipos + ir)
jpos_all.append(jpos)
cor_values.append(cm[ipos, jpos])
ir += bs
ipos = np.concatenate(ipos_all)
jpos = np.concatenate(jpos_all)
cor_values = np.concatenate(cor_values)
cmat = sparse.coo_matrix((cor_values, (ipos, jpos)), (nrow, nrow))
return cmat
if __name__ == '__main__':
pass
```