Source code for statsmodels.sandbox.tools.tools_pca

# -*- coding: utf-8 -*-
"""Principal Component Analysis


Created on Tue Sep 29 20:11:23 2009
Author: josef-pktd

TODO : add class for better reuse of results
"""

import numpy as np


[docs]def pca(data, keepdim=0, normalize=0, demean=True):
    '''principal components with eigenvector decomposition
    similar to princomp in matlab

    Parameters
    ----------
    data : ndarray, 2d
        data with observations by rows and variables in columns
    keepdim : integer
        number of eigenvectors to keep
        if keepdim is zero, then all eigenvectors are included
    normalize : boolean
        if true, then eigenvectors are normalized by sqrt of eigenvalues
    demean : boolean
        if true, then the column mean is subtracted from the data

    Returns
    -------
    xreduced : ndarray, 2d, (nobs, nvars)
        projection of the data x on the kept eigenvectors
    factors : ndarray, 2d, (nobs, nfactors)
        factor matrix, given by np.dot(x, evecs)
    evals : ndarray, 2d, (nobs, nfactors)
        eigenvalues
    evecs : ndarray, 2d, (nobs, nfactors)
        eigenvectors, normalized if normalize is true

    Notes
    -----

    See Also
    --------
    pcasvd : principal component analysis using svd

    '''
    x = np.array(data)
    #make copy so original doesn't change, maybe not necessary anymore
    if demean:
        m = x.mean(0)
    else:
        m = np.zeros(x.shape[1])
    x -= m

    # Covariance matrix
    xcov = np.cov(x, rowvar=0)

    # Compute eigenvalues and sort into descending order
    evals, evecs = np.linalg.eig(xcov)
    indices = np.argsort(evals)
    indices = indices[::-1]
    evecs = evecs[:,indices]
    evals = evals[indices]

    if keepdim > 0 and keepdim < x.shape[1]:
        evecs = evecs[:,:keepdim]
        evals = evals[:keepdim]

    if normalize:
        #for i in range(shape(evecs)[1]):
        #    evecs[:,i] / linalg.norm(evecs[:,i]) * sqrt(evals[i])
        evecs = evecs/np.sqrt(evals) #np.sqrt(np.dot(evecs.T, evecs) * evals)

    # get factor matrix
    #x = np.dot(evecs.T, x.T)
    factors = np.dot(x, evecs)
    # get original data from reduced number of components
    #xreduced = np.dot(evecs.T, factors) + m
    #print x.shape, factors.shape, evecs.shape, m.shape
    xreduced = np.dot(factors, evecs.T) + m
    return xreduced, factors, evals, evecs



[docs]def pcasvd(data, keepdim=0, demean=True):
    '''principal components with svd

    Parameters
    ----------
    data : ndarray, 2d
        data with observations by rows and variables in columns
    keepdim : integer
        number of eigenvectors to keep
        if keepdim is zero, then all eigenvectors are included
    demean : boolean
        if true, then the column mean is subtracted from the data

    Returns
    -------
    xreduced : ndarray, 2d, (nobs, nvars)
        projection of the data x on the kept eigenvectors
    factors : ndarray, 2d, (nobs, nfactors)
        factor matrix, given by np.dot(x, evecs)
    evals : ndarray, 2d, (nobs, nfactors)
        eigenvalues
    evecs : ndarray, 2d, (nobs, nfactors)
        eigenvectors, normalized if normalize is true

    See Also
    -------
    pca : principal component analysis using eigenvector decomposition

    Notes
    -----
    This doesn't have yet the normalize option of pca.

    '''
    nobs, nvars = data.shape
    #print nobs, nvars, keepdim
    x = np.array(data)
    #make copy so original doesn't change
    if demean:
        m = x.mean(0)
    else:
        m = 0
##    if keepdim == 0:
##        keepdim = nvars
##        "print reassigning keepdim to max", keepdim
    x -= m
    U, s, v = np.linalg.svd(x.T, full_matrices=1)
    factors = np.dot(U.T, x.T).T #princomps
    if keepdim:
        xreduced = np.dot(factors[:,:keepdim], U[:,:keepdim].T) + m
    else:
        xreduced = data
        keepdim = nvars
        "print reassigning keepdim to max", keepdim

    # s = evals, U = evecs
    # no idea why denominator for s is with minus 1
    evals = s**2/(x.shape[0]-1)
    #print keepdim
    return xreduced, factors[:,:keepdim], evals[:keepdim], U[:,:keepdim] #, v


__all__ = ['pca', 'pcasvd']