# Source code for statsmodels.sandbox.tools.tools_pca

```# -*- coding: utf-8 -*-
"""Principal Component Analysis

Created on Tue Sep 29 20:11:23 2009
Author: josef-pktd

TODO : add class for better reuse of results
"""

import numpy as np

def pca(data, keepdim=0, normalize=0, demean=True):
    '''principal components with eigenvector decomposition

    similar to princomp in matlab

    Parameters
    ----------
    data : array_like, 2d
        data with observations by rows and variables in columns.
        The input is never modified; all work is done on a new array.
    keepdim : int
        number of eigenvectors to keep.
        If keepdim is zero (or is not strictly between 0 and the number of
        variables), then all eigenvectors are included.
    normalize : bool
        if true, then eigenvectors are divided by sqrt of eigenvalues
    demean : bool
        if true, then the column mean is subtracted from the data

    Returns
    -------
    xreduced : ndarray, 2d, (nobs, nvars)
        ``np.dot(factors, evecs.T)`` plus the subtracted column mean.
        Without ``normalize`` this is the projection of the data on the
        kept eigenvectors; with ``normalize`` the rescaled eigenvectors
        are used in both products.
    factors : ndarray, 2d, (nobs, nfactors)
        factor matrix, given by np.dot(x, evecs)
    evals : ndarray, 1d, (nfactors,)
        eigenvalues of the covariance matrix in descending order
    evecs : ndarray, 2d, (nvars, nfactors)
        eigenvectors in columns, rescaled if normalize is true

    Notes
    -----
    The covariance matrix is computed with ``np.cov``, which centers the
    data itself, so ``demean=False`` only affects ``factors`` and
    ``xreduced``, not the eigen decomposition.

    The sign of each eigenvector is arbitrary.

    See Also
    --------
    pcasvd : principal component analysis using svd

    '''
    arr = np.asarray(data)
    if demean:
        m = arr.mean(0)
    else:
        m = np.zeros(arr.shape[1])
    # Out-of-place subtraction: yields a fresh (upcast) array, so integer
    # input works and the caller's data is left untouched.
    x = arr - m

    # Covariance matrix; atleast_2d covers the single-variable case where
    # np.cov returns a 0-d array.
    xcov = np.atleast_2d(np.cov(x, rowvar=0))

    # The covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues (eig can return spurious complex pairs for rank-deficient
    # input) and returns them in ascending order.
    evals, evecs = np.linalg.eigh(xcov)
    # reverse into descending order
    evals = evals[::-1]
    evecs = evecs[:, ::-1]

    if 0 < keepdim < x.shape[1]:
        evecs = evecs[:, :keepdim]
        evals = evals[:keepdim]

    if normalize:
        # scale each eigenvector (column) by 1 / sqrt of its eigenvalue
        evecs = evecs / np.sqrt(evals)

    # get factor matrix
    factors = np.dot(x, evecs)
    # get original data from reduced number of components
    xreduced = np.dot(factors, evecs.T) + m
    return xreduced, factors, evals, evecs

def pcasvd(data, keepdim=0, demean=True):
    '''principal components with svd

    Parameters
    ----------
    data : array_like, 2d
        data with observations by rows and variables in columns.
        The input is never modified; all work is done on a new array.
    keepdim : int
        number of eigenvectors to keep.
        If keepdim is zero (or is not strictly between 0 and the number of
        variables), then all eigenvectors are included.
    demean : bool
        if true, then the column mean is subtracted from the data

    Returns
    -------
    xreduced : ndarray, 2d, (nobs, nvars)
        projection of the data x on the kept eigenvectors; a copy of the
        data when all eigenvectors are kept
    factors : ndarray, 2d, (nobs, nfactors)
        factor matrix, given by np.dot(x, evecs)
    evals : ndarray, 1d
        eigenvalues, ``s**2 / (nobs - 1)`` for the singular values ``s``,
        truncated to at most nfactors entries
    evecs : ndarray, 2d, (nvars, nfactors)
        eigenvectors in columns (left singular vectors of ``x.T``)

    See Also
    --------
    pca : principal component analysis using eigenvector decomposition

    Notes
    -----
    This doesn't have yet the normalize option of pca.

    '''
    # Convert first so that list input works and shape is well defined.
    arr = np.asarray(data)
    nobs, nvars = arr.shape
    if demean:
        m = arr.mean(0)
    else:
        m = 0
    # Out-of-place subtraction: yields a fresh (upcast) array, so integer
    # input works and the caller's data is left untouched.
    x = arr - m

    # x.T = U * diag(s) * V', so the columns of U are the eigenvectors of
    # the (nvars, nvars) scatter matrix x.T x.
    U, s, _ = np.linalg.svd(x.T, full_matrices=True)
    factors = np.dot(x, U)  # princomps

    if 0 < keepdim < nvars:
        xreduced = np.dot(factors[:, :keepdim], U[:, :keepdim].T) + m
    else:
        # Keeping every component reproduces the data exactly; return a
        # copy so that the result never aliases the caller's input.
        keepdim = nvars
        xreduced = arr.copy()

    # s = sqrt of eigenvalues of x.T x, U = evecs.
    # Dividing by nobs - 1 turns these into eigenvalues of the sample
    # covariance matrix, matching the default normalization of np.cov
    # that pca uses.
    evals = s**2 / (nobs - 1)
    return xreduced, factors[:, :keepdim], evals[:keepdim], U[:, :keepdim]

# Names exported by ``from <module> import *``.
__all__ = ['pca', 'pcasvd']
```