Source code for statsmodels.sandbox.distributions.extras

'''Various extensions to distributions

* skew normal and skew t distribution by Azzalini, A. & Capitanio, A.
* Gram-Charlier expansion distribution (using 4 moments),
* distributions based on non-linear transformation
- Transf_gen
- ExpTransf_gen, LogTransf_gen
- TransfTwo_gen
(defines as examples: square, negative square and abs transformations)
- this versions are without __new__
* mnvormcdf, mvstdnormcdf : cdf, rectangular integral for multivariate normal
distribution

TODO:
* Where is Transf_gen for general monotonic transformation ? found and added it
* write some docstrings, some parts I don't remember
* add Box-Cox transformation, parameterized ?

this is only partially cleaned, still includes test examples as functions

main changes
* added separate example and tests (2010-05-09)
* collect transformation function into classes

Example
-------

>>> logtg = Transf_gen(stats.t, np.exp, np.log,
numargs = 1, a=0, name = 'lnnorm',
longname = 'Exp transformed normal',
extradoc = '\ndistribution of y = exp(x), with x standard normal'
'precision for moment andstats is not very high, 2-3 decimals')
>>> logtg.cdf(5, 6)
0.92067704211191848
>>> stats.t.cdf(np.log(5), 6)
0.92067704211191848

>>> logtg.pdf(5, 6)
0.021798547904239293
>>> stats.t.pdf(np.log(5), 6)
0.10899273954837908
>>> stats.t.pdf(np.log(5), 6)/5.  #derivative
0.021798547909675815

Author: josef-pktd

'''

#note copied from distr_skewnorm_0.py

from __future__ import print_function
from statsmodels.compat.python import range, iteritems
from scipy import stats, special, integrate  # integrate is for scipy 0.6.0 ???
from scipy.stats import distributions
from statsmodels.stats.moment_helpers import mvsk2mc, mc2mvsk
import numpy as np

[docs]class SkewNorm_gen(distributions.rv_continuous): '''univariate Skew-Normal distribution of Azzalini class follows scipy.stats.distributions pattern but with __init__ ''' def __init__(self): #super(SkewNorm_gen,self).__init__( distributions.rv_continuous.__init__(self, name = 'Skew Normal distribution', shapes = 'alpha', extradoc = ''' ''' ) def _argcheck(self, alpha): return 1 #(alpha >= 0) def _rvs(self, alpha): # see http://azzalini.stat.unipd.it/SN/faq.html delta = alpha/np.sqrt(1+alpha**2) u0 = stats.norm.rvs(size=self._size) u1 = delta*u0 + np.sqrt(1-delta**2)*stats.norm.rvs(size=self._size) return np.where(u0>0, u1, -u1) def _munp(self, n, alpha): # use pdf integration with _mom0_sc if only _pdf is defined. # default stats calculation uses ppf, which is much slower return self._mom0_sc(n, alpha) def _pdf(self,x,alpha): # 2*normpdf(x)*normcdf(alpha*x) return 2.0/np.sqrt(2*np.pi)*np.exp(-x**2/2.0) * special.ndtr(alpha*x) def _stats_skip(self,x,alpha,moments='mvsk'): #skip for now to force moment integration as check pass
skewnorm = SkewNorm_gen() # generated the same way as distributions in stats.distributions
[docs]class SkewNorm2_gen(distributions.rv_continuous): '''univariate Skew-Normal distribution of Azzalini class follows scipy.stats.distributions pattern ''' def _argcheck(self, alpha): return 1 #where(alpha>=0, 1, 0) def _pdf(self,x,alpha): # 2*normpdf(x)*normcdf(alpha*x return 2.0/np.sqrt(2*np.pi)*np.exp(-x**2/2.0) * special.ndtr(alpha*x)
skewnorm2 = SkewNorm2_gen(name = 'Skew Normal distribution', shapes = 'alpha', extradoc = ''' -inf < alpha < inf''')
[docs]class ACSkewT_gen(distributions.rv_continuous): '''univariate Skew-T distribution of Azzalini class follows scipy.stats.distributions pattern but with __init__ ''' def __init__(self): #super(SkewT_gen,self).__init__( distributions.rv_continuous.__init__(self, name = 'Skew T distribution', shapes = 'df, alpha', extradoc = ''' Skewed T distribution by Azzalini, A. & Capitanio, A. (2003)_ the pdf is given by: pdf(x) = 2.0 * t.pdf(x, df) * t.cdf(df+1, alpha*x*np.sqrt((1+df)/(x**2+df))) with alpha >=0 Note: different from skewed t distribution by Hansen 1999 .._ Azzalini, A. & Capitanio, A. (2003), Distributions generated by perturbation of symmetry with emphasis on a multivariate skew-t distribution, appears in J.Roy.Statist.Soc, series B, vol.65, pp.367-389 ''' ) def _argcheck(self, df, alpha): return (alpha == alpha)*(df>0) ## def _arg_check(self, alpha): ## return np.where(alpha>=0, 0, 1) ## def _argcheck(self, alpha): ## return np.where(alpha>=0, 1, 0) def _rvs(self, df, alpha): # see http://azzalini.stat.unipd.it/SN/faq.html #delta = alpha/np.sqrt(1+alpha**2) V = stats.chi2.rvs(df, size=self._size) z = skewnorm.rvs(alpha, size=self._size) return z/np.sqrt(V/df) def _munp(self, n, df, alpha): # use pdf integration with _mom0_sc if only _pdf is defined. # default stats calculation uses ppf return self._mom0_sc(n, df, alpha) def _pdf(self, x, df, alpha): # 2*normpdf(x)*normcdf(alpha*x) return 2.0*distributions.t._pdf(x, df) * special.stdtr(df+1, alpha*x*np.sqrt((1+df)/(x**2+df))) ## ##def mvsk2cm(*args): ## mu,sig,sk,kur = args ## # Get central moments ## cnt = [None]*4 ## cnt = mu ## cnt = sig #*sig ## cnt = sk * sig**1.5 ## cnt = (kur+3.0) * sig**2.0 ## return cnt ## ## ##def mvsk2m(args): ## mc, mc2, skew, kurt = args#= self._stats(*args,**mdict) ## mnc = mc ## mnc2 = mc2 + mc*mc ## mc3 = skew*(mc2**1.5) # 3rd central moment ## mnc3 = mc3+3*mc*mc2+mc**3 # 3rd non-central moment ## mc4 = (kurt+3.0)*(mc2**2.0) # 4th central moment ## mnc4 = mc4+4*mc*mc3+6*mc*mc*mc2+mc**4 ## return (mc, mc2, mc3, mc4), (mnc, mnc2, mnc3, mnc4) ## ##def mc2mvsk(args): ## mc, mc2, mc3, mc4 = args ## skew = mc3 / mc2**1.5 ## kurt = mc4 / mc2**2.0 - 3.0 ## return (mc, mc2, skew, kurt) ## ##def m2mc(args): ## mnc, mnc2, mnc3, mnc4 = args ## mc = mnc ## mc2 = mnc2 - mnc*mnc ## #mc3 = skew*(mc2**1.5) # 3rd central moment ## mc3 = mnc3 - (3*mc*mc2+mc**3) # 3rd central moment ## #mc4 = (kurt+3.0)*(mc2**2.0) # 4th central moment ## mc4 = mnc4 - (4*mc*mc3+6*mc*mc*mc2+mc**4) ## return (mc, mc2, mc3, mc4)
from numpy import poly1d,sqrt, exp import scipy def _hermnorm(N): # return the negatively normalized hermite polynomials up to order N-1 # (inclusive) # using the recursive relationship # p_n+1 = p_n(x)' - x*p_n(x) # and p_0(x) = 1 plist = [None]*N plist = poly1d(1) for n in range(1,N): plist[n] = plist[n-1].deriv() - poly1d([1,0])*plist[n-1] return plist
[docs]def pdf_moments_st(cnt): """Return the Gaussian expanded pdf function given the list of central moments (first one is mean). version of scipy.stats, any changes ? the scipy.stats version has a bug and returns normal distribution """ N = len(cnt) if N < 2: raise ValueError("At least two moments must be given to " "approximate the pdf.") totp = poly1d(1) sig = sqrt(cnt) mu = cnt if N > 2: Dvals = _hermnorm(N+1) for k in range(3,N+1): # Find Ck Ck = 0.0 for n in range((k-3)/2): m = k-2*n if m % 2: # m is odd momdiff = cnt[m-1] else: momdiff = cnt[m-1] - sig*sig*scipy.factorial2(m-1) Ck += Dvals[k][m] / sig**m * momdiff # Add to totp raise SystemError print(Dvals) print(Ck) totp = totp + Ck*Dvals[k] def thisfunc(x): xn = (x-mu)/sig return totp(xn)*exp(-xn*xn/2.0)/sqrt(2*np.pi)/sig return thisfunc, totp
[docs]def pdf_mvsk(mvsk): """Return the Gaussian expanded pdf function given the list of 1st, 2nd moment and skew and Fisher (excess) kurtosis. Parameters ---------- mvsk : list of mu, mc2, skew, kurt distribution is matched to these four moments Returns ------- pdffunc : function function that evaluates the pdf(x), where x is the non-standardized random variable. Notes ----- Changed so it works only if four arguments are given. Uses explicit formula, not loop. This implements a Gram-Charlier expansion of the normal distribution where the first 2 moments coincide with those of the normal distribution but skew and kurtosis can deviate from it. In the Gram-Charlier distribution it is possible that the density becomes negative. This is the case when the deviation from the normal distribution is too large. References ---------- http://en.wikipedia.org/wiki/Edgeworth_series Johnson N.L., S. Kotz, N. Balakrishnan: Continuous Univariate Distributions, Volume 1, 2nd ed., p.30 """ N = len(mvsk) if N < 4: raise ValueError("Four moments must be given to " "approximate the pdf.") mu, mc2, skew, kurt = mvsk totp = poly1d(1) sig = sqrt(mc2) if N > 2: Dvals = _hermnorm(N+1) C3 = skew/6.0 C4 = kurt/24.0 # Note: Hermite polynomial for order 3 in _hermnorm is negative # instead of positive totp = totp - C3*Dvals + C4*Dvals def pdffunc(x): xn = (x-mu)/sig return totp(xn)*np.exp(-xn*xn/2.0)/np.sqrt(2*np.pi)/sig return pdffunc
[docs]def pdf_moments(cnt): """Return the Gaussian expanded pdf function given the list of central moments (first one is mean). Changed so it works only if four arguments are given. Uses explicit formula, not loop. Notes ----- This implements a Gram-Charlier expansion of the normal distribution where the first 2 moments coincide with those of the normal distribution but skew and kurtosis can deviate from it. In the Gram-Charlier distribution it is possible that the density becomes negative. This is the case when the deviation from the normal distribution is too large. References ---------- http://en.wikipedia.org/wiki/Edgeworth_series Johnson N.L., S. Kotz, N. Balakrishnan: Continuous Univariate Distributions, Volume 1, 2nd ed., p.30 """ N = len(cnt) if N < 2: raise ValueError("At least two moments must be given to " "approximate the pdf.") mc, mc2, mc3, mc4 = cnt skew = mc3 / mc2**1.5 kurt = mc4 / mc2**2.0 - 3.0 # Fisher kurtosis, excess kurtosis totp = poly1d(1) sig = sqrt(cnt) mu = cnt if N > 2: Dvals = _hermnorm(N+1) ## for k in range(3,N+1): ## # Find Ck ## Ck = 0.0 ## for n in range((k-3)/2): ## m = k-2*n ## if m % 2: # m is odd ## momdiff = cnt[m-1] ## else: ## momdiff = cnt[m-1] - sig*sig*scipy.factorial2(m-1) ## Ck += Dvals[k][m] / sig**m * momdiff ## # Add to totp ## raise ## print Dvals ## print Ck ## totp = totp + Ck*Dvals[k] C3 = skew/6.0 C4 = kurt/24.0 totp = totp - C3*Dvals + C4*Dvals def thisfunc(x): xn = (x-mu)/sig return totp(xn)*np.exp(-xn*xn/2.0)/np.sqrt(2*np.pi)/sig return thisfunc
[docs]class NormExpan_gen(distributions.rv_continuous): '''Gram-Charlier Expansion of Normal distribution class follows scipy.stats.distributions pattern but with __init__ ''' def __init__(self,args, **kwds): #todo: replace with super call distributions.rv_continuous.__init__(self, name = 'Normal Expansion distribution', shapes = ' ', extradoc = ''' The distribution is defined as the Gram-Charlier expansion of the normal distribution using the first four moments. The pdf is given by pdf(x) = (1+ skew/6.0 * H(xc,3) + kurt/24.0 * H(xc,4))*normpdf(xc) where xc = (x-mu)/sig is the standardized value of the random variable and H(xc,3) and H(xc,4) are Hermite polynomials Note: This distribution has to be parameterized during initialization and instantiation, and does not have a shape parameter after instantiation (similar to frozen distribution except for location and scale.) Location and scale can be used as with other distributions, however note, that they are relative to the initialized distribution. ''' ) #print args, kwds mode = kwds.get('mode', 'sample') if mode == 'sample': mu,sig,sk,kur = stats.describe(args)[2:] self.mvsk = (mu,sig,sk,kur) cnt = mvsk2mc((mu,sig,sk,kur)) elif mode == 'mvsk': cnt = mvsk2mc(args) self.mvsk = args elif mode == 'centmom': cnt = args self.mvsk = mc2mvsk(cnt) else: raise ValueError("mode must be 'mvsk' or centmom") self.cnt = cnt #self.mvsk = (mu,sig,sk,kur) #self._pdf = pdf_moments(cnt) self._pdf = pdf_mvsk(self.mvsk) def _munp(self,n): # use pdf integration with _mom0_sc if only _pdf is defined. # default stats calculation uses ppf return self._mom0_sc(n) def _stats_skip(self): # skip for now to force numerical integration of pdf for testing return self.mvsk ## copied from nonlinear_transform_gen.py