# Source code for statsmodels.stats.stattools

"""
Statistical tests to be used in conjunction with the models

Notes
-----
These functions haven't been formally tested.
"""

from scipy import stats
import numpy as np

# TODO: these are pretty straightforward but they should be tested
[docs]def durbin_watson(resids, axis=0):
"""
Calculates the Durbin-Watson statistic

Parameters
-----------
resids : array-like

Returns
--------
dw : float, array-like

The Durbin-Watson statistic.

Notes
-----
The null hypothesis of the test is that there is no serial correlation.
The Durbin-Watson test statistics is defined as:

.. math::

\sum_{t=2}^T((e_t - e_{t-1})^2)/\sum_{t=1}^Te_t^2

The test statistic is approximately equal to 2*(1-r) where r is the
sample autocorrelation of the residuals. Thus, for r == 0, indicating no
serial correlation, the test statistic equals 2. This statistic will
always be between 0 and 4. The closer to 0 the statistic, the more
evidence for positive serial correlation. The closer to 4, the more
evidence for negative serial correlation.
"""
resids = np.asarray(resids)
diff_resids = np.diff(resids, 1, axis=axis)
dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis)
return dw

[docs]def omni_normtest(resids, axis=0):
"""
Omnibus test for normality

Parameters
-----------
resid : array-like
axis : int, optional
Default is 0

Returns
-------
Chi^2 score, two-tail probability
"""
#TODO: change to exception in summary branch and catch in summary()
#behavior changed between scipy 0.9 and 0.10
resids = np.asarray(resids)
n = resids.shape[axis]
if n < 8:
from warnings import warn

warn("omni_normtest is not valid with less than 8 observations; %i "
"samples were given." % int(n))
return np.nan, np.nan

return stats.normaltest(resids, axis=axis)

[docs]def jarque_bera(resids, axis=0):
"""
Calculate residual skewness, kurtosis, and do the JB test for normality

Parameters
-----------
resids : array-like
axis : int, optional
Default is 0

Returns
-------
JB, JBpv, skew, kurtosis

JB = n/6*(S^2 + (K-3)^2/4)

JBpv is the Chi^2 two-tail probability value

skew is the measure of skewness

kurtosis is the measure of kurtosis

"""
resids = np.asarray(resids)
# Calculate residual skewness and kurtosis
skew = stats.skew(resids, axis=axis)
kurtosis = 3 + stats.kurtosis(resids, axis=axis)

# Calculate the Jarque-Bera test for normality
n = resids.shape[axis]
jb = (n / 6.) * (skew**2 + (1 / 4.) * (kurtosis - 3)**2)
jb_pv = stats.chi2.sf(jb, 2)

return jb, jb_pv, skew, kurtosis

def robust_skewness(y, axis=0):
"""
Calculates the four skewness measures in Kim & White

Parameters
----------
y : array-like

axis : int or None, optional
Axis along which the skewness measures are computed.  If None, the
entire array is used.

Returns
-------
sk1 : ndarray
The standard skewness estimator.
sk2 : ndarray
Skewness estimator based on quartiles.
sk3 : ndarray
Skewness estimator based on mean-median difference, standardized by
absolute deviation.
sk4 : ndarray
Skewness estimator based on mean-median difference, standardized by
standard deviation.

Notes
-----
The robust skewness measures are defined

.. math ::

SK_{2}=\\frac{\\left(q_{.75}-q_{.5}\\right)
-\\left(q_{.5}-q_{.25}\\right)}{q_{.75}-q_{.25}}

.. math ::

SK_{3}=\\frac{\\mu-\\hat{q}_{0.5}}
{\\hat{E}\\left[\\left|y-\\hat{\\mu}\\right|\\right]}

.. math ::

SK_{4}=\\frac{\\mu-\\hat{q}_{0.5}}{\\hat{\\sigma}}

..  Tae-Hwan Kim and Halbert White, "On more robust estimation of
skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
March 2004.
"""

if axis is None:
y = y.ravel()
axis = 0

y = np.sort(y, axis)

q1, q2, q3 = np.percentile(y, [25.0, 50.0, 75.0], axis=axis)

mu = y.mean(axis)
shape = (y.size,)
if axis is not None:
shape = list(mu.shape)
shape.insert(axis, 1)
shape = tuple(shape)

mu_b = np.reshape(mu, shape)
q2_b = np.reshape(q2, shape)

sigma = np.mean(((y - mu_b)**2), axis)

sk1 = stats.skew(y, axis=axis)
sk2 = (q1 + q3 - 2.0 * q2) / (q3 - q1)
sk3 = (mu - q2) / np.mean(abs(y - q2_b), axis=axis)
sk4 = (mu - q2) / sigma

return sk1, sk2, sk3, sk4

def _kr3(y, alpha=5.0, beta=50.0):
"""
KR3 estimator from Kim & White

Parameters
----------
y : array-like, 1-d
alpha : float, optional
Lower cut-off for measuring expectation in tail.
beta :  float, optional
Lower cut-off for measuring expectation in center.

Returns
-------
kr3 : float
Robust kurtosis estimator based on standardized lower- and upper-tail
expected values

Notes
-----
..  Tae-Hwan Kim and Halbert White, "On more robust estimation of
skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
March 2004.
"""
perc = (alpha, 100.0 - alpha, beta, 100.0 - beta)
lower_alpha, upper_alpha, lower_beta, upper_beta = np.percentile(y, perc)
l_alpha = np.mean(y[y < lower_alpha])
u_alpha = np.mean(y[y > upper_alpha])

l_beta = np.mean(y[y < lower_beta])
u_beta = np.mean(y[y > upper_beta])

return (u_alpha - l_alpha) / (u_beta - l_beta)

def expected_robust_kurtosis(ab=(5.0, 50.0), dg=(2.5, 25.0)):
"""
Calculates the expected value of the robust kurtosis measures in Kim and
White assuming the data are normally distributed.

Parameters
----------
ab: iterable, optional
Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
quantile cut-off for measuring the extreme tail and beta is the central
quantile cutoff for the standardization of the measure
db: iterable, optional
Contains 100*(delta, gamma) in the kr4 measure where delta is the tail
quantile for measuring extreme values and gamma is the central quantile
used in the the standardization of the measure

Returns
-------
ekr: array, 4-element
Contains the expected values of the 4 robust kurtosis measures

Notes
-----
See robust_kurtosis for definitions of the robust kurtosis measures
"""

alpha, beta = ab
delta, gamma = dg
expected_value = np.zeros(4)
ppf = stats.norm.ppf
pdf = stats.norm.pdf
q1, q2, q3, q5, q6, q7 = ppf(np.array((1.0, 2.0, 3.0, 5.0, 6.0, 7.0)) / 8)
expected_value = 3

expected_value = ((q7 - q5) + (q3 - q1)) / (q6 - q2)

q_alpha, q_beta = ppf(np.array((alpha / 100.0, beta / 100.0)))
expected_value = (2 * pdf(q_alpha) / alpha) / (2 * pdf(q_beta) / beta)

q_delta, q_gamma = ppf(np.array((delta / 100.0, gamma / 100.0)))
expected_value = (-2.0 * q_delta) / (-2.0 * q_gamma)

return expected_value

def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True):
"""
Calculates the four kurtosis measures in Kim & White

Parameters
----------
y : array-like
axis : int or None, optional
Axis along which the kurtoses are computed.  If None, the
entire array is used.
ab: iterable, optional
Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail
quantile cut-off for measuring the extreme tail and beta is the central
quantile cutoff for the standardization of the measure
db: iterable, optional
Contains 100*(delta, gamma) in the kr4 measure where delta is the tail
quantile for measuring extreme values and gamma is the central quantile
used in the the standardization of the measure
excess : bool, optional
If true (default), computed values are excess of those for a standard
normal distribution.

Returns
-------
kr1 : ndarray
The standard kurtosis estimator.
kr2 : ndarray
Kurtosis estimator based on octiles.
kr3 : ndarray
Kurtosis estimators based on exceedence expectations.
kr4 : ndarray
Kurtosis measure based on the spread between high and low quantiles.

Notes
-----
The robust kurtosis measures are defined

.. math::

KR_{2}=\\frac{\\left(\\hat{q}_{.875}-\\hat{q}_{.625}\\right)
+\\left(\\hat{q}_{.375}-\\hat{q}_{.125}\\right)}
{\\hat{q}_{.75}-\\hat{q}_{.25}}

.. math::

KR_{3}=\\frac{\\hat{E}\\left(y|y>\\hat{q}_{1-\\alpha}\\right)
-\\hat{E}\\left(y|y<\\hat{q}_{\\alpha}\\right)}
{\\hat{E}\\left(y|y>\\hat{q}_{1-\\beta}\\right)
-\\hat{E}\\left(y|y<\\hat{q}_{\\beta}\\right)}

.. math::

KR_{4}=\\frac{\\hat{q}_{1-\\delta}-\\hat{q}_{\\delta}}
{\\hat{q}_{1-\\gamma}-\\hat{q}_{\\gamma}}

where :math:\\hat{q}_{p} is the estimated quantile at :math:p.

..  Tae-Hwan Kim and Halbert White, "On more robust estimation of
skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73,
March 2004.
"""
if (axis is None or
(y.squeeze().ndim == 1 and y.ndim != 1)):
y = y.ravel()
axis = 0

alpha, beta = ab
delta, gamma = dg

perc = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5,
delta, 100.0 - delta, gamma, 100.0 - gamma)
e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y, perc,
axis=axis)

expected_value = expected_robust_kurtosis(ab, dg) if excess else np.zeros(4)

kr1 = stats.kurtosis(y, axis, False) - expected_value
kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value
if y.ndim == 1:
kr3 = _kr3(y, alpha, beta)
else:
kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta)
kr3 -= expected_value
kr4 = (f1md - fd) / (f1mg - fg) - expected_value
return kr1, kr2, kr3, kr4

def _medcouple_1d(y):
"""
Calculates the medcouple robust measure of skew.

Parameters
----------
y : array-like, 1-d

Returns
-------
mc : float
The medcouple statistic

Notes
-----
The current algorithm requires a O(N**2) memory allocations, and so may
not work for very large arrays (N>10000).

..  M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed
distributions" Computational Statistics & Data Analysis, vol. 52,
pp. 5186-5201, August 2008.
"""

# Parameter changes the algorithm to the slower for large n

y = np.squeeze(np.asarray(y))
if y.ndim != 1:
raise ValueError("y must be squeezable to a 1-d array")

y = np.sort(y)

n = y.shape
if n % 2 == 0:
mf = (y[n // 2 - 1] + y[n // 2]) / 2
else:
mf = y[(n - 1) // 2]

z = y - mf
lower = z[z <= 0.0]
upper = z[z >= 0.0]
upper = upper[:, None]
standardization = upper - lower
is_zero = np.logical_and(lower == 0.0, upper == 0.0)
standardization[is_zero] = np.inf

def medcouple(y, axis=0):
"""
Calculates the medcouple robust measure of skew.

Parameters
----------
y : array-like
axis : int or None, optional
Axis along which the medcouple statistic is computed.  If None, the
entire array is used.

Returns
-------
mc : ndarray
The medcouple statistic with the same shape as y, with the specified
axis removed.

Notes
-----
The current algorithm requires a O(N**2) memory allocations, and so may
not work for very large arrays (N>10000).

..  M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed
distributions" Computational Statistics & Data Analysis, vol. 52,
pp. 5186-5201, August 2008.
"""
if axis is None:
return _medcouple_1d(y.ravel())

return np.apply_along_axis(_medcouple_1d, axis, y)