Source code for statsmodels.stats.stattools

"""
Statistical tests to be used in conjunction with the models

Notes
-----
These functions haven't been formally tested.
"""

from scipy import stats
import numpy as np
from statsmodels.tools.sm_exceptions import ValueWarning


# TODO: these are pretty straightforward but they should be tested
[docs]def durbin_watson(resids, axis=0): r""" Calculates the Durbin-Watson statistic Parameters ---------- resids : array-like Returns ------- dw : float, array-like The Durbin-Watson statistic. Notes ----- The null hypothesis of the test is that there is no serial correlation. The Durbin-Watson test statistics is defined as: .. math:: \sum_{t=2}^T((e_t - e_{t-1})^2)/\sum_{t=1}^Te_t^2 The test statistic is approximately equal to 2*(1-r) where ``r`` is the sample autocorrelation of the residuals. Thus, for r == 0, indicating no serial correlation, the test statistic equals 2. This statistic will always be between 0 and 4. The closer to 0 the statistic, the more evidence for positive serial correlation. The closer to 4, the more evidence for negative serial correlation. """ resids = np.asarray(resids) diff_resids = np.diff(resids, 1, axis=axis) dw = np.sum(diff_resids**2, axis=axis) / np.sum(resids**2, axis=axis) return dw
[docs]def omni_normtest(resids, axis=0): """ Omnibus test for normality Parameters ---------- resid : array-like axis : int, optional Default is 0 Returns ------- Chi^2 score, two-tail probability """ # TODO: change to exception in summary branch and catch in summary() # behavior changed between scipy 0.9 and 0.10 resids = np.asarray(resids) n = resids.shape[axis] if n < 8: from warnings import warn warn("omni_normtest is not valid with less than 8 observations; %i " "samples were given." % int(n), ValueWarning) return np.nan, np.nan return stats.normaltest(resids, axis=axis)
[docs]def jarque_bera(resids, axis=0): r""" Calculates the Jarque-Bera test for normality Parameters ---------- data : array-like Data to test for normality axis : int, optional Axis to use if data has more than 1 dimension. Default is 0 Returns ------- JB : float or array The Jarque-Bera test statistic JBpv : float or array The pvalue of the test statistic skew : float or array Estimated skewness of the data kurtosis : float or array Estimated kurtosis of the data Notes ----- Each output returned has 1 dimension fewer than data The Jarque-Bera test statistic tests the null that the data is normally distributed against an alternative that the data follow some other distribution. The test statistic is based on two moments of the data, the skewness, and the kurtosis, and has an asymptotic :math:`\chi^2_2` distribution. The test statistic is defined .. math:: JB = n(S^2/6+(K-3)^2/24) where n is the number of data points, S is the sample skewness, and K is the sample kurtosis of the data. """ resids = np.asarray(resids) # Calculate residual skewness and kurtosis skew = stats.skew(resids, axis=axis) kurtosis = 3 + stats.kurtosis(resids, axis=axis) # Calculate the Jarque-Bera test for normality n = resids.shape[axis] jb = (n / 6.) * (skew ** 2 + (1 / 4.) * (kurtosis - 3) ** 2) jb_pv = stats.chi2.sf(jb, 2) return jb, jb_pv, skew, kurtosis
[docs]def robust_skewness(y, axis=0): """ Calculates the four skewness measures in Kim & White Parameters ---------- y : array-like axis : int or None, optional Axis along which the skewness measures are computed. If `None`, the entire array is used. Returns ------- sk1 : ndarray The standard skewness estimator. sk2 : ndarray Skewness estimator based on quartiles. sk3 : ndarray Skewness estimator based on mean-median difference, standardized by absolute deviation. sk4 : ndarray Skewness estimator based on mean-median difference, standardized by standard deviation. Notes ----- The robust skewness measures are defined .. math:: SK_{2}=\\frac{\\left(q_{.75}-q_{.5}\\right) -\\left(q_{.5}-q_{.25}\\right)}{q_{.75}-q_{.25}} .. math:: SK_{3}=\\frac{\\mu-\\hat{q}_{0.5}} {\\hat{E}\\left[\\left|y-\\hat{\\mu}\\right|\\right]} .. math:: SK_{4}=\\frac{\\mu-\\hat{q}_{0.5}}{\\hat{\\sigma}} .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ if axis is None: y = y.ravel() axis = 0 y = np.sort(y, axis) q1, q2, q3 = np.percentile(y, [25.0, 50.0, 75.0], axis=axis) mu = y.mean(axis) shape = (y.size,) if axis is not None: shape = list(mu.shape) shape.insert(axis, 1) shape = tuple(shape) mu_b = np.reshape(mu, shape) q2_b = np.reshape(q2, shape) sigma = np.mean(((y - mu_b)**2), axis) sk1 = stats.skew(y, axis=axis) sk2 = (q1 + q3 - 2.0 * q2) / (q3 - q1) sk3 = (mu - q2) / np.mean(abs(y - q2_b), axis=axis) sk4 = (mu - q2) / sigma return sk1, sk2, sk3, sk4
def _kr3(y, alpha=5.0, beta=50.0): """ KR3 estimator from Kim & White Parameters ---------- y : array-like, 1-d alpha : float, optional Lower cut-off for measuring expectation in tail. beta : float, optional Lower cut-off for measuring expectation in center. Returns ------- kr3 : float Robust kurtosis estimator based on standardized lower- and upper-tail expected values Notes ----- .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ perc = (alpha, 100.0 - alpha, beta, 100.0 - beta) lower_alpha, upper_alpha, lower_beta, upper_beta = np.percentile(y, perc) l_alpha = np.mean(y[y < lower_alpha]) u_alpha = np.mean(y[y > upper_alpha]) l_beta = np.mean(y[y < lower_beta]) u_beta = np.mean(y[y > upper_beta]) return (u_alpha - l_alpha) / (u_beta - l_beta)
[docs]def expected_robust_kurtosis(ab=(5.0, 50.0), dg=(2.5, 25.0)): """ Calculates the expected value of the robust kurtosis measures in Kim and White assuming the data are normally distributed. Parameters ---------- ab: iterable, optional Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail quantile cut-off for measuring the extreme tail and beta is the central quantile cutoff for the standardization of the measure db: iterable, optional Contains 100*(delta, gamma) in the kr4 measure where delta is the tail quantile for measuring extreme values and gamma is the central quantile used in the the standardization of the measure Returns ------- ekr: array, 4-element Contains the expected values of the 4 robust kurtosis measures Notes ----- See `robust_kurtosis` for definitions of the robust kurtosis measures """ alpha, beta = ab delta, gamma = dg expected_value = np.zeros(4) ppf = stats.norm.ppf pdf = stats.norm.pdf q1, q2, q3, q5, q6, q7 = ppf(np.array((1.0, 2.0, 3.0, 5.0, 6.0, 7.0)) / 8) expected_value[0] = 3 expected_value[1] = ((q7 - q5) + (q3 - q1)) / (q6 - q2) q_alpha, q_beta = ppf(np.array((alpha / 100.0, beta / 100.0))) expected_value[2] = (2 * pdf(q_alpha) / alpha) / (2 * pdf(q_beta) / beta) q_delta, q_gamma = ppf(np.array((delta / 100.0, gamma / 100.0))) expected_value[3] = (-2.0 * q_delta) / (-2.0 * q_gamma) return expected_value
[docs]def robust_kurtosis(y, axis=0, ab=(5.0, 50.0), dg=(2.5, 25.0), excess=True): """ Calculates the four kurtosis measures in Kim & White Parameters ---------- y : array-like axis : int or None, optional Axis along which the kurtoses are computed. If `None`, the entire array is used. ab: iterable, optional Contains 100*(alpha, beta) in the kr3 measure where alpha is the tail quantile cut-off for measuring the extreme tail and beta is the central quantile cutoff for the standardization of the measure db: iterable, optional Contains 100*(delta, gamma) in the kr4 measure where delta is the tail quantile for measuring extreme values and gamma is the central quantile used in the the standardization of the measure excess : bool, optional If true (default), computed values are excess of those for a standard normal distribution. Returns ------- kr1 : ndarray The standard kurtosis estimator. kr2 : ndarray Kurtosis estimator based on octiles. kr3 : ndarray Kurtosis estimators based on exceedence expectations. kr4 : ndarray Kurtosis measure based on the spread between high and low quantiles. Notes ----- The robust kurtosis measures are defined .. math:: KR_{2}=\\frac{\\left(\\hat{q}_{.875}-\\hat{q}_{.625}\\right) +\\left(\\hat{q}_{.375}-\\hat{q}_{.125}\\right)} {\\hat{q}_{.75}-\\hat{q}_{.25}} .. math:: KR_{3}=\\frac{\\hat{E}\\left(y|y>\\hat{q}_{1-\\alpha}\\right) -\\hat{E}\\left(y|y<\\hat{q}_{\\alpha}\\right)} {\\hat{E}\\left(y|y>\\hat{q}_{1-\\beta}\\right) -\\hat{E}\\left(y|y<\\hat{q}_{\\beta}\\right)} .. math:: KR_{4}=\\frac{\\hat{q}_{1-\\delta}-\\hat{q}_{\\delta}} {\\hat{q}_{1-\\gamma}-\\hat{q}_{\\gamma}} where :math:`\\hat{q}_{p}` is the estimated quantile at :math:`p`. .. [*] Tae-Hwan Kim and Halbert White, "On more robust estimation of skewness and kurtosis," Finance Research Letters, vol. 1, pp. 56-73, March 2004. """ if (axis is None or (y.squeeze().ndim == 1 and y.ndim != 1)): y = y.ravel() axis = 0 alpha, beta = ab delta, gamma = dg perc = (12.5, 25.0, 37.5, 62.5, 75.0, 87.5, delta, 100.0 - delta, gamma, 100.0 - gamma) e1, e2, e3, e5, e6, e7, fd, f1md, fg, f1mg = np.percentile(y, perc, axis=axis) expected_value = (expected_robust_kurtosis(ab, dg) if excess else np.zeros(4)) kr1 = stats.kurtosis(y, axis, False) - expected_value[0] kr2 = ((e7 - e5) + (e3 - e1)) / (e6 - e2) - expected_value[1] if y.ndim == 1: kr3 = _kr3(y, alpha, beta) else: kr3 = np.apply_along_axis(_kr3, axis, y, alpha, beta) kr3 -= expected_value[2] kr4 = (f1md - fd) / (f1mg - fg) - expected_value[3] return kr1, kr2, kr3, kr4
def _medcouple_1d(y): """ Calculates the medcouple robust measure of skew. Parameters ---------- y : array-like, 1-d Returns ------- mc : float The medcouple statistic Notes ----- The current algorithm requires a O(N**2) memory allocations, and so may not work for very large arrays (N>10000). .. [*] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed distributions" Computational Statistics & Data Analysis, vol. 52, pp. 5186-5201, August 2008. """ # Parameter changes the algorithm to the slower for large n y = np.squeeze(np.asarray(y)) if y.ndim != 1: raise ValueError("y must be squeezable to a 1-d array") y = np.sort(y) n = y.shape[0] if n % 2 == 0: mf = (y[n // 2 - 1] + y[n // 2]) / 2 else: mf = y[(n - 1) // 2] z = y - mf lower = z[z <= 0.0] upper = z[z >= 0.0] upper = upper[:, None] standardization = upper - lower is_zero = np.logical_and(lower == 0.0, upper == 0.0) standardization[is_zero] = np.inf spread = upper + lower h = spread / standardization # GH5395 num_ties = np.sum(lower == 0.0) if num_ties: # Replacements has -1 above the anti-diagonal, 0 on the anti-diagonal, # and 1 below the anti-diagonal replacements = np.ones((num_ties, num_ties)) - np.eye(num_ties) replacements -= 2 * np.triu(replacements) # Convert diagonal to anti-diagonal replacements = np.fliplr(replacements) # Always replace upper right block h[:num_ties, -num_ties:] = replacements return np.median(h)
[docs]def medcouple(y, axis=0): """ Calculates the medcouple robust measure of skew. Parameters ---------- y : array-like axis : int or None, optional Axis along which the medcouple statistic is computed. If `None`, the entire array is used. Returns ------- mc : ndarray The medcouple statistic with the same shape as `y`, with the specified axis removed. Notes ----- The current algorithm requires a O(N**2) memory allocations, and so may not work for very large arrays (N>10000). .. [*] M. Huberta and E. Vandervierenb, "An adjusted boxplot for skewed distributions" Computational Statistics & Data Analysis, vol. 52, pp. 5186-5201, August 2008. """ y = np.asarray(y, dtype=np.double) # GH 4243 if axis is None: return _medcouple_1d(y.ravel()) return np.apply_along_axis(_medcouple_1d, axis, y)