Source code for statsmodels.stats.robust_compare

"""
Anova k-sample comparison without and with trimming

Created on Sun Jun 09 23:51:34 2013

Author: Josef Perktold
"""

import numbers

import numpy as np

# the trimboth and trim_mean are taken from scipy.stats.stats
# and enhanced by axis



[docs]
def trimboth(a, proportiontocut, axis=0):
    """
    Slice off a proportion of items from both ends of an array

    Slices off the passed proportion of items from both ends of the passed
    array (i.e., with `proportiontocut` = 0.1, slices leftmost 10% **and**
    rightmost 10% of scores).  You must pre-sort the array if you want
    'proper' trimming.  Slices off less if proportion results in a
    non-integer slice index (i.e., conservatively slices off
    `proportiontocut`).

    Parameters
    ----------
    a : array_like
        Data to trim.
    proportiontocut : float or int
        Proportion of data to trim at each end.
    axis : int or None
        Axis along which the observations are trimmed. The default is to trim
        along axis=0. If axis is None then the array will be flattened before
        trimming.

    Returns
    -------
    out : array-like
        Trimmed version of array `a`.

    Examples
    --------
    >>> from scipy import stats
    >>> a = np.arange(20)
    >>> b = stats.trimboth(a, 0.1)
    >>> b.shape
    (16,)
    """
    a = np.asarray(a)
    if axis is None:
        a = a.ravel()
        axis = 0
    nobs = a.shape[axis]
    lowercut = int(proportiontocut * nobs)
    uppercut = nobs - lowercut
    if lowercut >= uppercut:
        raise ValueError("Proportion too big.")

    sl = [slice(None)] * a.ndim
    sl[axis] = slice(lowercut, uppercut)
    return a[tuple(sl)]




[docs]
def trim_mean(a, proportiontocut, axis=0):
    """
    Return mean of array after trimming observations from both tails

    If `proportiontocut` = 0.1, slices off 'leftmost' and 'rightmost' 10% of
    scores. Slices off LESS if proportion results in a non-integer slice
    index (i.e., conservatively slices off `proportiontocut` ).

    Parameters
    ----------
    a : array_like
        Input array.
    proportiontocut : float
        Fraction to cut off at each tail of the sorted observations.
    axis : int or None
        Axis along which the trimmed means are computed. The default is axis=0.
        If axis is None then the trimmed mean will be computed for the
        flattened array.

    Returns
    -------
    trim_mean : ndarray
        Mean of trimmed array.
    """
    newa = trimboth(np.sort(a, axis), proportiontocut, axis=axis)
    return np.mean(newa, axis=axis)




[docs]
class TrimmedMean:
    """
    Class for trimmed and winsorized one sample statistics

    axis is None, i.e. ravelling, is not supported.

    Parameters
    ----------
    data : array-like
        The data, observations to analyze.
    fraction : float in (0, 0.5)
        The fraction of observations to trim at each tail.
        The number of observations trimmed at each tail is
        ``int(fraction * nobs)``.
    is_sorted : boolean
        Indicator if data is already sorted. By default the data is sorted
        along ``axis``.
    axis : int
        The axis of reduce operations. By default axis=0, that is observations
        are along the zero dimension, i.e. rows if 2-dim.
    """

    def __init__(self, data, fraction, is_sorted=False, axis=0):
        self.data = np.asarray(data)
        # TODO: add pandas handling, maybe not if this stays internal

        self.axis = axis
        self.fraction = fraction
        self.nobs = nobs = self.data.shape[axis]
        self.lowercut = lowercut = int(fraction * nobs)
        self.uppercut = uppercut = nobs - lowercut
        if lowercut >= uppercut:
            raise ValueError("Proportion too big.")
        self.nobs_reduced = nobs - 2 * lowercut

        self.sl = [slice(None)] * self.data.ndim
        self.sl[axis] = slice(self.lowercut, self.uppercut)
        # numpy requires now tuple for indexing, not list
        self.sl = tuple(self.sl)
        if not is_sorted:
            self.data_sorted = np.sort(self.data, axis=axis)
        else:
            self.data_sorted = self.data

        # this only works for axis=0
        self.lowerbound = np.take(self.data_sorted, lowercut, axis=axis)
        self.upperbound = np.take(self.data_sorted, uppercut - 1, axis=axis)
        # self.lowerbound = self.data_sorted[lowercut]
        # self.upperbound = self.data_sorted[uppercut - 1]

    @property
    def data_trimmed(self):
        """numpy array of trimmed and sorted data"""
        # returns a view
        return self.data_sorted[self.sl]

    @property  # cache
    def data_winsorized(self):
        """winsorized data"""
        lb = np.expand_dims(self.lowerbound, self.axis)
        ub = np.expand_dims(self.upperbound, self.axis)
        return np.clip(self.data_sorted, lb, ub)

    @property
    def mean_trimmed(self):
        """mean of trimmed data"""
        return np.mean(self.data_sorted[tuple(self.sl)], self.axis)

    @property
    def mean_winsorized(self):
        """mean of winsorized data"""
        return np.mean(self.data_winsorized, self.axis)

    @property
    def var_winsorized(self):
        """variance of winsorized data"""
        # hardcoded ddof = 1
        return np.var(self.data_winsorized, ddof=1, axis=self.axis)

    @property
    def std_mean_trimmed(self):
        """standard error of trimmed mean"""
        se = np.sqrt(self.var_winsorized / self.nobs_reduced)
        # trimming creates correlation across trimmed observations
        # trimming is based on order statistics of the data
        # wilcox 2012, p.61
        se *= np.sqrt(self.nobs / self.nobs_reduced)
        return se

    @property
    def std_mean_winsorized(self):
        """standard error of winsorized mean"""
        # the following matches Wilcox, WRS2
        std_ = np.sqrt(self.var_winsorized / self.nobs)
        std_ *= (self.nobs - 1) / (self.nobs_reduced - 1)
        # old version
        # tm = self
        # formula from an old SAS manual page, simplified
        # std_ = np.sqrt(tm.var_winsorized / (tm.nobs_reduced - 1) *
        #               (tm.nobs - 1.) / tm.nobs)
        return std_


[docs]
    def ttest_mean(self, value=0, transform="trimmed", alternative="two-sided"):
        """
        One sample t-test for trimmed or Winsorized mean

        Parameters
        ----------
        value : float
            Value of the mean under the Null hypothesis.
        transform : {'trimmed', 'winsorized'}
            Specified whether the mean test is based on trimmed or winsorized
            data.
        alternative : {'two-sided', 'larger', 'smaller'}
            Defines the alternative hypothesis. The following options are
            available (default is 'two-sided'):

            * 'two-sided' : H1: mean not equal to value
            * 'larger' : H1: mean larger than value
            * 'smaller' : H1: mean smaller than value

        Returns
        -------
        tuple
            The tuple contains statistic, pvalue and degrees of freedom of
            the one sample t-test.

        Notes
        -----
        p-value is based on the approximate t-distribution of the test
        statistic. The approximation is valid if the underlying distribution
        is symmetric.
        """
        import statsmodels.stats.weightstats as smws

        df = self.nobs_reduced - 1
        if transform == "trimmed":
            mean_ = self.mean_trimmed
            std_ = self.std_mean_trimmed
        elif transform == "winsorized":
            mean_ = self.mean_winsorized
            std_ = self.std_mean_winsorized
        else:
            raise ValueError("transform can only be 'trimmed' or 'winsorized'")

        res = smws._tstat_generic(
            mean_, 0, std_, df, alternative=alternative, diff=value
        )
        return (*res, df)



[docs]
    def reset_fraction(self, frac):
        """
        Create a TrimmedMean instance with a new trimming fraction

        This reuses the sorted array from the current instance.

        Parameters
        ----------
        frac : float in (0, 0.5)
            The fraction of observations to trim at each tail.

        Returns
        -------
        TrimmedMean
            Instance of TrimmedMean with the new trimming fraction.
        """
        tm = TrimmedMean(self.data_sorted, frac, is_sorted=True, axis=self.axis)
        tm.data = self.data
        # TODO: this will not work if there is processing of meta-information
        #       in __init__,
        #       for example storing a pandas DataFrame or Series index
        return tm





[docs]
def scale_transform(data, center="median", transform="abs", trim_frac=0.2, axis=0):
    """
    Transform data for variance comparison for Levene type tests

    Parameters
    ----------
    data : array_like
        Observations for the data.
    center : "median", "mean", "trimmed" or float
        Statistic used for centering observations. If a float, then this
        value is used to center. Default is median.
    transform : 'abs', 'square', 'identity' or a callable
        The transform for the centered data.
    trim_frac : float in [0, 0.5)
        Fraction of observations that are trimmed on each side of the sorted
        observations. This is only used if center is `trimmed`.
    axis : int
        Axis along which the data are transformed when centering.

    Returns
    -------
    res : ndarray
        Transformed data in the same shape as the original data.
    """
    x = np.asarray(data)  # x is shorthand from earlier code

    if transform == "abs":
        tfunc = np.abs
    elif transform == "square":

        def tfunc(x):
            return x * x

    elif transform == "identity":

        def tfunc(x):
            return x

    elif callable(transform):
        tfunc = transform
    else:
        raise ValueError("transform should be abs, square or exp")

    if center == "median":
        res = tfunc(x - np.expand_dims(np.median(x, axis=axis), axis))
    elif center == "mean":
        res = tfunc(x - np.expand_dims(np.mean(x, axis=axis), axis))
    elif center == "trimmed":
        center = trim_mean(x, trim_frac, axis=axis)
        res = tfunc(x - np.expand_dims(center, axis))
    elif isinstance(center, numbers.Number):
        res = tfunc(x - center)
    else:
        raise ValueError("center should be median, mean or trimmed")

    return res