Source code for statsmodels.stats.descriptivestats

from __future__ import annotations

from statsmodels.compat.pandas import PD_LT_2, is_numeric_dtype
from statsmodels.compat.scipy import SP_LT_19

from typing import TYPE_CHECKING

import numpy as np
import pandas as pd
from pandas.api.types import is_extension_array_dtype

if PD_LT_2:
    from pandas.core.dtypes.common import is_categorical_dtype
else:
    # After pandas 2 is the minium, use the isinstance check
    def is_categorical_dtype(dtype):
        return isinstance(dtype, pd.CategoricalDtype)


from scipy import stats

from statsmodels.iolib.table import SimpleTable
from statsmodels.stats.stattools import jarque_bera
from statsmodels.tools._decorators import cache_readonly
from statsmodels.tools.docstring import Docstring, Parameter
from statsmodels.tools.docstring_helpers import Appender
from statsmodels.tools.validation import (
    array_like,
    bool_like,
    float_like,
    int_like,
)

if TYPE_CHECKING:
    from collections.abc import Sequence

PERCENTILES = (1, 5, 10, 25, 50, 75, 90, 95, 99)
QUANTILES = np.array(PERCENTILES) / 100.0


def pd_ptp(df):
    return df.max() - df.min()


def nancount(x, axis=0):
    return (1 - np.isnan(x)).sum(axis=axis)


def nanptp(arr, axis=0):
    return np.nanmax(arr, axis=axis) - np.nanmin(arr, axis=axis)


def nanuss(arr, axis=0):
    return np.nansum(arr**2, axis=axis)


def nanpercentile(arr, axis=0):
    return np.nanpercentile(arr, PERCENTILES, axis=axis)


def nankurtosis(arr, axis=0):
    return stats.kurtosis(arr, axis=axis, nan_policy="omit")


def nanskewness(arr, axis=0):
    return stats.skew(arr, axis=axis, nan_policy="omit")


MISSING = {
    "obs": nancount,
    "mean": np.nanmean,
    "std": np.nanstd,
    "max": np.nanmax,
    "min": np.nanmin,
    "ptp": nanptp,
    "var": np.nanvar,
    "skew": nanskewness,
    "uss": nanuss,
    "kurtosis": nankurtosis,
    "percentiles": nanpercentile,
}


def _kurtosis(a):
    """
    Wrapper for scipy.stats.kurtosis that returns nan instead of raising
    Error

    Parameters
    ----------
    a : array_like
        Data for which the kurtosis is computed.

    Returns
    -------
    float
        The kurtosis of `a`, or nan if scipy.stats.kurtosis raises a
        ValueError.

    Notes
    -----
    Missing options.
    """
    try:
        res = stats.kurtosis(a)
    except ValueError:
        res = np.nan
    return res


def _skew(a):
    """
    Wrapper for scipy.stats.skew that returns nan instead of raising Error

    Parameters
    ----------
    a : array_like
        Data for which the skewness is computed.

    Returns
    -------
    float
        The skewness of `a`, or nan if scipy.stats.skew raises a
        ValueError.

    Notes
    -----
    Missing options.
    """
    try:
        res = stats.skew(a)
    except ValueError:
        res = np.nan
    return res



[docs]
def sign_test(samp, mu0=0):
    """
    Signs test

    Parameters
    ----------
    samp : array_like
        1d array. The sample for which you want to perform the sign test.
    mu0 : float
        See Notes for the definition of the sign test. mu0 is 0 by
        default, but it is common to set it to the median.

    Returns
    -------
    M : float
        The test statistic for the sign test.
    p : float
        The p-value for the test.

    Raises
    ------
    ValueError
        If no observation differs from `mu0`. All values are then discarded
        as ties and the test is not defined.

    Notes
    -----
    The signs test returns

    M = (N(+) - N(-))/2

    where N(+) is the number of values above `mu0`, N(-) is the number of
    values below.  Values equal to `mu0` are discarded.

    The p-value for M is calculated using the binomial distribution
    and can be interpreted the same as for a t-test. The test-statistic
    is distributed Binom(min(N(+), N(-)), n_trials, .5) where n_trials
    equals N(+) + N(-).

    See Also
    --------
    scipy.stats.wilcoxon
    """
    samp = np.asarray(samp)
    pos = np.sum(samp > mu0)
    neg = np.sum(samp < mu0)
    if pos + neg == 0:
        raise ValueError(
            "The sign test is not defined when no observation differs from "
            "mu0. Every value in samp is equal to mu0 (or samp is empty), and "
            "tied values are discarded."
        )
    M = (pos - neg) / 2.0
    p = stats.binomtest(min(pos, neg), pos + neg, 0.5).pvalue
    return M, p



NUMERIC_STATISTICS = (
    "nobs",
    "missing",
    "mean",
    "std_err",
    "ci",
    "std",
    "iqr",
    "iqr_normal",
    "mad",
    "mad_normal",
    "coef_var",
    "range",
    "max",
    "min",
    "skew",
    "kurtosis",
    "jarque_bera",
    "mode",
    "median",
    "percentiles",
)
CATEGORICAL_STATISTICS = ("nobs", "missing", "distinct", "top", "freq")
_additional = [
    stat for stat in CATEGORICAL_STATISTICS if stat not in NUMERIC_STATISTICS
]
DEFAULT_STATISTICS = NUMERIC_STATISTICS + tuple(_additional)



[docs]
class Description:
    """
    Extended descriptive statistics for data

    Parameters
    ----------
    data : array_like
        Data to describe. Must be convertible to a pandas DataFrame.
    stats : Sequence[str], optional
        Statistics to include. If not provided the full set of statistics is
        computed. This list may evolve across versions to reflect best
        practices. Supported options are:
        "nobs", "missing", "mean", "std_err", "ci", "std", "iqr",
        "iqr_normal", "mad", "mad_normal", "coef_var", "range", "max",
        "min", "skew", "kurtosis", "jarque_bera", "mode",
        "median", "percentiles", "distinct", "top", and "freq". See Notes for
        details.
    numeric : bool, default True
        Whether to include numeric columns in the descriptive statistics.
    categorical : bool, default True
        Whether to include categorical columns in the descriptive statistics.
    alpha : float, default 0.05
        A number between 0 and 1 representing the size used to compute the
        confidence interval, which has coverage 1 - alpha.
    use_t : bool, default False
        Use the Student's t distribution to construct confidence intervals.
    percentiles : sequence[float]
        A distinct sequence of floating point values all between 0 and 100.
        The default percentiles are 1, 5, 10, 25, 50, 75, 90, 95, 99.
    ntop : int, default 5
        The number of top categorical labels to report. Default is 5.

    Attributes
    ----------
    numeric_statistics
        The list of supported statistics for numeric data
    categorical_statistics
        The list of supported statistics for categorical data
    default_statistics
        The default list of statistics

    See Also
    --------
    pandas.DataFrame.describe
        Basic descriptive statistics
    describe
        A simplified version that returns a DataFrame

    Notes
    -----
    The selectable statistics include:

    * "nobs" - Number of observations
    * "missing" - Number of missing observations
    * "mean" - Mean
    * "std_err" - Standard Error of the mean assuming no correlation
    * "ci" - Confidence interval with coverage (1 - alpha) using the normal or
      t. This option creates two entries in any tables: lower_ci and upper_ci.
    * "std" - Standard Deviation
    * "iqr" - Interquartile range
    * "iqr_normal" - Interquartile range relative to a Normal
    * "mad" - Mean absolute deviation
    * "mad_normal" - Mean absolute deviation relative to a Normal
    * "coef_var" - Coefficient of variation
    * "range" - Range between the maximum and the minimum
    * "max" - The maximum
    * "min" - The minimum
    * "skew" - The skewness defined as the standardized 3rd central moment
    * "kurtosis" - The kurtosis defined as the standardized 4th central moment
    * "jarque_bera" - The Jarque-Bera test statistic for normality based on
      the skewness and kurtosis. This option creates two entries, jarque_bera
      and jarque_bera_pval.
    * "mode" - The mode of the data. This option creates two entries in all tables,
      mode and mode_freq which is the empirical frequency of the modal value.
    * "median" - The median of the data.
    * "percentiles" - The percentiles. Values included depend on the input value of
      ``percentiles``.
    * "distinct" - The number of distinct categories in a categorical.
    * "top" - The most common categories. Labeled top_n for n in 1, 2, ..., ``ntop``.
    * "freq" - The frequency of the common categories. Labeled freq_n for n in 1,
      2, ..., ``ntop``.
    """

    _int_fmt = ("nobs", "missing", "distinct")
    numeric_statistics = NUMERIC_STATISTICS
    categorical_statistics = CATEGORICAL_STATISTICS
    default_statistics = DEFAULT_STATISTICS

    def __init__(
        self,
        data: np.ndarray | pd.Series | pd.DataFrame,
        stats: Sequence[str] | None = None,
        *,
        numeric: bool = True,
        categorical: bool = True,
        alpha: float = 0.05,
        use_t: bool = False,
        percentiles: Sequence[int | float] = PERCENTILES,
        ntop: bool = 5,
    ):
        data_arr = data
        if not isinstance(data, (pd.Series, pd.DataFrame)):
            data_arr = array_like(data, "data", maxdim=2)
        if data_arr.ndim == 1:
            data = pd.Series(data)
        numeric = bool_like(numeric, "numeric")
        categorical = bool_like(categorical, "categorical")
        include = []
        col_types = ""
        if numeric:
            include.append(np.number)
            col_types = "numeric"
        if categorical:
            include.append("category")
            col_types += "and " if col_types != "" else ""
            col_types += "categorical"
        if not numeric and not categorical:
            raise ValueError("At least one of numeric and categorical must be True")
        self._data = pd.DataFrame(data).select_dtypes(include)
        if self._data.shape[1] == 0:

            raise ValueError(f"Selecting {col_types} results in an empty DataFrame")
        self._is_numeric = [is_numeric_dtype(dt) for dt in self._data.dtypes]
        self._is_cat_like = [is_categorical_dtype(dt) for dt in self._data.dtypes]

        if stats is not None:
            undef = [stat for stat in stats if stat not in DEFAULT_STATISTICS]
            if undef:
                raise ValueError(f"{', '.join(undef)} are not known statistics")
        self._stats = list(DEFAULT_STATISTICS) if stats is None else list(stats)
        self._ntop = int_like(ntop, "ntop")
        self._compute_top = "top" in self._stats
        self._compute_freq = "freq" in self._stats
        if self._compute_top and self._ntop <= 0 < sum(self._is_cat_like):
            raise ValueError("top must be a non-negative integer")

        # Expand special stats
        replacements = {
            "mode": ["mode", "mode_freq"],
            "ci": ["upper_ci", "lower_ci"],
            "jarque_bera": ["jarque_bera", "jarque_bera_pval"],
            "top": [f"top_{i}" for i in range(1, self._ntop + 1)],
            "freq": [f"freq_{i}" for i in range(1, self._ntop + 1)],
        }

        for key, value in replacements.items():
            if key in self._stats:
                idx = self._stats.index(key)
                self._stats = self._stats[:idx] + value + self._stats[idx + 1 :]

        self._percentiles = array_like(percentiles, "percentiles", maxdim=1, dtype="d")
        self._percentiles = np.sort(self._percentiles)
        if np.unique(self._percentiles).shape[0] != self._percentiles.shape[0]:
            raise ValueError("percentiles must be distinct")
        if np.any(self._percentiles >= 100) or np.any(self._percentiles <= 0):
            raise ValueError("percentiles must be strictly between 0 and 100")
        self._alpha = float_like(alpha, "alpha")
        if not 0 < alpha < 1:
            raise ValueError("alpha must be strictly between 0 and 1")
        self._use_t = bool_like(use_t, "use_t")

    def _reorder(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.loc[[s for s in self._stats if s in df.index]]


[docs]
    @cache_readonly
    def frame(self) -> pd.DataFrame:
        """
        Descriptive statistics for both numeric and categorical data

        Returns
        -------
        DataFrame
            The statistics
        """
        numeric = self.numeric
        categorical = self.categorical
        if categorical.shape[1] == 0:
            return numeric
        elif numeric.shape[1] == 0:
            return categorical
        df = pd.concat([numeric, categorical], axis=1)
        return self._reorder(df[self._data.columns])



[docs]
    @cache_readonly
    def numeric(self) -> pd.DataFrame:
        """
        Descriptive statistics for numeric data

        Returns
        -------
        DataFrame
            The statistics of the numeric columns
        """
        df: pd.DataFrame = self._data.loc[:, self._is_numeric]
        cols = df.columns
        _, k = df.shape
        std = df.std()
        count = df.count()
        mean = df.mean()
        mad = (df - mean).abs().mean()
        std_err = std.copy()
        std_err.loc[count > 0] /= count.loc[count > 0] ** 0.5
        if self._use_t:
            q = stats.t(count - 1).ppf(1.0 - self._alpha / 2)
        else:
            q = stats.norm.ppf(1.0 - self._alpha / 2)

        def _mode(ser):
            dtype = (
                ser.dtype if isinstance(ser.dtype, np.dtype) else ser.dtype.numpy_dtype
            )
            ser_no_missing = ser.dropna().to_numpy(dtype=dtype)
            kwargs = {} if SP_LT_19 else {"keepdims": True}
            mode_res = stats.mode(ser_no_missing, **kwargs)
            # Changes in SciPy 1.10
            if np.isscalar(mode_res[0]):
                return float(mode_res[0]), mode_res[1]
            if mode_res[0].shape[0] > 0:
                return [float(np.squeeze(val)) for val in mode_res]
            return np.nan, np.nan

        if df.shape[0] == 0:
            # No observations: the mode is undefined. Skip the apply since
            # pandas' empty-result path mis-sizes the output, raising
            # "Length of values (2) does not match length of index" (GH#9891).
            mode = np.full(k, np.nan)
            mode_counts = np.full(k, np.nan)
        else:
            mode_values = df.apply(_mode).T
            if mode_values.size > 0:
                mode = np.asarray(mode_values[0], dtype=float)
                mode_counts = np.asarray(mode_values[1], dtype=np.int64)
            else:
                mode = mode_counts = np.empty(0)
        loc = count > 0
        mode_freq = np.full(mode.shape[0], np.nan)
        mode_freq[loc] = mode_counts[loc] / count.loc[loc]
        # TODO: Workaround for pandas AbstractMethodError in extension
        #  types. Remove when quantile is supported for these
        _df = df.copy()
        for col in df:
            if is_extension_array_dtype(df[col].dtype):
                if _df[col].isna().any():
                    _df[col] = _df[col].fillna(np.nan)

        if df.shape[1] > 0:
            iqr = _df.quantile(0.75) - _df.quantile(0.25)
        else:
            iqr = mean

        def _safe_jarque_bera(c):
            a = np.asarray(c)
            if a.shape[0] < 2:
                return (np.nan,) * 4
            return jarque_bera(a)

        if df.size:
            jb = df.apply(
                lambda x: list(_safe_jarque_bera(x.dropna())),
                result_type="expand",
            ).T
        else:
            # No observations (or no numeric columns): Jarque-Bera is
            # undefined. Build a NaN frame with the expected four columns so
            # the skew/kurtosis/JB lookups below do not raise KeyError
            # (GH#9891).
            jb = pd.DataFrame(np.nan, index=cols, columns=range(4))
        nan_mean = mean.copy()
        nan_mean.loc[nan_mean == 0] = np.nan
        coef_var = std / nan_mean

        results = {
            "nobs": pd.Series(np.ones(k, dtype=np.int64) * df.shape[0], index=cols),
            "missing": df.shape[0] - count,
            "mean": mean,
            "std_err": std_err,
            "upper_ci": mean + q * std_err,
            "lower_ci": mean - q * std_err,
            "std": std,
            "iqr": iqr,
            "mad": mad,
            "coef_var": coef_var,
            "range": pd_ptp(df),
            "max": df.max(),
            "min": df.min(),
            "skew": jb[2],
            "kurtosis": jb[3],
            "iqr_normal": iqr / np.diff(stats.norm.ppf([0.25, 0.75])),
            "mad_normal": mad / np.sqrt(2 / np.pi),
            "jarque_bera": jb[0],
            "jarque_bera_pval": jb[1],
            "mode": pd.Series(mode, index=cols),
            "mode_freq": pd.Series(mode_freq, index=cols),
            "median": df.median(),
        }
        final = {k: v for k, v in results.items() if k in self._stats}
        results_df = pd.DataFrame(
            list(final.values()), columns=cols, index=list(final.keys())
        )
        if "percentiles" not in self._stats:
            return results_df
        # Pandas before 1.0 cannot handle empty DF
        if df.shape[1] > 0:
            # TODO: Remove when extension types support quantile
            perc = _df.quantile(self._percentiles / 100).astype(float)
        else:
            perc = pd.DataFrame(index=self._percentiles / 100, dtype=float)
        if np.all(np.floor(100 * perc.index) == (100 * perc.index)):
            perc.index = [f"{int(100 * idx)}%" for idx in perc.index]
        else:
            dupe = True
            scale = 100
            index = perc.index
            while dupe:
                scale *= 10
                idx = np.floor(scale * perc.index)
                if np.all(np.diff(idx) > 0):
                    dupe = False
            index = np.floor(scale * index) / (scale / 100)
            fmt = f"0.{len(str(scale//100))-1}f"
            output = f"{{0:{fmt}}}%"
            perc.index = [output.format(val) for val in index]

        # Add in the names of the percentiles to the output
        self._stats = self._stats + perc.index.tolist()

        return self._reorder(pd.concat([results_df, perc], axis=0))



[docs]
    @cache_readonly
    def categorical(self) -> pd.DataFrame:
        """
        Descriptive statistics for categorical data

        Returns
        -------
        DataFrame
            The statistics of the categorical columns
        """

        df = self._data.loc[:, list(self._is_cat_like)]
        k = df.shape[1]
        cols = df.columns
        vc = {col: df[col].value_counts(normalize=True) for col in df}
        distinct = pd.Series({col: vc[col].shape[0] for col in vc}, dtype=np.int64)
        top = {}
        freq = {}
        for col, single in vc.items():
            if single.shape[0] >= self._ntop:
                top[col] = single.index[: self._ntop]
                freq[col] = np.asarray(single.iloc[:5])
            else:
                val = list(single.index)
                val += [None] * (self._ntop - len(val))
                top[col] = val
                freq_val = list(single)
                freq_val += [np.nan] * (self._ntop - len(freq_val))
                freq[col] = np.asarray(freq_val)
        index = [f"top_{i}" for i in range(1, self._ntop + 1)]
        top_df = pd.DataFrame(top, dtype="object", index=index, columns=cols)
        index = [f"freq_{i}" for i in range(1, self._ntop + 1)]
        freq_df = pd.DataFrame(freq, dtype="object", index=index, columns=cols)

        results = {
            "nobs": pd.Series(np.ones(k, dtype=np.int64) * df.shape[0], index=cols),
            "missing": df.shape[0] - df.count(),
            "distinct": distinct,
        }
        final = {k: v for k, v in results.items() if k in self._stats}
        results_df = pd.DataFrame(
            list(final.values()),
            columns=cols,
            index=list(final.keys()),
            dtype="object",
        )
        if self._compute_top:
            results_df = pd.concat([results_df, top_df], axis=0)
        if self._compute_freq:
            results_df = pd.concat([results_df, freq_df], axis=0)

        return self._reorder(results_df)



[docs]
    def summary(self) -> SimpleTable:
        """
        Summary table of the descriptive statistics

        Returns
        -------
        SimpleTable
            A table instance supporting export to text, csv and LaTeX
        """
        df = self.frame.astype(object)
        if df.isna().any().any():
            df = df.fillna("")
        cols = [str(col) for col in df.columns]
        stubs = [str(idx) for idx in df.index]
        data = []
        for _, row in df.iterrows():
            data.append(list(row))

        def _formatter(v):
            if isinstance(v, str):
                return v
            elif v // 1 == v:
                return str(int(v))
            return f"{v:0.4g}"

        return SimpleTable(
            data,
            header=cols,
            stubs=stubs,
            title="Descriptive Statistics",
            txt_fmt={"data_fmts": {0: "%s", 1: _formatter}},
            datatypes=[1] * len(data),
        )


    def __str__(self) -> str:
        return str(self.summary().as_text())



ds = Docstring(Description.__doc__)
ds.replace_block("Returns", Parameter(None, "DataFrame", ["Descriptive statistics"]))
ds.replace_block("Attributes", [])
ds.replace_block(
    "See Also",
    [
        (
            [("pandas.DataFrame.describe", None)],
            ["Basic descriptive statistics"],
        ),
        (
            [("Description", None)],
            ["Descriptive statistics class with additional output options"],
        ),
    ],
)



[docs]
@Appender(str(ds))
def describe(
    data: np.ndarray | pd.Series | pd.DataFrame,
    stats: Sequence[str] | None = None,
    *,
    numeric: bool = True,
    categorical: bool = True,
    alpha: float = 0.05,
    use_t: bool = False,
    percentiles: Sequence[int | float] = PERCENTILES,
    ntop: bool = 5,
) -> pd.DataFrame:
    return Description(
        data,
        stats,
        numeric=numeric,
        categorical=categorical,
        alpha=alpha,
        use_t=use_t,
        percentiles=percentiles,
        ntop=ntop,
    ).frame



class Describe:
    """Removed"""

    def __init__(self, dataset):
        raise NotImplementedError("Describe has been removed")