Source code for statsmodels.nonparametric._kernel_base

"""
Module containing the base object for multivariate kernel density and
regression, plus some utilities
"""

from __future__ import annotations

import copy

import numpy as np
from scipy import optimize
from scipy.stats.mstats import mquantiles

from statsmodels.tools.rng_qrng import check_random_state

try:
    import joblib

    has_joblib = True
except ImportError:
    has_joblib = False

from . import kernels

kernel_func = dict(
    wangryzin=kernels.wang_ryzin,
    aitchisonaitken=kernels.aitchison_aitken,
    gaussian=kernels.gaussian,
    aitchison_aitken_reg=kernels.aitchison_aitken_reg,
    wangryzin_reg=kernels.wang_ryzin_reg,
    gauss_convolution=kernels.gaussian_convolution,
    wangryzin_convolution=kernels.wang_ryzin_convolution,
    aitchisonaitken_convolution=kernels.aitchison_aitken_convolution,
    gaussian_cdf=kernels.gaussian_cdf,
    aitchisonaitken_cdf=kernels.aitchison_aitken_cdf,
    wangryzin_cdf=kernels.wang_ryzin_cdf,
    d_gaussian=kernels.d_gaussian,
    tricube=kernels.tricube,
)


def _compute_min_std_IQR(data):
    """
    Compute minimum of std and IQR for each variable

    Parameters
    ----------
    data : ndarray
        The data for which to compute the dispersion measure, variables
        in columns.

    Returns
    -------
    ndarray
        The minimum of the standard deviation and IQR / 1.349 for each
        variable.
    """
    s1 = np.std(data, axis=0)
    q75 = mquantiles(data, 0.75, axis=0).data[0]
    q25 = mquantiles(data, 0.25, axis=0).data[0]
    s2 = (q75 - q25) / 1.349  # IQR
    dispersion = np.minimum(s1, s2)
    return dispersion


def _compute_subset(
    class_type,
    data,
    bw,
    co,
    do,
    n_cvars,
    ix_ord,
    ix_unord,
    n_sub,
    class_vars,
    randomize,
    bound,
    generator,
):
    """
    Compute bw on subset of data

    Called from ``GenericKDE._compute_efficient_*``.

    Parameters
    ----------
    class_type : str
        The class name, one of {"KDEMultivariate", "KDEMultivariateConditional",
        "KernelReg"}, used to determine which class to instantiate on the
        subset of data.
    data : ndarray
        The full data array from which the subset is drawn.
    bw : ndarray
        The bandwidth values used to fit the sub-sample model.
    co : float
        Twice the order of the continuous kernel, used in the scaling
        factor exponent.
    do : float
        Twice the order of the discrete kernel, used in the scaling
        factor exponent.
    n_cvars : int
        The number of continuous variables.
    ix_ord : ndarray
        Boolean index array indicating which variables are ordered
        discrete.
    ix_unord : ndarray
        Boolean index array indicating which variables are unordered
        discrete.
    n_sub : int
        The size of the sub-sample.
    class_vars : tuple
        Additional variables required to instantiate `class_type`.
    randomize : bool
        If True, a random sub-sample of size `n_sub` is drawn from `data`.
        If False, the sub-sample is sliced from `data` using `bound`.
    bound : tuple of int
        The (start, stop) indices used to slice `data` when `randomize`
        is False.
    generator : {RandomState, Generator}
        The random number generator used when `randomize` is True.

    Returns
    -------
    sample_scale_sub : ndarray
        The estimated scaling factor for the sub-sample.
    bw_sub : ndarray
        The bandwidth estimated on the sub-sample.

    Notes
    -----
    Needs to be outside the class in order for joblib to be able to pickle it.
    """
    if randomize:
        generator.shuffle(data)
        sub_data = data[:n_sub, :]
    else:
        sub_data = data[bound[0] : bound[1], :]

    if class_type == "KDEMultivariate":
        from .kernel_density import KDEMultivariate

        var_type = class_vars[0]
        sub_model = KDEMultivariate(
            sub_data,
            var_type,
            bw=bw,
            defaults=EstimatorSettings(efficient=False),
            rng=generator,
        )
    elif class_type == "KDEMultivariateConditional":
        from .kernel_density import KDEMultivariateConditional

        k_dep, dep_type, indep_type = class_vars
        endog = sub_data[:, :k_dep]
        exog = sub_data[:, k_dep:]
        sub_model = KDEMultivariateConditional(
            endog,
            exog,
            dep_type,
            indep_type,
            bw=bw,
            defaults=EstimatorSettings(efficient=False),
            rng=generator,
        )
    elif class_type == "KernelReg":
        from .kernel_regression import KernelReg

        var_type, k_vars, reg_type = class_vars
        endog = _adjust_shape(sub_data[:, 0], 1)
        exog = _adjust_shape(sub_data[:, 1:], k_vars)
        sub_model = KernelReg(
            endog=endog,
            exog=exog,
            reg_type=reg_type,
            var_type=var_type,
            bw=bw,
            defaults=EstimatorSettings(efficient=False),
            rng=generator,
        )

    else:
        raise ValueError(
            "class_type not recognized, should be one of "
            "{KDEMultivariate, KDEMultivariateConditional, KernelReg}"
        )

    # Compute dispersion in next 4 lines
    if class_type == "KernelReg":
        sub_data = sub_data[:, 1:]

    dispersion = _compute_min_std_IQR(sub_data)

    fct = dispersion * n_sub ** (-1.0 / (n_cvars + co))
    fct[ix_unord] = n_sub ** (-2.0 / (n_cvars + do))
    fct[ix_ord] = n_sub ** (-2.0 / (n_cvars + do))
    sample_scale_sub = sub_model.bw / fct  # TODO: check if correct
    bw_sub = sub_model.bw
    return sample_scale_sub, bw_sub


class GenericKDE:
    """
    Base class for density estimation and regression KDE classes
    """

    def _compute_bw(self, bw):
        """
        Computes the bandwidth of the data

        Parameters
        ----------
        bw : {array_like, str}
            If array_like: user-specified bandwidth.
            If a string, should be one of:

                - cv_ml: cross validation maximum likelihood
                - normal_reference: normal reference rule of thumb
                - cv_ls: cross validation least squares

        Notes
        -----
        The default values for bw is 'normal_reference'.
        """
        if bw is None:
            bw = "normal_reference"

        if not isinstance(bw, str):
            self._bw_method = "user-specified"
            res = np.asarray(bw)
        else:
            # The user specified a bandwidth selection method
            self._bw_method = bw
            # Workaround to avoid instance methods in __dict__
            if bw == "normal_reference":
                bwfunc = self._normal_reference
            elif bw == "cv_ml":
                bwfunc = self._cv_ml
            else:  # bw == 'cv_ls'
                bwfunc = self._cv_ls
            res = bwfunc()

        return res

    def _compute_dispersion(self, data):
        """
        Computes the measure of dispersion

        The minimum of the standard deviation and interquartile range / 1.349

        Parameters
        ----------
        data : ndarray
            The data for which to compute the dispersion measure, variables
            in columns.

        Returns
        -------
        ndarray
            The minimum of the standard deviation and IQR / 1.349 for each
            variable.

        Notes
        -----
        Reimplemented in `KernelReg`, because the first column of `data` has to
        be removed.

        References
        ----------
        See the user guide for the np package in R.
        In the notes on bwscaling option in npreg, npudens, npcdens there is
        a discussion on the measure of dispersion
        """
        return _compute_min_std_IQR(data)

    def _get_class_vars_type(self):
        """
        Helper method to be able to pass needed vars to _compute_subset

        Notes
        -----
        Needs to be implemented by subclasses.
        """

    def _compute_efficient(self, bw):
        """
        Computes the bandwidth by estimating the scaling factor (c)

        Uses n_res resamples of size ``n_sub`` (in `randomize` case), or by
        dividing ``nobs`` into as many ``n_sub`` blocks as needed (if
        `randomize` is False).

        Parameters
        ----------
        bw : {array_like, str, None}
            If a string, the bandwidth selection method, used to set
            ``self._bw_method``. Otherwise `bw` is returned unchanged.

        Returns
        -------
        ndarray
            The estimated bandwidth if `bw` is a string, otherwise `bw`
            unchanged.

        References
        ----------
        See p.9 in socserv.mcmaster.ca/racine/np_faq.pdf
        """

        if bw is None:
            self._bw_method = "normal_reference"
        if isinstance(bw, str):
            self._bw_method = bw
        else:
            self._bw_method = "user-specified"
            return bw

        nobs = self.nobs
        n_sub = self.n_sub
        data = copy.deepcopy(self.data)
        n_cvars = self.data_type.count("c")
        co = 4  # 2*order of continuous kernel
        do = 4  # 2*order of discrete kernel
        _, ix_ord, ix_unord = _get_type_pos(self.data_type)

        # Define bounds for slicing the data
        if self.randomize:
            # randomize chooses blocks of size n_sub, independent of nobs
            bounds = [None] * self.n_res
        else:
            bounds = [(i * n_sub, (i + 1) * n_sub) for i in range(nobs // n_sub)]
            if nobs % n_sub > 0:
                bounds.append((nobs - nobs % n_sub, nobs))

        n_blocks = self.n_res if self.randomize else len(bounds)
        sample_scale = np.empty((n_blocks, self.k_vars))
        only_bw = np.empty((n_blocks, self.k_vars))

        class_type, class_vars = self._get_class_vars_type()
        if has_joblib:
            # `res` is a list of tuples (sample_scale_sub, bw_sub)
            res = joblib.Parallel(n_jobs=self.n_jobs)(
                joblib.delayed(_compute_subset)(
                    class_type,
                    data,
                    bw,
                    co,
                    do,
                    n_cvars,
                    ix_ord,
                    ix_unord,
                    n_sub,
                    class_vars,
                    self.randomize,
                    bounds[i],
                    self._generator,
                )
                for i in range(n_blocks)
            )
        else:
            res = []
            for i in range(n_blocks):
                res.append(
                    _compute_subset(
                        class_type,
                        data,
                        bw,
                        co,
                        do,
                        n_cvars,
                        ix_ord,
                        ix_unord,
                        n_sub,
                        class_vars,
                        self.randomize,
                        bounds[i],
                        self._generator,
                    )
                )

        for i in range(n_blocks):
            sample_scale[i, :] = res[i][0]
            only_bw[i, :] = res[i][1]

        s = self._compute_dispersion(data)
        order_func = np.median if self.return_median else np.mean
        m_scale = order_func(sample_scale, axis=0)
        # TODO: Check if 1/5 is correct in line below!
        bw = m_scale * s * nobs ** (-1.0 / (n_cvars + co))
        bw[ix_ord] = m_scale[ix_ord] * nobs ** (-2.0 / (n_cvars + do))
        bw[ix_unord] = m_scale[ix_unord] * nobs ** (-2.0 / (n_cvars + do))

        if self.return_only_bw:
            bw = np.median(only_bw, axis=0)

        return bw

    def _set_defaults(self, defaults):
        """
        Sets the default values for the efficient estimation

        Parameters
        ----------
        defaults : EstimatorSettings
            The settings used for the efficient estimation of the
            bandwidth.
        """
        self.n_res = defaults.n_res
        self.n_sub = defaults.n_sub
        self.randomize = defaults.randomize
        self.return_median = defaults.return_median
        self.efficient = defaults.efficient
        self.return_only_bw = defaults.return_only_bw
        self.n_jobs = defaults.n_jobs

    def _normal_reference(self):
        """
        Returns Scott's normal reference rule of thumb bandwidth parameter

        Returns
        -------
        ndarray
            The bandwidth estimated by the normal reference rule of thumb.

        Notes
        -----
        See p.13 in [2] for an example and discussion.  The formula for the
        bandwidth is

        .. math:: h = 1.06n^{-1/(4+q)}

        where ``n`` is the number of observations and ``q`` is the number of
        variables.
        """
        X = np.std(self.data, axis=0)
        return 1.06 * X * self.nobs ** (-1.0 / (4 + self.data.shape[1]))

    def _set_bw_bounds(self, bw):
        """
        Sets bandwidth lower bound to effectively zero (1e-10), and for
        discrete values upper bound to 1

        Parameters
        ----------
        bw : ndarray
            The bandwidth values to bound.

        Returns
        -------
        ndarray
            The bounded bandwidth values.
        """
        bw[bw < 0] = 1e-10
        _, ix_ord, ix_unord = _get_type_pos(self.data_type)
        bw[ix_ord] = np.minimum(bw[ix_ord], 1.0)
        bw[ix_unord] = np.minimum(bw[ix_unord], 1.0)

        return bw

    def _cv_ml(self):
        r"""
        Returns the cross validation maximum likelihood bandwidth parameter

        Returns
        -------
        ndarray
            The bandwidth that maximizes the leave-one-out likelihood.

        Notes
        -----
        For more details see p.16, 18, 27 in Ref. [1] (see module docstring).

        Returns the bandwidth estimate that maximizes the leave-one-out
        likelihood.  The leave-one-out log likelihood function is:

        .. math:: \ln L=\sum_{i=1}^{n}\ln f_{-i}(X_{i})

        The leave-one-out kernel estimator of :math:`f_{-i}` is:

        .. math:: f_{-i}(X_{i})=\frac{1}{(n-1)h}
                        \sum_{j=1,j\neq i}K_{h}(X_{i},X_{j})

        where :math:`K_{h}` represents the Generalized product kernel
        estimator:

        .. math:: K_{h}(X_{i},X_{j})=\prod_{s=1}^
                        {q}h_{s}^{-1}k\left(\frac{X_{is}-X_{js}}{h_{s}}\right)
        """
        # the initial value for the optimization is the normal_reference
        h0 = self._normal_reference()
        bw = optimize.fmin(
            self.loo_likelihood,
            x0=h0,
            args=(np.log,),
            maxiter=1e3,
            maxfun=1e3,
            disp=0,
            xtol=1e-3,
        )
        bw = self._set_bw_bounds(bw)  # bound bw if necessary
        return bw

    def _cv_ls(self):
        r"""
        Returns the cross-validation least squares bandwidth parameter(s)

        Returns
        -------
        ndarray
            The bandwidth that minimizes the integrated mean square error.

        Notes
        -----
        For more details see pp. 16, 27 in Ref. [1] (see module docstring).

        Returns the value of the bandwidth that minimizes the integrated mean
        square error between the estimated and actual distribution.  The
        integrated mean square error (IMSE) is given by:

        .. math:: \int\left[\hat{f}(x)-f(x)\right]^{2}dx

        This is the general formula for the IMSE.  The IMSE differs for
        conditional (``KDEMultivariateConditional``) and unconditional
        (``KDEMultivariate``) kernel density estimation.
        """
        h0 = self._normal_reference()
        bw = optimize.fmin(self.imse, x0=h0, maxiter=1e3, maxfun=1e3, disp=0, xtol=1e-3)
        bw = self._set_bw_bounds(bw)  # bound bw if necessary
        return bw

    def loo_likelihood(self):
        """
        Returns the leave-one-out likelihood function

        Notes
        -----
        Needs to be implemented by subclasses.
        """
        raise NotImplementedError



[docs]
class EstimatorSettings:
    """
    Object to specify settings for density estimation or regression

    `EstimatorSettings` has several properties related to how bandwidth
    estimation for the `KDEMultivariate`, `KDEMultivariateConditional`,
    `KernelReg` and `CensoredKernelReg` classes behaves.

    Parameters
    ----------
    efficient : bool, optional
        If True, the bandwidth estimation is to be performed
        efficiently -- by taking smaller sub-samples and estimating
        the scaling factor of each subsample.  This is useful for large
        samples (nobs >> 300) and/or multiple variables (k_vars > 3).
        If False (default), all data is used at the same time.
    randomize : bool, optional
        If True, the bandwidth estimation is to be performed by
        taking `n_res` random resamples (with replacement) of size `n_sub` from
        the full sample.  If set to False (default), the estimation is
        performed by slicing the full sample in sub-samples of size `n_sub` so
        that all samples are used once.
    n_sub : int, optional
        Size of the sub-samples.  Default is 50.
    n_res : int, optional
        The number of random re-samples used to estimate the bandwidth.
        Only has an effect if ``randomize == True``.  Default value is 25.
    return_median : bool, optional
        If True (default), the estimator uses the median of all scaling factors
        for each sub-sample to estimate the bandwidth of the full sample.
        If False, the estimator uses the mean.
    return_only_bw : bool, optional
        If True, the estimator is to use the bandwidth and not the
        scaling factor.  This is *not* theoretically justified.
        Should be used only for experimenting.
    n_jobs : int, optional
        The number of jobs to use for parallel estimation with
        ``joblib.Parallel``.  Default is -1, meaning ``n_cores - 1``, with
        ``n_cores`` the number of available CPU cores.
        See the `joblib documentation
        <https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html>`_ for more details.

    Examples
    --------
    >>> settings = EstimatorSettings(randomize=True, n_jobs=3)
    >>> k_dens = KDEMultivariate(data, var_type, defaults=settings)
    """

    def __init__(
        self,
        efficient=False,
        randomize=False,
        n_res=25,
        n_sub=50,
        return_median=True,
        return_only_bw=False,
        n_jobs=-1,
    ):
        self.efficient = efficient
        self.randomize = randomize
        self.n_res = n_res
        self.n_sub = n_sub
        self.return_median = return_median
        self.return_only_bw = return_only_bw  # TODO: remove this?
        self.n_jobs = n_jobs



class LeaveOneOut:
    """
    Generator to give leave-one-out views on X

    Parameters
    ----------
    X : array_like
        2-D array.

    Notes
    -----
    A little lighter weight than sklearn LOO. We do not need test index.
    Also passes views on X, not the index.

    Examples
    --------
    >>> X = np.random.normal(0, 1, [10,2])
    >>> loo = LeaveOneOut(X)
    >>> for x in loo:
    ...    print(x)
    """

    def __init__(self, X):
        self.X = np.asarray(X)

    def __iter__(self):
        X = self.X
        nobs, k_vars = np.shape(X)

        for i in range(nobs):
            index = np.ones(nobs, dtype=bool)
            index[i] = False
            yield X[index, :]


def _get_type_pos(var_type):
    """
    Splits `var_type` into boolean index arrays by variable type

    Parameters
    ----------
    var_type : str
        The variable types, composed of the characters "c" (continuous),
        "o" (ordered discrete) and "u" (unordered discrete).

    Returns
    -------
    ix_cont : ndarray
        Boolean index array, True for continuous variables.
    ix_ord : ndarray
        Boolean index array, True for ordered discrete variables.
    ix_unord : ndarray
        Boolean index array, True for unordered discrete variables.
    """
    ix_cont = np.array([c == "c" for c in var_type])
    ix_ord = np.array([c == "o" for c in var_type])
    ix_unord = np.array([c == "u" for c in var_type])
    return ix_cont, ix_ord, ix_unord


def _adjust_shape(dat, k_vars):
    """
    Returns an array of shape (nobs, k_vars) for use with `gpke`

    Parameters
    ----------
    dat : array_like
        The data to reshape.
    k_vars : int
        The number of variables (columns) that `dat` should have.

    Returns
    -------
    ndarray
        `dat` reshaped to have shape (nobs, k_vars).
    """
    dat = np.asarray(dat)
    if dat.ndim > 2:
        dat = np.squeeze(dat)
    if dat.ndim == 1 and k_vars > 1:  # one obs many vars
        nobs = 1
    elif dat.ndim == 1 and k_vars == 1:  # one obs one var
        nobs = len(dat)
    else:
        if np.shape(dat)[0] == k_vars and np.shape(dat)[1] != k_vars:
            dat = dat.T

        nobs = np.shape(dat)[0]  # ndim >1 so many obs many vars

    dat = np.reshape(dat, (nobs, k_vars))
    return dat


def gpke(
    bw,
    data,
    data_predict,
    var_type,
    ckertype="gaussian",
    okertype="wangryzin",
    ukertype="aitchisonaitken",
    tosum=True,
):
    r"""
    Returns the non-normalized Generalized Product Kernel Estimator

    Parameters
    ----------
    bw : 1-D ndarray
        The user-specified bandwidth parameters.
    data : 1D or 2-D ndarray
        The training data.
    data_predict : 1-D ndarray
        The evaluation points at which the kernel estimation is performed.
    var_type : str
        The variable type (continuous, ordered, unordered).
    ckertype : str, optional
        The kernel used for the continuous variables.
    okertype : str, optional
        The kernel used for the ordered discrete variables.
    ukertype : str, optional
        The kernel used for the unordered discrete variables.
    tosum : bool, optional
        Whether or not to sum the calculated array of densities.  Default is
        True.

    Returns
    -------
    dens : array_like
        The generalized product kernel density estimator.

    Notes
    -----
    The formula for the multivariate kernel estimator for the pdf is:

    .. math:: f(x)=\frac{1}{nh_{1}...h_{q}}\sum_{i=1}^
                        {n}K\left(\frac{X_{i}-x}{h}\right)

    where

    .. math:: K\left(\frac{X_{i}-x}{h}\right) =
                k\left( \frac{X_{i1}-x_{1}}{h_{1}}\right)\times
                k\left( \frac{X_{i2}-x_{2}}{h_{2}}\right)\times...\times
                k\left(\frac{X_{iq}-x_{q}}{h_{q}}\right)
    """
    kertypes = dict(c=ckertype, o=okertype, u=ukertype)
    # Kval = []
    # for ii, vtype in enumerate(var_type):
    #    func = kernel_func[kertypes[vtype]]
    #    Kval.append(func(bw[ii], data[:, ii], data_predict[ii]))

    # Kval = np.column_stack(Kval)

    Kval = np.empty(data.shape)
    for ii, vtype in enumerate(var_type):
        func = kernel_func[kertypes[vtype]]
        Kval[:, ii] = func(bw[ii], data[:, ii], data_predict[ii])

    iscontinuous = np.array([c == "c" for c in var_type])
    dens = Kval.prod(axis=1) / np.prod(bw[iscontinuous])
    if tosum:
        return dens.sum(axis=0)
    else:
        return dens


def initialize_generator(
    entropy: int | np.random.RandomState | np.random.Generator | None,
) -> np.random.Generator | np.random.RandomState:
    """
    Handle seed transformation to a NumPy random generator object

    Parameters
    ----------
    entropy : {None, int, array_like[int], Generator, RandomState}, optional
        If an initialized NumPy random Generator or an initialized RandomState,
        the object is returned unchanged. If it is an integer or array_like of
        integers, the value is used to seed a new numpy.random.default_rng. If
        None, the functions will continue to use the legacy singleton
        RandomState.

        .. deprecated:: 0.15.0

            In release 0.17.0 or after January 2028, whichever comes sooner,
            using None will initialize a new numpy.random.default_rng using
            system entropy.

    Returns
    -------
    generator: {Generator, RandomState}
        The object that is used for random number generation.
    """
    if entropy is None:
        import warnings

        warnings.warn(
            "After 0.17 or January 2028, whichever comes first, the "
            "default behavior will switch to using an entropy initialized "
            "numpy.random.Generator.",
            FutureWarning,
            stacklevel=3,
        )
        return np.random.mtrand._rand
    return check_random_state(entropy)