Source code for statsmodels.discrete.conditional_models

"""
Conditional logistic, Poisson, and multinomial logit regression
"""

import collections
import itertools
import warnings

import numpy as np
from scipy.special import logsumexp

import statsmodels.base.model as base
import statsmodels.base.wrapper as wrap
from statsmodels.discrete.discrete_model import (
    MultinomialResults,
    MultinomialResultsWrapper,
)
from statsmodels.formula.formulatools import advance_eval_env
import statsmodels.regression.linear_model as lm
from statsmodels.tools.sm_exceptions import ModelWarning


class _ConditionalModel(base.LikelihoodModel):

    def __init__(self, endog, exog, missing="none", **kwargs):

        if "groups" not in kwargs:
            raise ValueError("'groups' is a required argument")
        groups = kwargs["groups"]

        if groups.size != endog.size:
            msg = "'endog' and 'groups' should have the same dimensions"
            raise ValueError(msg)

        if exog.shape[0] != endog.size:
            msg = "The leading dimension of 'exog' should equal the length of 'endog'"
            raise ValueError(msg)

        super().__init__(endog, exog, missing=missing, **kwargs)

        if self.data.const_idx is not None:
            msg = "Conditional models should not have an intercept in the design matrix"
            raise ValueError(msg)

        exog = self.exog
        self.k_params = exog.shape[1]

        # Get the row indices for each group
        row_ix = {}
        for i, g in enumerate(groups):
            if g not in row_ix:
                row_ix[g] = []
            row_ix[g].append(i)

        # Split the data into groups and remove groups with no variation
        endog, exog = np.asarray(endog), np.asarray(exog)
        offset = kwargs.get("offset")
        self._endog_grp = []
        self._exog_grp = []
        self._groupsize = []
        if offset is not None:
            offset = np.asarray(offset)
            self._offset_grp = []
        self._offset = []
        self._sumy = []
        self.nobs = 0
        drops = [0, 0]
        for _, ix in row_ix.items():
            y = endog[ix].flat
            if np.std(y) == 0:
                drops[0] += 1
                drops[1] += len(y)
                continue
            self.nobs += len(y)
            self._endog_grp.append(y)
            if offset is not None:
                self._offset_grp.append(offset[ix])
            self._groupsize.append(len(y))
            self._exog_grp.append(exog[ix, :])
            self._sumy.append(np.sum(y))

        if drops[0] > 0:
            msg = (
                f"Dropped {drops[0]} groups and {drops[1]} observations for "
                f"having no within-group variance"
            )
            warnings.warn(msg, ModelWarning, stacklevel=2)

        # This can be pre-computed
        if offset is not None:
            self._endofs = []
            for k, ofs in enumerate(self._offset_grp):
                self._endofs.append(np.dot(self._endog_grp[k], ofs))

        # Number of groups
        self._n_groups = len(self._endog_grp)

        # These are the sufficient statistics
        self._xy = []
        self._n1 = []
        for g in range(self._n_groups):
            self._xy.append(np.dot(self._endog_grp[g], self._exog_grp[g]))
            self._n1.append(np.sum(self._endog_grp[g]))

    def hessian(self, params):

        from statsmodels.tools.numdiff import approx_fprime

        hess = approx_fprime(params, self.score)
        hess = np.atleast_2d(hess)
        return hess

    def fit(
        self,
        start_params=None,
        method="BFGS",
        maxiter=100,
        full_output=True,
        disp=False,
        fargs=(),
        callback=None,
        retall=False,
        skip_hessian=False,
        **kwargs,
    ):

        rslt = super().fit(
            start_params=start_params,
            method=method,
            maxiter=maxiter,
            full_output=full_output,
            disp=disp,
            skip_hessian=skip_hessian,
        )

        if skip_hessian:
            cov_params = None
        else:
            cov_params = rslt.cov_params()

        crslt = ConditionalResults(self, rslt.params, cov_params, 1)
        crslt.method = method
        crslt.nobs = self.nobs
        crslt.n_groups = self._n_groups
        crslt._group_stats = [
            f"{min(self._groupsize):d}",
            f"{max(self._groupsize):d}",
            f"{np.mean(self._groupsize):.1f}",
        ]
        rslt = ConditionalResultsWrapper(crslt)
        return rslt

    def fit_regularized(
        self, method="elastic_net", alpha=0.0, start_params=None, refit=False, **kwargs
    ):
        """
        Return a regularized fit to a linear regression model.

        Parameters
        ----------
        method : {'elastic_net'}
            Only the `elastic_net` approach is currently implemented.
        alpha : scalar or array_like
            The penalty weight.  If a scalar, the same penalty weight
            applies to all variables in the model.  If a vector, it
            must have the same length as `params`, and contains a
            penalty weight for each coefficient.
        start_params : array_like
            Starting values for `params`.
        refit : bool
            If True, the model is refit using only the variables that
            have non-zero coefficients in the regularized fit.  The
            refitted model is not regularized.
        **kwargs
            Additional keyword argument that are used when fitting the model.

        Returns
        -------
        Results
            A results instance.
        """

        from statsmodels.base.elastic_net import fit_elasticnet

        if method != "elastic_net":
            raise ValueError("method for fit_regularized must be elastic_net")

        defaults = {"maxiter": 50, "L1_wt": 1, "cnvrg_tol": 1e-10, "zero_tol": 1e-10}
        defaults.update(kwargs)

        return fit_elasticnet(
            self,
            method=method,
            alpha=alpha,
            start_params=start_params,
            refit=refit,
            **defaults,
        )

    # Override to allow groups to be passed as a variable name.
    @classmethod
    def from_formula(cls, formula, data, subset=None, drop_cols=None, *args, **kwargs):

        try:
            groups = kwargs["groups"]
            del kwargs["groups"]
        except KeyError as err:
            raise ValueError("'groups' is a required argument") from err

        if isinstance(groups, str):
            groups = data[groups]

        if "0+" not in formula.replace(" ", ""):
            warnings.warn(
                "Conditional models should not include an intercept",
                ModelWarning,
                stacklevel=2,
            )
        advance_eval_env(kwargs)
        model = super().from_formula(formula, data, *args, groups=groups, **kwargs)

        return model



[docs]
class ConditionalLogit(_ConditionalModel):
    """
    Fit a conditional logistic regression model to grouped data.

    Every group is implicitly given an intercept, but the model is fit using
    a conditional likelihood in which the intercepts are not present.  Thus,
    intercept estimates are not given, but the other parameter estimates can
    be interpreted as being adjusted for any group-level confounders.

    Parameters
    ----------
    endog : array_like
        The response variable, must contain only 0 and 1.
    exog : array_like
        The array of covariates.  Do not include an intercept
        in this array.
    groups : array_like
        Codes defining the groups. This is a required keyword parameter.
    missing : str
        Available options are 'none', 'drop', and 'raise'. If 'none', no nan
        checking is done. If 'drop', any observations with nans are dropped.
        If 'raise', an error is raised.
    """

    def __init__(self, endog, exog, missing="none", **kwargs):

        super().__init__(endog, exog, missing=missing, **kwargs)

        if np.any(np.unique(self.endog) != np.r_[0, 1]):
            msg = "endog must be coded as 0, 1"
            raise ValueError(msg)

        self.K = self.exog.shape[1]
        # i.e. self.k_params, for compatibility with MNLogit


[docs]
    def loglike(self, params):

        ll = 0
        for g in range(len(self._endog_grp)):
            ll += self.loglike_grp(g, params)

        return ll



[docs]
    def score(self, params):

        score = 0
        for g in range(self._n_groups):
            score += self.score_grp(g, params)

        return score


    def _denom(self, grp, params, ofs=None):

        if ofs is None:
            ofs = 0

        exb = np.exp(np.dot(self._exog_grp[grp], params) + ofs)

        # In the recursions, f may be called multiple times with the
        # same arguments, so we memoize the results.
        memo = {}

        def f(t, k):
            if t < k:
                return 0
            if k == 0:
                return 1

            try:
                return memo[(t, k)]
            except KeyError:
                pass

            v = f(t - 1, k) + f(t - 1, k - 1) * exb[t - 1]
            memo[(t, k)] = v

            return v

        return f(self._groupsize[grp], self._n1[grp])

    def _denom_grad(self, grp, params, ofs=None):

        if ofs is None:
            ofs = 0

        ex = self._exog_grp[grp]
        exb = np.exp(np.dot(ex, params) + ofs)

        # s may be called multiple times in the recursions with the
        # same arguments, so memoize the results.
        memo = {}

        def s(t, k):

            if t < k:
                return 0, np.zeros(self.k_params)
            if k == 0:
                return 1, 0

            try:
                return memo[(t, k)]
            except KeyError:
                pass

            h = exb[t - 1]
            a, b = s(t - 1, k)
            c, e = s(t - 1, k - 1)
            d = c * h * ex[t - 1, :]

            u, v = a + c * h, b + d + e * h
            memo[(t, k)] = (u, v)

            return u, v

        return s(self._groupsize[grp], self._n1[grp])


[docs]
    def loglike_grp(self, grp, params):

        ofs = None
        if hasattr(self, "offset"):
            ofs = self._offset_grp[grp]

        llg = np.dot(self._xy[grp], params)

        if ofs is not None:
            llg += self._endofs[grp]

        llg -= np.log(self._denom(grp, params, ofs))

        return llg



[docs]
    def score_grp(self, grp, params):

        ofs = 0
        if hasattr(self, "offset"):
            ofs = self._offset_grp[grp]

        d, h = self._denom_grad(grp, params, ofs)
        return self._xy[grp] - h / d





[docs]
class ConditionalPoisson(_ConditionalModel):
    """
    Fit a conditional Poisson regression model to grouped data.

    Every group is implicitly given an intercept, but the model is fit using
    a conditional likelihood in which the intercepts are not present.  Thus,
    intercept estimates are not given, but the other parameter estimates can
    be interpreted as being adjusted for any group-level confounders.

    Parameters
    ----------
    endog : array_like
        The response variable
    exog : array_like
        The covariates
    groups : array_like
        Codes defining the groups. This is a required keyword parameter.
    missing : str
        Available options are 'none', 'drop', and 'raise'. If 'none', no nan
        checking is done. If 'drop', any observations with nans are dropped.
        If 'raise', an error is raised.
    """


[docs]
    def loglike(self, params):

        ofs = None
        if hasattr(self, "offset"):
            ofs = self._offset_grp

        ll = 0.0

        for i in range(len(self._endog_grp)):

            xb = np.dot(self._exog_grp[i], params)
            if ofs is not None:
                xb += ofs[i]
            exb = np.exp(xb)
            y = self._endog_grp[i]
            ll += np.dot(y, xb)
            s = exb.sum()
            ll -= self._sumy[i] * np.log(s)

        return ll



[docs]
    def score(self, params):

        ofs = None
        if hasattr(self, "offset"):
            ofs = self._offset_grp

        score = 0.0

        for i in range(len(self._endog_grp)):

            x = self._exog_grp[i]
            xb = np.dot(x, params)
            if ofs is not None:
                xb += ofs[i]
            exb = np.exp(xb)
            s = exb.sum()
            y = self._endog_grp[i]
            score += np.dot(y, x)
            score -= self._sumy[i] * np.dot(exb, x) / s

        return score





[docs]
class ConditionalResults(base.LikelihoodModelResults):
    def __init__(self, model, params, normalized_cov_params, scale):

        super().__init__(
            model, params, normalized_cov_params=normalized_cov_params, scale=scale
        )


[docs]
    def summary(self, yname=None, xname=None, title=None, alpha=0.05):
        """
        Summarize the fitted model.

        Parameters
        ----------
        yname : str, optional
            Default is `y`
        xname : list[str], optional
            Names for the exogenous variables, default is "var_xx".
            Must match the number of parameters in the model
        title : str, optional
            Title for the top table. If not None, then this replaces the
            default title
        alpha : float
            Significance level for the confidence intervals

        Returns
        -------
        smry : Summary instance
            This holds the summary tables and text, which can be printed or
            converted to various output formats.

        See Also
        --------
        statsmodels.iolib.summary.Summary : class to hold summary
            results
        """

        top_left = [
            ("Dep. Variable:", None),
            ("Model:", None),
            ("Log-Likelihood:", None),
            ("Method:", [self.method]),
            ("Date:", None),
            ("Time:", None),
        ]

        top_right = [
            ("No. Observations:", None),
            ("No. groups:", [self.n_groups]),
            ("Min group size:", [self._group_stats[0]]),
            ("Max group size:", [self._group_stats[1]]),
            ("Mean group size:", [self._group_stats[2]]),
        ]

        if title is None:
            title = "Conditional Logit Model Regression Results"

        # create summary tables
        from statsmodels.iolib.summary import Summary

        smry = Summary()
        smry.add_table_2cols(
            self,
            gleft=top_left,
            gright=top_right,  # [],
            yname=yname,
            xname=xname,
            title=title,
        )
        smry.add_table_params(
            self, yname=yname, xname=xname, alpha=alpha, use_t=self.use_t
        )

        return smry





[docs]
class ConditionalMNLogit(_ConditionalModel):
    """
    Fit a conditional multinomial logit model to grouped data.

    Parameters
    ----------
    endog : array_like
        The dependent variable, must be integer-valued, coded
        0, 1, ..., c-1, where c is the number of response
        categories.
    exog : array_like
        The independent variables.
    groups : array_like
        Codes defining the groups. This is a required keyword parameter.
    missing : str
        Available options are 'none', 'drop', and 'raise'. If 'none', no nan
        checking is done. If 'drop', any observations with nans are dropped.
        If 'raise', an error is raised.

    Notes
    -----
    Equivalent to femlogit in Stata.

    References
    ----------
    Gary Chamberlain (1980).  Analysis of covariance with qualitative
    data. The Review of Economic Studies.  Vol. 47, No. 1, pp. 225-238.
    """

    def __init__(self, endog, exog, missing="none", **kwargs):

        super().__init__(endog, exog, missing=missing, **kwargs)

        # endog must be integers
        self.endog = self.endog.astype(int)

        self.k_cat = self.endog.max() + 1
        self.df_model = (self.k_cat - 1) * self.exog.shape[1]
        self.df_resid = self.nobs - self.df_model
        self._ynames_map = {j: str(j) for j in range(self.k_cat)}
        self.J = self.k_cat  # Unfortunate name, needed for results
        self.K = self.exog.shape[1]  # for compatibility with MNLogit

        if self.endog.min() < 0:
            msg = "endog may not contain negative values"
            raise ValueError(msg)

        grx = collections.defaultdict(list)
        for k, v in enumerate(self.groups):
            grx[v].append(k)
        self._group_labels = list(grx.keys())
        self._group_labels.sort()
        self._grp_ix = [grx[k] for k in self._group_labels]


[docs]
    def fit(
        self,
        start_params=None,
        method="BFGS",
        maxiter=100,
        full_output=True,
        disp=False,
        fargs=(),
        callback=None,
        retall=False,
        skip_hessian=False,
        generator=None,
        **kwargs,
    ):

        if start_params is None:
            q = self.exog.shape[1]
            c = self.k_cat - 1
            if isinstance(generator, (np.random.RandomState, np.random.Generator)):
                start_params = generator.normal(size=q * c)
            else:
                start_params = np.random.normal(size=q * c)

        # Do not call super(...).fit because it cannot handle the 2d-params.
        rslt = base.LikelihoodModel.fit(
            self,
            start_params=start_params,
            method=method,
            maxiter=maxiter,
            full_output=full_output,
            disp=disp,
            skip_hessian=skip_hessian,
        )

        rslt.params = rslt.params.reshape((self.exog.shape[1], -1))
        rslt = MultinomialResults(self, rslt)

        # Not clear what the null likelihood should be, there is no intercept
        # so the null model is not clearly defined.  This is needed for summary
        # to work.
        rslt.set_null_options(llnull=np.nan)

        return MultinomialResultsWrapper(rslt)



[docs]
    def loglike(self, params):

        q = self.exog.shape[1]
        c = self.k_cat - 1

        pmat = params.reshape((q, c))
        pmat = np.concatenate((np.zeros((q, 1)), pmat), axis=1)
        lpr = np.dot(self.exog, pmat)

        ll = 0.0

        # denom - immediately calculate the sums for the selected elements

        for ii in self._grp_ix:
            x = lpr[ii, :]
            jj = np.arange(x.shape[0], dtype=int)
            y = self.endog[ii]
            denom = np.sum(x[jj, list(itertools.permutations(y))], axis=1)
            ll += x[(jj, y)].sum() - logsumexp(denom)

        return ll



[docs]
    def score(self, params):

        q = self.exog.shape[1]
        c = self.k_cat - 1

        pmat = params.reshape((q, c))
        pmat = np.concatenate((np.zeros((q, 1)), pmat), axis=1)
        lpr = np.dot(self.exog, pmat)

        grad = np.zeros((q, c))
        for ii in self._grp_ix:
            x = lpr[ii, :]
            jj = np.arange(x.shape[0], dtype=int)
            y = self.endog[ii]
            denomg = np.zeros((q, c)).T

            # Extract itertools.permutations(y) to the list
            iter_ = np.array(list(itertools.permutations(y)))

            # Instead of iterative exponential value of sums of
            # selected elements and their product by selected
            # elements from self.exog, we calculate them at
            # once (exp_sum, exog_exp_multy).
            exp_sum = np.exp(np.sum(x[jj, iter_], axis=1))
            denom = np.sum(exp_sum)

            ind_exog = np.arange(iter_.shape[1], dtype=np.int32)
            hist = len(iter_)
            mask = iter_ != 0
            iexog = np.take(ind_exog, mask.nonzero()[1])
            iexog = iexog.reshape((hist, int(len(iexog) / hist)))
            ii_ = np.take(ii, iexog)
            exog_exp_multy = self.exog[ii_, :] * exp_sum[:, np.newaxis, np.newaxis]
            ind_iter = iter_[mask].reshape((hist, int(len(iter_[mask]) / hist))) - 1
            np.add.at(denomg, ind_iter, exog_exp_multy)
            denomg = denomg.T

            for i, r in enumerate(y):
                if r != 0:
                    grad[:, r - 1] += self.exog[ii[i], :]

            grad -= denomg / denom

        return grad.flatten()




class ConditionalResultsWrapper(lm.RegressionResultsWrapper):
    pass


wrap.populate_wrapper(ConditionalResultsWrapper, ConditionalResults)