Source code for statsmodels.regression._prediction

"""
Created on Fri Dec 19 11:29:18 2014

Author: Josef Perktold
License: BSD-3

"""

import numpy as np
import pandas as pd
from scipy import stats

from statsmodels.formula._manager import FormulaManager


# this is similar to ContrastResults after t_test, copied and adjusted

[docs]
class PredictionResults:
    """
    Results class for predictions

    Parameters
    ----------
    predicted_mean : ndarray
        The array containing the prediction means.
    var_pred_mean : ndarray
        The array of the variance of the prediction means.
    var_resid : ndarray
        The array of residual variances.
    df : int
        The degree of freedom used if dist is 't'.
    dist : {'norm', 't', object}
        Either a string for the normal or t distribution or another object
        that exposes a `ppf` method.
    row_labels : list[str]
        Row labels used in summary frame.

    """

    def __init__(self, predicted_mean, var_pred_mean, var_resid,
                 df=None, dist=None, row_labels=None):
        self.predicted = predicted_mean
        self.var_pred = var_pred_mean
        self.df = df
        self.var_resid = var_resid
        self.row_labels = row_labels

        if dist is None or dist == "norm":
            self.dist = stats.norm
            self.dist_args = ()
        elif dist == "t":
            self.dist = stats.t
            self.dist_args = (self.df,)
        else:
            self.dist = dist
            self.dist_args = ()

    @property
    def se_obs(self):
        return np.sqrt(self.var_pred_mean + self.var_resid)

    @property
    def se_mean(self):
        return self.se

    @property
    def predicted_mean(self):
        # alias for backwards compatibility
        return self.predicted

    @property
    def var_pred_mean(self):
        # alias for backwards compatibility
        return self.var_pred

    @property
    def se(self):
        # alias for backwards compatibility
        return np.sqrt(self.var_pred_mean)


[docs]
    def conf_int(self, obs=False, alpha=0.05):
        """
        Confidence or prediction interval for the predicted values

        Parameters
        ----------
        obs : bool, optional
            If False (default), return the confidence interval for the mean
            prediction, using the standard error of the mean (``se_mean``).
            If True, return the prediction interval for a new observation,
            using the standard error of the observation (``se_obs``), which
            also accounts for the residual variance.
        alpha : float, optional
            The significance level for the confidence interval.
            ie., The default `alpha` = .05 returns a 95% confidence interval.

        Returns
        -------
        ci : ndarray, (nobs, 2)
            The lower and upper bound of the interval for each observation.
            Column 0 contains the lower bound, column 1 the upper bound.

        """
        se = self.se_obs if obs else self.se_mean

        q = self.dist.ppf(1 - alpha / 2., *self.dist_args)
        lower = self.predicted_mean - q * se
        upper = self.predicted_mean + q * se
        return np.column_stack((lower, upper))



[docs]
    def summary_frame(self, alpha=0.05):
        # TODO: finish and cleanup
        ci_obs = self.conf_int(alpha=alpha, obs=True)  # need to split
        ci_mean = self.conf_int(alpha=alpha, obs=False)
        to_include = {}
        to_include["mean"] = self.predicted_mean
        to_include["mean_se"] = self.se_mean
        to_include["mean_ci_lower"] = ci_mean[:, 0]
        to_include["mean_ci_upper"] = ci_mean[:, 1]
        to_include["obs_ci_lower"] = ci_obs[:, 0]
        to_include["obs_ci_upper"] = ci_obs[:, 1]

        self.table = to_include
        # pandas dict does not handle 2d_array
        # data = np.column_stack(list(to_include.values()))
        # names = ....
        res = pd.DataFrame(to_include, index=self.row_labels,
                           columns=to_include.keys())
        return res




def get_prediction(self, exog=None, transform=True, weights=None,
                   row_labels=None, pred_kwds=None):
    """
    Compute prediction results

    Parameters
    ----------
    self : RegressionResults
        Results instance used to generate predictions.
    exog : array_like, optional
        The values for which you want to predict.
    transform : bool, optional
        If the model was fit via a formula, do you want to pass
        exog through the formula. Default is True. E.g., if you fit
        a model y ~ log(x1) + log(x2), and transform is True, then
        you can pass a data structure that contains x1 and x2 in
        their original form. Otherwise, you'd need to log the data
        first.
    weights : array_like, optional
        Weights interpreted as in WLS, used for the variance of the predicted
        residual.
    row_labels : list
        A list of row labels to use.  If not provided, read `exog` is
        available.
    pred_kwds : dict, optional
        Some models can take additional keyword arguments, see the predict
        method of the model for details.

    Returns
    -------
    linear_model.PredictionResults
        The prediction results instance contains prediction and prediction
        variance and can on demand calculate confidence intervals and summary
        tables for the prediction of the mean and of new observations.

    """
    # prepare exog and row_labels, based on base Results.predict
    if transform and hasattr(self.model, "formula") and exog is not None:
        if isinstance(exog, pd.Series):
            # GH-6509
            exog = pd.DataFrame(exog)
        exog = FormulaManager().get_matrices(self.model.data.model_spec, exog)

    if exog is not None:
        if row_labels is None:
            row_labels = getattr(exog, "index", None)
            if callable(row_labels):
                row_labels = None

        exog = np.asarray(exog)
        if exog.ndim == 1:
            # Params informs whether a row or column vector
            if self.params.shape[0] > 1:
                exog = exog[None, :]
            else:
                exog = exog[:, None]
        exog = np.atleast_2d(exog)  # needed in count model shape[1]
    else:
        exog = self.model.exog
        if weights is None:
            weights = getattr(self.model, "weights", None)

        if row_labels is None:
            row_labels = getattr(self.model.data, "row_labels", None)

    # need to handle other arrays, TODO: is delegating to model possible ?
    if weights is not None:
        weights = np.asarray(weights)
        if (weights.size > 1 and
                (weights.ndim != 1 or weights.shape[0] == exog.shape[1])):
            raise ValueError("weights has wrong shape")

    if pred_kwds is None:
        pred_kwds = {}
    predicted_mean = self.model.predict(self.params, exog, **pred_kwds)

    covb = self.cov_params()
    var_pred_mean = (exog * np.dot(covb, exog.T).T).sum(1)
    var_resid = self.scale  # self.mse_resid / weights

    # TODO: check that we have correct scale, Refactor scale #???
    # special case for now:
    if self.cov_type == "fixed scale":
        var_resid = self.cov_kwds["scale"]

    if weights is not None:
        var_resid /= weights

    dist = ["norm", "t"][self.use_t]
    return PredictionResults(predicted_mean, var_pred_mean, var_resid,
                             df=self.df_resid, dist=dist,
                             row_labels=row_labels)