Missing Data

All of the models can handle missing data. For performance reasons, the default is not to do any checking for missing data. If, however, you would like missing data to be handled internally, you can do so by using the missing keyword argument. The default, missing='none', performs no checks and passes the data through unchanged.

In [1]: import statsmodels.api as sm

In [2]: data = sm.datasets.longley.load()

In [3]: data.exog = sm.add_constant(data.exog)

# add in some missing data (np refers to NumPy, i.e. import numpy as np)
In [4]: missing_idx = np.array([False] * len(data.endog))

In [5]: missing_idx[[4, 10, 15]] = True

In [6]: data.endog[missing_idx] = np.nan

In [7]: ols_model = sm.OLS(data.endog, data.exog)

In [8]: ols_fit = ols_model.fit()

In [9]: print(ols_fit.params)
const     NaN
GNPDEFL   NaN
GNP       NaN
UNEMP     NaN
ARMED     NaN
POP       NaN
YEAR      NaN
dtype: float64

This fails silently: all of the model parameters are NaN, which is probably not what you expected. If you are not sure whether or not you have missing data, you can use missing='raise'. This will raise a MissingDataError during model instantiation if missing data is present, so that you know something is wrong in your input data.

In [10]: ols_model = sm.OLS(data.endog, data.exog, missing='raise')
---------------------------------------------------------------------------
MissingDataError                          Traceback (most recent call last)
<ipython-input-10-5debd60362bf> in <module>
----> 1 ols_model = sm.OLS(data.endog, data.exog, missing='raise')

~/work/statsmodels/statsmodels/statsmodels/regression/linear_model.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
    890                    "An exception will be raised in the next version.")
    891             warnings.warn(msg, ValueWarning)
--> 892         super(OLS, self).__init__(endog, exog, missing=missing,
    893                                   hasconst=hasconst, **kwargs)
    894         if "weights" in self._init_keys:

~/work/statsmodels/statsmodels/statsmodels/regression/linear_model.py in __init__(self, endog, exog, weights, missing, hasconst, **kwargs)
    717         else:
    718             weights = weights.squeeze()
--> 719         super(WLS, self).__init__(endog, exog, missing=missing,
    720                                   weights=weights, hasconst=hasconst, **kwargs)
    721         nobs = self.exog.shape[0]

~/work/statsmodels/statsmodels/statsmodels/regression/linear_model.py in __init__(self, endog, exog, **kwargs)
    191     """
    192     def __init__(self, endog, exog, **kwargs):
--> 193         super(RegressionModel, self).__init__(endog, exog, **kwargs)
    194         self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
    195 

~/work/statsmodels/statsmodels/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
    265 
    266     def __init__(self, endog, exog=None, **kwargs):
--> 267         super().__init__(endog, exog, **kwargs)
    268         self.initialize()
    269 

~/work/statsmodels/statsmodels/statsmodels/base/model.py in __init__(self, endog, exog, **kwargs)
     90         missing = kwargs.pop('missing', 'none')
     91         hasconst = kwargs.pop('hasconst', None)
---> 92         self.data = self._handle_data(endog, exog, missing, hasconst,
     93                                       **kwargs)
     94         self.k_constant = self.data.k_constant

~/work/statsmodels/statsmodels/statsmodels/base/model.py in _handle_data(self, endog, exog, missing, hasconst, **kwargs)
    130 
    131     def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 132         data = handle_data(endog, exog, missing, hasconst, **kwargs)
    133         # kwargs arrays could have changed, easier to just attach here
    134         for key in kwargs:

~/work/statsmodels/statsmodels/statsmodels/base/data.py in handle_data(endog, exog, missing, hasconst, **kwargs)
    671 
    672     klass = handle_data_class_factory(endog, exog)
--> 673     return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
    674                  **kwargs)

~/work/statsmodels/statsmodels/statsmodels/base/data.py in __init__(self, endog, exog, missing, hasconst, **kwargs)
     68             self.formula = kwargs.pop('formula')
     69         if missing != 'none':
---> 70             arrays, nan_idx = self.handle_missing(endog, exog, missing,
     71                                                   **kwargs)
     72             self.missing_row_idx = nan_idx

~/work/statsmodels/statsmodels/statsmodels/base/data.py in handle_missing(cls, endog, exog, missing, **kwargs)
    282 
    283         elif missing == 'raise':
--> 284             raise MissingDataError("NaNs were encountered in the data")
    285 
    286         elif missing == 'drop':

MissingDataError: NaNs were encountered in the data

If you want statsmodels to handle the missing data by dropping the observations, use missing='drop'.

In [11]: ols_model = sm.OLS(data.endog, data.exog, missing='drop')

We are considering adding a configuration framework so that you can set the option with a global setting.