Missing Data
All of the models can handle missing data. For performance reasons, the default is not to do any checking for missing data. If, however, you would like missing data to be handled internally, you can do so by using the missing keyword argument. The default, missing='none', is to do nothing:
In [1]: import statsmodels.api as sm
In [2]: data = sm.datasets.longley.load()
In [3]: data.exog = sm.add_constant(data.exog)
# add in some missing data (requires: import numpy as np)
In [4]: missing_idx = np.array([False] * len(data.endog))
In [5]: missing_idx[[4, 10, 15]] = True
In [6]: data.endog[missing_idx] = np.nan
In [7]: ols_model = sm.OLS(data.endog, data.exog)
In [8]: ols_fit = ols_model.fit()
In [9]: print(ols_fit.params)
const NaN
GNPDEFL NaN
GNP NaN
UNEMP NaN
ARMED NaN
POP NaN
YEAR NaN
dtype: float64
This fails silently: all of the model parameters are NaN, which is probably not what you expected. If you are not sure whether you have missing data, you can use missing='raise'. This will raise a MissingDataError during model instantiation if missing data is present, so that you know something is wrong in your input data.
In [10]: ols_model = sm.OLS(data.endog, data.exog, missing='raise')
---------------------------------------------------------------------------
MissingDataError Traceback (most recent call last)
Cell In[10], line 1
----> 1 ols_model = sm.OLS(data.endog, data.exog, missing='raise')
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/regression/linear_model.py:923, in OLS.__init__(self, endog, exog, missing, hasconst, **kwargs)
920 msg = ("Weights are not supported in OLS and will be ignored"
921 "An exception will be raised in the next version.")
922 warnings.warn(msg, ValueWarning)
--> 923 super(OLS, self).__init__(endog, exog, missing=missing,
924 hasconst=hasconst, **kwargs)
925 if "weights" in self._init_keys:
926 self._init_keys.remove("weights")
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/regression/linear_model.py:748, in WLS.__init__(self, endog, exog, weights, missing, hasconst, **kwargs)
746 else:
747 weights = weights.squeeze()
--> 748 super(WLS, self).__init__(endog, exog, missing=missing,
749 weights=weights, hasconst=hasconst, **kwargs)
750 nobs = self.exog.shape[0]
751 weights = self.weights
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/regression/linear_model.py:202, in RegressionModel.__init__(self, endog, exog, **kwargs)
201 def __init__(self, endog, exog, **kwargs):
--> 202 super(RegressionModel, self).__init__(endog, exog, **kwargs)
203 self.pinv_wexog: Float64Array | None = None
204 self._data_attr.extend(['pinv_wexog', 'wendog', 'wexog', 'weights'])
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/base/model.py:270, in LikelihoodModel.__init__(self, endog, exog, **kwargs)
269 def __init__(self, endog, exog=None, **kwargs):
--> 270 super().__init__(endog, exog, **kwargs)
271 self.initialize()
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/base/model.py:95, in Model.__init__(self, endog, exog, **kwargs)
93 missing = kwargs.pop('missing', 'none')
94 hasconst = kwargs.pop('hasconst', None)
---> 95 self.data = self._handle_data(endog, exog, missing, hasconst,
96 **kwargs)
97 self.k_constant = self.data.k_constant
98 self.exog = self.data.exog
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/base/model.py:135, in Model._handle_data(self, endog, exog, missing, hasconst, **kwargs)
134 def _handle_data(self, endog, exog, missing, hasconst, **kwargs):
--> 135 data = handle_data(endog, exog, missing, hasconst, **kwargs)
136 # kwargs arrays could have changed, easier to just attach here
137 for key in kwargs:
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/base/data.py:675, in handle_data(endog, exog, missing, hasconst, **kwargs)
672 exog = np.asarray(exog)
674 klass = handle_data_class_factory(endog, exog)
--> 675 return klass(endog, exog=exog, missing=missing, hasconst=hasconst,
676 **kwargs)
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/base/data.py:72, in ModelData.__init__(self, endog, exog, missing, hasconst, **kwargs)
70 self.formula = kwargs.pop('formula')
71 if missing != 'none':
---> 72 arrays, nan_idx = self.handle_missing(endog, exog, missing,
73 **kwargs)
74 self.missing_row_idx = nan_idx
75 self.__dict__.update(arrays) # attach all the data arrays
File /opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/base/data.py:286, in ModelData.handle_missing(cls, endog, exog, missing, **kwargs)
283 return combined, []
285 elif missing == 'raise':
--> 286 raise MissingDataError("NaNs were encountered in the data")
288 elif missing == 'drop':
289 nan_mask = ~nan_mask
MissingDataError: NaNs were encountered in the data
If you want statsmodels to handle the missing data by dropping the affected observations, use missing='drop'.
In [11]: ols_model = sm.OLS(data.endog, data.exog, missing='drop')
We are considering adding a configuration framework so that you can set the option with a global setting.