# Source code for statsmodels.base.elastic_net

```python
"""
Elastic net regularization.

Routines for fitting regression models using elastic net
regularization.  The elastic net minimizes the objective function

-llf / nobs + alpha * ((1 - L1_wt) * sum(params**2) / 2 +
L1_wt * sum(abs(params)))

The algorithm implemented here closely follows the implementation in
the R glmnet package, documented here:

http://cran.r-project.org/web/packages/glmnet/index.html

and here:

http://www.jstatsoft.org/v33/i01/paper

This routine should work for any regression model that implements
loglike, score, and hessian.
"""
import numpy as np

from statsmodels.base.model import Results
import statsmodels.base.wrapper as wrap
from statsmodels.tools.decorators import cache_readonly

def _gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds):
    """
    Negative penalized log-likelihood functions.

    Returns the negative penalized log-likelihood, its derivative, and
    its Hessian.  The penalty only includes the smooth (L2) term.

    All three functions have argument signature (x, model), where
    ``x`` is a point in the parameter space and ``model`` is an
    arbitrary statsmodels regression model.
    """

    def nploglike(params, model):
        nobs = model.nobs
        pen_llf = alpha[k] * (1 - L1_wt) * np.sum(params**2) / 2
        llf = model.loglike(np.r_[params], **loglike_kwds)
        return -llf / nobs + pen_llf

    def npscore(params, model):
        nobs = model.nobs
        pen_grad = alpha[k] * (1 - L1_wt) * params
        gr = -model.score(np.r_[params], **score_kwds)[0] / nobs
        return gr + pen_grad

    def nphess(params, model):
        nobs = model.nobs
        pen_hess = alpha[k] * (1 - L1_wt)
        h = -model.hessian(np.r_[params], **hess_kwds)[0, 0] / nobs + pen_hess
        return h

    return nploglike, npscore, nphess
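
# Sketch (comments only, not executed): fit_elasticnet below builds one
# (nploglike, npscore, nphess) triple per coordinate and unpacks it during
# each coordinate descent sweep:
#
#     fgh_list = [_gen_npfuncs(k, L1_wt, alpha, {}, {}, {})
#                 for k in range(k_exog)]
#     func, grad, hess = fgh_list[k]   # each has signature (x, model)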

def fit_elasticnet(model, method="coord_descent", maxiter=100,
                   alpha=0., L1_wt=1., start_params=None, cnvrg_tol=1e-7,
                   zero_tol=1e-8, refit=False, check_step=True,
                   loglike_kwds=None, score_kwds=None, hess_kwds=None):
"""
Return an elastic net regularized fit to a regression model.

Parameters
----------
model : model object
A statsmodels object implementing ``loglike``, ``score``, and
``hessian``.
method : {'coord_descent'}
Only the coordinate descent algorithm is implemented.
maxiter : int
The maximum number of iteration cycles (an iteration cycle
involves running coordinate descent on all variables).
alpha : scalar or array_like
The penalty weight.  If a scalar, the same penalty weight
applies to all variables in the model.  If a vector, it
must have the same length as `params`, and contains a
penalty weight for each coefficient.
L1_wt : scalar
The fraction of the penalty given to the L1 penalty term.
Must be between 0 and 1 (inclusive).  If 0, the fit is
a ridge fit, if 1 it is a lasso fit.
start_params : array_like
Starting values for `params`.
cnvrg_tol : scalar
If `params` changes by less than this amount (in sup-norm)
in one iteration cycle, the algorithm terminates with
convergence.
zero_tol : scalar
Any estimated coefficient smaller than this value is
replaced with zero.
refit : bool
If True, the model is refit using only the variables that have
non-zero coefficients in the regularized fit.  The refitted
model is not regularized.
check_step : bool
If True, confirm that the first step is an improvement and search
further if it is not.
loglike_kwds : dict-like or None
Keyword arguments for the log-likelihood function.
score_kwds : dict-like or None
Keyword arguments for the score function.
hess_kwds : dict-like or None
Keyword arguments for the Hessian function.

Returns
-------
Results
A results object.

Notes
-----
The ``elastic net`` penalty is a combination of L1 and L2
penalties.

The function that is minimized is:

-loglike/n + alpha*((1-L1_wt)*|params|_2^2/2 + L1_wt*|params|_1)

where |*|_1 and |*|_2 are the L1 and L2 norms.

The computational approach used here is to obtain a quadratic
approximation to the smooth part of the target function:

-loglike/n + alpha*(1-L1_wt)*|params|_2^2/2

then repeatedly optimize the L1 penalized version of this function
along coordinate axes.
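
    Examples
    --------
    A minimal sketch with simulated data; in practice this function is
    usually reached through a model's ``fit_regularized`` method rather
    than called directly:

    >>> import numpy as np
    >>> import statsmodels.api as sm
    >>> x = np.random.standard_normal((100, 3))
    >>> y = x[:, 0] + np.random.standard_normal(100)
    >>> result = fit_elasticnet(sm.OLS(y, x), alpha=0.1, L1_wt=0.5)
    >>> result.params.shape
    (3,)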
"""

    k_exog = model.exog.shape[1]

    loglike_kwds = {} if loglike_kwds is None else loglike_kwds
    score_kwds = {} if score_kwds is None else score_kwds
    hess_kwds = {} if hess_kwds is None else hess_kwds

    if np.isscalar(alpha):
        alpha = alpha * np.ones(k_exog)

    # Define starting params
    if start_params is None:
        params = np.zeros(k_exog)
    else:
        params = start_params.copy()

    btol = 1e-4
    params_zero = np.zeros(len(params), dtype=bool)

    init_args = model._get_init_kwds()
    # we do not need a copy of init_args because _get_init_kwds
    # provides a new dict
    init_args['hasconst'] = False
    model_offset = init_args.pop('offset', None)
    if 'exposure' in init_args and init_args['exposure'] is not None:
        if model_offset is None:
            model_offset = np.log(init_args.pop('exposure'))
        else:
            model_offset += np.log(init_args.pop('exposure'))

    fgh_list = [
        _gen_npfuncs(k, L1_wt, alpha, loglike_kwds, score_kwds, hess_kwds)
        for k in range(k_exog)]

    converged = False

    for itr in range(maxiter):

        # Sweep through the parameters
        params_save = params.copy()
        for k in range(k_exog):

            # Under the active set method, if a parameter becomes
            # zero we do not try to change it again.
            # TODO: give the user the option to switch this off
            if params_zero[k]:
                continue

            # Set the offset to account for the variables that are
            # being held fixed in the current coordinate
            # optimization.
            params0 = params.copy()
            params0[k] = 0
            offset = np.dot(model.exog, params0)
            if model_offset is not None:
                offset += model_offset

            # Create a one-variable model for optimization.
            model_1var = model.__class__(
                model.endog, model.exog[:, k], offset=offset, **init_args)

            # Do the one-dimensional optimization.
            func, grad, hess = fgh_list[k]
            params[k] = _opt_1d(
                func, grad, hess, model_1var, params[k], alpha[k]*L1_wt,
                tol=btol, check_step=check_step)

            # Update the active set
            if itr > 0 and np.abs(params[k]) < zero_tol:
                params_zero[k] = True
                params[k] = 0.

        # Check for convergence
        pchange = np.max(np.abs(params - params_save))
        if pchange < cnvrg_tol:
            converged = True
            break

    # Set approximate zero coefficients to be exactly zero
    params[np.abs(params) < zero_tol] = 0

    if not refit:
        results = RegularizedResults(model, params)
        results.converged = converged
        return RegularizedResultsWrapper(results)

    # Fit the reduced model to get standard errors and other
    # post-estimation results.
    ii = np.flatnonzero(params)
    cov = np.zeros((k_exog, k_exog))
    init_args = dict([(k, getattr(model, k, None)) for k in model._init_keys])
    if len(ii) > 0:
        model1 = model.__class__(
            model.endog, model.exog[:, ii], **init_args)
        rslt = model1.fit()
        params[ii] = rslt.params
        cov[np.ix_(ii, ii)] = rslt.normalized_cov_params
    else:
        # Hack: no variables were selected but we need to run fit in
        # order to get the correct results class.  So just fit a model
        # with one variable.
        model1 = model.__class__(model.endog, model.exog[:, 0], **init_args)
        rslt = model1.fit(maxiter=0)

    # fit may return a results or a results wrapper
    if issubclass(rslt.__class__, wrap.ResultsWrapper):
        klass = rslt._results.__class__
    else:
        klass = rslt.__class__

    # Not all models have a scale
    if hasattr(rslt, 'scale'):
        scale = rslt.scale
    else:
        scale = 1.

    # The degrees of freedom should reflect the number of parameters
    # in the refit model, not including the zeros that are displayed
    # to indicate which variables were dropped.  See issue #1723 for
    # discussion about setting df parameters in model and results
    # classes.
    p, q = model.df_model, model.df_resid
    model.df_model = len(ii)
    model.df_resid = model.nobs - model.df_model

    # Assuming a standard signature for creating results classes.
    refit = klass(model, params, cov, scale=scale)
    refit.regularized = True
    refit.converged = converged
    refit.method = method
    refit.fit_history = {'iteration': itr + 1}

    # Restore df in model class, see issue #1723 for discussion.
    model.df_model, model.df_resid = p, q

    return refit

def _opt_1d(func, grad, hess, model, start, L1_wt, tol,
            check_step=True):
    """
    One-dimensional helper for elastic net.

    Parameters
    ----------
    func : function
        A smooth function of a single variable to be optimized
        with L1 penalty.
    grad : function
        The gradient of `func`.
    hess : function
        The Hessian of `func`.
    model : statsmodels model
        The model being fit.
    start : real
        A starting value for the function argument.
    L1_wt : non-negative real
        The weight for the L1 penalty function.
    tol : non-negative real
        A convergence threshold.
    check_step : bool
        If True, check that the first step is an improvement and
        use bisection if it is not.  If False, return after the
        first step regardless.

    Notes
    -----
    ``func``, ``grad``, and ``hess`` have argument signature (x,
    model), where ``x`` is a point in the parameter space and
    ``model`` is the model being fit.

    If the log-likelihood for the model is exactly quadratic, the
    global minimum is returned in one step.  Otherwise numerical
    bisection is used.

    Returns
    -------
    The argmin of the objective function.
    """

    # Overview:
    # We want to minimize L(x) + L1_wt*abs(x), where L() is a smooth
    # loss function that includes the log-likelihood and L2 penalty.
    # This is a 1-dimensional optimization.  If L(x) is exactly
    # quadratic we can solve for the argmin exactly.  Otherwise we
    # approximate L(x) with a quadratic function Q(x) and try to use
    # the minimizer of Q(x) + L1_wt*abs(x).  But if this yields an
    # uphill step for the actual target function L(x) + L1_wt*abs(x),
    # then we fall back to an expensive line search.  The line search
    # is never needed for OLS.
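    #
    # Worked illustration (numbers are illustrative, not computed here):
    # take Q(x) = (x - 2)**2, so c = 2, and let L1_wt = 1.  The penalized
    # minimizer is the soft-thresholded value max(2 - L1_wt/c, 0) = 1.5.
    # Starting from x = 0 we get b = -4 and d = -4, so |d| > L1_wt (no
    # hard threshold to zero) and h = -(L1_wt + b)/c = 1.5, giving
    # x + h = 1.5 as expected.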

    x = start
    f = func(x, model)
    b = grad(x, model)
    c = hess(x, model)
    d = b - c*x

    # The optimum is achieved by hard thresholding to zero
    if L1_wt > np.abs(d):
        return 0.

    # x + h is the minimizer of Q(x) + L1_wt*abs(x)
    if d >= 0:
        h = (L1_wt - b) / c
    elif d < 0:
        h = -(L1_wt + b) / c
    else:
        return np.nan

    # If the new point is not uphill for the target function, take it
    # and return.  This check is a bit expensive and unnecessary for
    # OLS.
    if not check_step:
        return x + h

    f1 = func(x + h, model) + L1_wt*np.abs(x + h)
    if f1 <= f + L1_wt*np.abs(x) + 1e-10:
        return x + h

    # Fallback for models where the loss is not quadratic
    from scipy.optimize import brent
    x_opt = brent(func, args=(model,), brack=(x-1, x+1), tol=tol)
    return x_opt

class RegularizedResults(Results):
    """
    Results for models estimated using regularization

    Parameters
    ----------
    model : Model
        The model instance used to estimate the parameters.
    params : ndarray
        The estimated (regularized) parameters.
    """
    def __init__(self, model, params):
        super(RegularizedResults, self).__init__(model, params)

    @cache_readonly
    def fittedvalues(self):
        """
        The predicted values from the model at the estimated parameters.
        """
        return self.model.predict(self.params)

class RegularizedResultsWrapper(wrap.ResultsWrapper):
    _attrs = {
        'params': 'columns',
        'resid': 'rows',
        'fittedvalues': 'rows',
    }
    _wrap_attrs = _attrs
wrap.populate_wrapper(RegularizedResultsWrapper,  # noqa:E305
                      RegularizedResults)
```
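
For reference, a minimal usage sketch (simulated data; the parameter values are illustrative).  In practice these routines are reached through a model's `fit_regularized` method, which dispatches to `fit_elasticnet` when `method="elastic_net"`:

```python
import numpy as np
import statsmodels.api as sm

# Simulate 200 observations on 10 regressors, two with nonzero effects.
rng = np.random.default_rng(0)
x = rng.standard_normal((200, 10))
y = 2 * x[:, 0] - x[:, 1] + rng.standard_normal(200)

# L1_wt=1 gives a lasso fit, L1_wt=0 a ridge fit; 0.5 mixes the two.
result = sm.OLS(y, x).fit_regularized(method="elastic_net",
                                      alpha=0.1, L1_wt=0.5)
print(result.params)            # sparse coefficient vector
print(result.fittedvalues[:5])  # predictions at the estimated parameters
```

Passing `refit=True` through `fit_regularized` re-estimates the unpenalized model on the selected variables, which yields standard errors for the retained coefficients.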