# Source code for statsmodels.sandbox.regression.try_ols_anova

``````''' convenience functions for ANOVA type analysis with OLS

Note: statistical results of ANOVA are not checked, OLS is
checked but not whether the reported results are the ones used
in ANOVA

includes form2design for creating dummy variables

TODO:
* ...
*

'''

from statsmodels.compat.python import lmap
import numpy as np
#from scipy import stats
import statsmodels.api as sm

[docs]
def data2dummy(x, returnall=False):
'''convert array of categories to dummy variables
by default drops dummy variable for last category
uses ravel, 1d only'''
x = x.ravel()
groups = np.unique(x)
if returnall:
return (x[:, None] == groups).astype(int)
else:
return (x[:, None] == groups).astype(int)[:,:-1]

[docs]
def data2proddummy(x):
'''creates product dummy variables from 2 columns of 2d array

drops last dummy variable, but not from each category
singular with simple dummy variable but not with constant

quickly written, no safeguards

'''
#brute force, assumes x is 2d
#replace with encoding if possible
groups = np.unique(lmap(tuple, x.tolist()))
#includes singularity with additive factors
return (x==groups[:,None,:]).all(-1).T.astype(int)[:,:-1]

[docs]
def data2groupcont(x1,x2):
'''create dummy continuous variable

Parameters
----------
x1 : 1d array
label or group array
x2 : 1d array (float)
continuous variable

Notes
-----
useful for group specific slope coefficients in regression
'''
if x2.ndim == 1:
x2 = x2[:,None]
dummy = data2dummy(x1, returnall=True)
return dummy * x2

# Result strings
#the second leaves the constant in, not with NIST regression
#but something fishy with res.ess negative in examples ?
#not checked if these are all the right ones

anova_str0 = '''
ANOVA statistics (model sum of squares excludes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i        %(ess)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
'''

anova_str = '''
ANOVA statistics (model sum of squares includes constant)
Source    DF  Sum Squares   Mean Square    F Value    Pr > F
Model     %(df_model)i      %(ssmwithmean)f       %(mse_model)f   %(fvalue)f %(f_pvalue)f
Error     %(df_resid)i     %(ssr)f       %(mse_resid)f
CTotal    %(nobs)i    %(uncentered_tss)f     %(mse_total)f

R squared  %(rsquared)f
'''

'''update regression results dictionary with ANOVA specific statistics

not checked for completeness
'''
ad.update(res.__dict__)  #dict does not work with cached attributes
anova_attr = ['df_model', 'df_resid', 'ess', 'ssr','uncentered_tss',
'mse_model', 'mse_resid', 'mse_total', 'fvalue', 'f_pvalue',
'rsquared']
for key in anova_attr:
ad[key] = getattr(res, key)
ad['ssmwithmean'] = res.uncentered_tss - res.ssr

[docs]
def form2design(ss, data):
'''convert string formula to data dictionary

ss : str
* I : add constant
* varname : for simple varnames data is used as is
* F:varname : create dummy variables for factor varname
* P:varname1*varname2 : create product dummy variables for
varnames
* G:varname1*varname2 : create product between factor and
continuous variable
data : dict or structured array
data set, access of variables by name as in dictionaries

Returns
-------
vars : dictionary
dictionary of variables with converted dummy variables
names : list
list of names, product (P:) and grouped continuous
variables (G:) have name by joining individual names
sorted according to input

Examples
--------
>>> xx, n = form2design('I a F:b P:c*d G:c*f', testdata)
>>> xx.keys()
['a', 'b', 'const', 'cf', 'cd']
>>> n
['const', 'a', 'b', 'cd', 'cf']

Notes
-----

with sorted dict, separate name list would not be necessary
'''
vars = {}
names = []
for item in ss.split():
if item == 'I':
vars['const'] = np.ones(data.shape[0])
names.append('const')
elif ':' not in item:
vars[item] = data[item]
names.append(item)
elif item[:2] == 'F:':
v = item.split(':')[1]
vars[v] = data2dummy(data[v])
names.append(v)
elif item[:2] == 'P:':
v = item.split(':')[1].split('*')
vars[''.join(v)] = data2proddummy(np.c_[data[v[0]],data[v[1]]])
names.append(''.join(v))
elif item[:2] == 'G:':
v = item.split(':')[1].split('*')
vars[''.join(v)] = data2groupcont(data[v[0]], data[v[1]])
names.append(''.join(v))
else:
raise ValueError('unknown expression in formula')
return vars, names

[docs]
def dropname(ss, li):
'''drop names from a list of strings,
names to drop are in space delimited list
does not change original list
'''
newli = li[:]
for item in ss.split():
newli.remove(item)
return newli

if __name__ == '__main__':

# Test Example with created data
# ------------------------------

nobs = 1000
testdataint = np.random.randint(3, size=(nobs,4)).view([('a',int),('b',int),('c',int),('d',int)])
testdatacont = np.random.normal( size=(nobs,2)).view([('e',float), ('f',float)])
import numpy.lib.recfunctions
dt2 = numpy.lib.recfunctions.zip_descr((testdataint, testdatacont),flatten=True)
# concatenate structured arrays
testdata = np.empty((nobs,1), dt2)
for name in testdataint.dtype.names:
testdata[name] = testdataint[name]
for name in testdatacont.dtype.names:
testdata[name] = testdatacont[name]

#print(form2design('a',testdata)

if 0: # print(only when nobs is small, e.g. nobs=10
xx, n = form2design('F:a',testdata)
print(xx)
print(form2design('P:a*b',testdata))
print(data2proddummy((np.c_[testdata['a'],testdata['b']])))

xx, names = form2design('a F:b P:c*d',testdata)

#xx, names = form2design('I a F:b F:c F:d P:c*d',testdata)
xx, names = form2design('I a F:b P:c*d', testdata)
xx, names = form2design('I a F:b P:c*d G:a*e f', testdata)

X = np.column_stack([xx[nn] for nn in names])
# simple test version: all coefficients equal to one
y = X.sum(1) + 0.01*np.random.normal(size=(nobs))
rest1 = sm.OLS(y,X).fit() #results
print(rest1.params)

X = np.column_stack([xx[nn] for nn in dropname('ae f', names)])
# simple test version: all coefficients equal to one
y = X.sum(1) + 0.01*np.random.normal(size=(nobs))
rest1 = sm.OLS(y,X).fit()
print(rest1.params)

# Example: from Bruce
# -------------------

#get data and clean it
#^^^^^^^^^^^^^^^^^^^^^

# requires file 'dftest3.data' posted by Bruce

# read data set and drop rows with missing data
dt_b = np.dtype([('breed', int), ('sex', int), ('litter', int),
('pen', int), ('pig', int), ('age', float),
('bage', float), ('y', float)])
dta = np.genfromtxt('dftest3.data', dt_b,missing='.', usemask=True)
print('missing', [dta.mask[k].sum() for k in dta.dtype.names])
droprows = m.reshape(-1,len(dta.dtype.names)).any(1)
# get complete data as plain structured array
# maybe does not work with masked arrays
dta_use_b1 = dta[~droprows,:].data
print(dta_use_b1.shape)
print(dta_use_b1.dtype)

#Example b1: variables from Bruce's glm
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# prepare data and dummy variables
xx_b1, names_b1 = form2design('I F:sex age', dta_use_b1)
# create design matrix
X_b1 = np.column_stack([xx_b1[nn] for nn in dropname('', names_b1)])
y_b1 = dta_use_b1['y']
# estimate using OLS
rest_b1 = sm.OLS(y_b1, X_b1).fit()
# print(results)
print(rest_b1.params)
#compare with original version only in original version

# Example: use all variables except pig identifier
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

allexog = ' '.join(dta.dtype.names[:-1])
#'breed sex litter pen pig age bage'

xx_b1a, names_b1a = form2design('I F:breed F:sex F:litter F:pen age bage', dta_use_b1)
X_b1a = np.column_stack([xx_b1a[nn] for nn in dropname('', names_b1a)])
y_b1a = dta_use_b1['y']
rest_b1a = sm.OLS(y_b1a, X_b1a).fit()
print(rest_b1a.params)