"""convenience functions for ANOVA type analysis with OLS
Note: statistical results of ANOVA are not checked, OLS is
checked but not whether the reported results are the ones used
in ANOVA
includes form2design for creating dummy variables
TODO:
* ...
*
"""
from statsmodels.compat.python import lmap
import numpy as np
# from scipy import stats
import statsmodels.api as sm
[docs]
def data2dummy(x, returnall=False):
"""convert array of categories to dummy variables
by default drops dummy variable for last category
uses ravel, 1d only"""
x = x.ravel()
groups = np.unique(x)
if returnall:
return (x[:, None] == groups).astype(int)
else:
return (x[:, None] == groups).astype(int)[:, :-1]
[docs]
def data2proddummy(x):
"""creates product dummy variables from 2 columns of 2d array
drops last dummy variable, but not from each category
singular with simple dummy variable but not with constant
quickly written, no safeguards
"""
# brute force, assumes x is 2d
# replace with encoding if possible
groups = np.unique(lmap(tuple, x.tolist()))
# includes singularity with additive factors
return (x == groups[:, None, :]).all(-1).T.astype(int)[:, :-1]
[docs]
def data2groupcont(x1, x2):
"""create dummy continuous variable
Parameters
----------
x1 : 1d array
label or group array
x2 : 1d array (float)
continuous variable
Notes
-----
useful for group specific slope coefficients in regression
"""
if x2.ndim == 1:
x2 = x2[:, None]
dummy = data2dummy(x1, returnall=True)
return dummy * x2
# Result strings
# the second leaves the constant in, not with NIST regression
# but something fishy with res.ess negative in examples ?
# not checked if these are all the right ones
anova_str0 = """
ANOVA statistics (model sum of squares excludes constant)
Source DF Sum Squares Mean Square F Value Pr > F
Model %(df_model)i %(ess)f %(mse_model)f %(fvalue)f %(f_pvalue)f
Error %(df_resid)i %(ssr)f %(mse_resid)f
CTotal %(nobs)i %(uncentered_tss)f %(mse_total)f
R squared %(rsquared)f
"""
anova_str = """
ANOVA statistics (model sum of squares includes constant)
Source DF Sum Squares Mean Square F Value Pr > F
Model %(df_model)i %(ssmwithmean)f %(mse_model)f %(fvalue)f %(f_pvalue)f
Error %(df_resid)i %(ssr)f %(mse_resid)f
CTotal %(nobs)i %(uncentered_tss)f %(mse_total)f
R squared %(rsquared)f
"""
def anovadict(res):
"""update regression results dictionary with ANOVA specific statistics
not checked for completeness
"""
ad = {}
ad.update(res.__dict__) # dict does not work with cached attributes
anova_attr = [
"df_model",
"df_resid",
"ess",
"ssr",
"uncentered_tss",
"mse_model",
"mse_resid",
"mse_total",
"fvalue",
"f_pvalue",
"rsquared",
]
for key in anova_attr:
ad[key] = getattr(res, key)
ad["nobs"] = res.model.nobs
ad["ssmwithmean"] = res.uncentered_tss - res.ssr
return ad
[docs]
def dropname(ss, li):
"""drop names from a list of strings,
names to drop are in space delimited list
does not change original list
"""
newli = li[:]
for item in ss.split():
newli.remove(item)
return newli
if __name__ == "__main__":
# Test Example with created data
# ------------------------------
nobs = 1000
testdataint = np.random.randint(3, size=(nobs, 4)).view(
[("a", int), ("b", int), ("c", int), ("d", int)]
)
testdatacont = np.random.normal(size=(nobs, 2)).view([("e", float), ("f", float)])
import numpy.lib.recfunctions
dt2 = numpy.lib.recfunctions.zip_descr((testdataint, testdatacont), flatten=True)
# concatenate structured arrays
testdata = np.empty((nobs, 1), dt2)
for name in testdataint.dtype.names:
testdata[name] = testdataint[name]
for name in testdatacont.dtype.names:
testdata[name] = testdatacont[name]
# print(form2design('a',testdata)
if 0: # print(only when nobs is small, e.g. nobs=10
xx, n = form2design("F:a", testdata)
print(xx)
print(form2design("P:a*b", testdata))
print(data2proddummy(np.c_[testdata["a"], testdata["b"]]))
xx, names = form2design("a F:b P:c*d", testdata)
# xx, names = form2design('I a F:b F:c F:d P:c*d',testdata)
xx, names = form2design("I a F:b P:c*d", testdata)
xx, names = form2design("I a F:b P:c*d G:a*e f", testdata)
X = np.column_stack([xx[nn] for nn in names])
# simple test version: all coefficients equal to one
y = X.sum(1) + 0.01 * np.random.normal(size=(nobs))
rest1 = sm.OLS(y, X).fit() # results
print(rest1.params)
print(anova_str % anovadict(rest1))
X = np.column_stack([xx[nn] for nn in dropname("ae f", names)])
# simple test version: all coefficients equal to one
y = X.sum(1) + 0.01 * np.random.normal(size=(nobs))
rest1 = sm.OLS(y, X).fit()
print(rest1.params)
print(anova_str % anovadict(rest1))
# Example: from Bruce
# -------------------
# get data and clean it
# ^^^^^^^^^^^^^^^^^^^^^
# requires file 'dftest3.data' posted by Bruce
# read data set and drop rows with missing data
dt_b = np.dtype(
[
("breed", int),
("sex", int),
("litter", int),
("pen", int),
("pig", int),
("age", float),
("bage", float),
("y", float),
]
)
dta = np.genfromtxt("dftest3.data", dt_b, missing=".", usemask=True)
print("missing", [dta.mask[k].sum() for k in dta.dtype.names])
m = dta.mask.view(bool)
droprows = m.reshape(-1, len(dta.dtype.names)).any(1)
# get complete data as plain structured array
# maybe does not work with masked arrays
dta_use_b1 = dta[~droprows, :].data
print(dta_use_b1.shape)
print(dta_use_b1.dtype)
# Example b1: variables from Bruce's glm
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# prepare data and dummy variables
xx_b1, names_b1 = form2design("I F:sex age", dta_use_b1)
# create design matrix
X_b1 = np.column_stack([xx_b1[nn] for nn in dropname("", names_b1)])
y_b1 = dta_use_b1["y"]
# estimate using OLS
rest_b1 = sm.OLS(y_b1, X_b1).fit()
# print(results)
print(rest_b1.params)
print(anova_str % anovadict(rest_b1))
# compare with original version only in original version
# print(anova_str % anovadict(res_b0))
# Example: use all variables except pig identifier
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
allexog = " ".join(dta.dtype.names[:-1])
# 'breed sex litter pen pig age bage'
xx_b1a, names_b1a = form2design(
"I F:breed F:sex F:litter F:pen age bage", dta_use_b1
)
X_b1a = np.column_stack([xx_b1a[nn] for nn in dropname("", names_b1a)])
y_b1a = dta_use_b1["y"]
rest_b1a = sm.OLS(y_b1a, X_b1a).fit()
print(rest_b1a.params)
print(anova_str % anovadict(rest_b1a))
for dropn in names_b1a:
print(("\nResults dropping", dropn))
X_b1a_ = np.column_stack([xx_b1a[nn] for nn in dropname(dropn, names_b1a)])
y_b1a_ = dta_use_b1["y"]
rest_b1a_ = sm.OLS(y_b1a_, X_b1a_).fit()
# print(rest_b1a_.params)
print(anova_str % anovadict(rest_b1a_))