{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Generalized Linear Models (Formula)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This notebook illustrates how you can use R-style formulas to fit Generalized Linear Models.\n", "\n", "To begin, we load the ``Star98`` dataset and we construct a formula and pre-process the data:" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:38.228069Z", "iopub.status.busy": "2023-12-14T14:44:38.227750Z", "iopub.status.idle": "2023-12-14T14:44:39.300597Z", "shell.execute_reply": "2023-12-14T14:44:39.299640Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf\n", "\n", "star98 = sm.datasets.star98.load_pandas().data\n", "formula = \"SUCCESS ~ LOWINC + PERASIAN + PERBLACK + PERHISP + PCTCHRT + \\\n", " PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF\"\n", "# Only the predictor columns are copied into ``dta``; the NABOVE/NBELOW counts\n", "# are read from ``star98`` below and are never added to the modelling frame.\n", "dta = star98[\n", "    [\n", "        \"LOWINC\",\n", "        \"PERASIAN\",\n", "        \"PERBLACK\",\n", "        \"PERHISP\",\n", "        \"PCTCHRT\",\n", "        \"PCTYRRND\",\n", "        \"PERMINTE\",\n", "        \"AVYRSEXP\",\n", "        \"AVSALK\",\n", "        \"PERSPENK\",\n", "        \"PTRATIO\",\n", "        \"PCTAF\",\n", "    ]\n", "].copy()\n", "# SUCCESS is NABOVE divided by the sum of NABOVE and NBELOW.\n", "endog = star98[\"NABOVE\"] / (star98[\"NABOVE\"] + star98[\"NBELOW\"])\n", "dta[\"SUCCESS\"] = endog" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Then, we fit the GLM model:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:39.304902Z", "iopub.status.busy": "2023-12-14T14:44:39.304273Z", "iopub.status.idle": "2023-12-14T14:44:39.351547Z", "shell.execute_reply": "2023-12-14T14:44:39.350710Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Generalized Linear Model Regression Results \n", 
"==============================================================================\n", "Dep. Variable: SUCCESS No. Observations: 303\n", "Model: GLM Df Residuals: 282\n", "Model Family: Binomial Df Model: 20\n", "Link Function: Logit Scale: 1.0000\n", "Method: IRLS Log-Likelihood: -127.33\n", "Date: Thu, 14 Dec 2023 Deviance: 8.5477\n", "Time: 14:44:39 Pearson chi2: 8.48\n", "No. Iterations: 4 Pseudo R-squ. (CS): 0.1115\n", "Covariance Type: nonrobust \n", "============================================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "--------------------------------------------------------------------------------------------\n", "Intercept 0.4037 25.036 0.016 0.987 -48.665 49.472\n", "LOWINC -0.0204 0.010 -1.982 0.048 -0.041 -0.000\n", "PERASIAN 0.0159 0.017 0.910 0.363 -0.018 0.050\n", "PERBLACK -0.0198 0.020 -1.004 0.316 -0.058 0.019\n", "PERHISP -0.0096 0.010 -0.951 0.341 -0.029 0.010\n", "PCTCHRT -0.0022 0.022 -0.103 0.918 -0.045 0.040\n", "PCTYRRND -0.0022 0.006 -0.348 0.728 -0.014 0.010\n", "PERMINTE 0.1068 0.787 0.136 0.892 -1.436 1.650\n", "AVYRSEXP -0.0411 1.176 -0.035 0.972 -2.346 2.264\n", "PERMINTE:AVYRSEXP -0.0031 0.054 -0.057 0.954 -0.108 0.102\n", "AVSALK 0.0131 0.295 0.044 0.965 -0.566 0.592\n", "PERMINTE:AVSALK -0.0019 0.013 -0.145 0.885 -0.028 0.024\n", "AVYRSEXP:AVSALK 0.0008 0.020 0.038 0.970 -0.039 0.041\n", "PERMINTE:AVYRSEXP:AVSALK 5.978e-05 0.001 0.068 0.946 -0.002 0.002\n", "PERSPENK -0.3097 4.233 -0.073 0.942 -8.606 7.987\n", "PTRATIO 0.0096 0.919 0.010 0.992 -1.792 1.811\n", "PERSPENK:PTRATIO 0.0066 0.206 0.032 0.974 -0.397 0.410\n", "PCTAF -0.0143 0.474 -0.030 0.976 -0.944 0.916\n", "PERSPENK:PCTAF 0.0105 0.098 0.107 0.915 -0.182 0.203\n", "PTRATIO:PCTAF -0.0001 0.022 -0.005 0.996 -0.044 0.044\n", "PERSPENK:PTRATIO:PCTAF -0.0002 0.005 -0.051 0.959 -0.010 0.009\n", "============================================================================================\n" ] } ], "source": [ 
"mod1 = smf.glm(formula=formula, data=dta, family=sm.families.Binomial()).fit()\n", "print(mod1.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Finally, we define a custom function and use it as a data transformation inside the formula:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:39.354898Z", "iopub.status.busy": "2023-12-14T14:44:39.354653Z", "iopub.status.idle": "2023-12-14T14:44:39.387526Z", "shell.execute_reply": "2023-12-14T14:44:39.386661Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Generalized Linear Model Regression Results \n", "==============================================================================\n", "Dep. Variable: SUCCESS No. Observations: 303\n", "Model: GLM Df Residuals: 282\n", "Model Family: Binomial Df Model: 20\n", "Link Function: Logit Scale: 1.0000\n", "Method: IRLS Log-Likelihood: -127.33\n", "Date: Thu, 14 Dec 2023 Deviance: 8.5477\n", "Time: 14:44:39 Pearson chi2: 8.48\n", "No. Iterations: 4 Pseudo R-squ. 
(CS): 0.1115\n", "Covariance Type: nonrobust \n", "============================================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "--------------------------------------------------------------------------------------------\n", "Intercept 0.4037 25.036 0.016 0.987 -48.665 49.472\n", "double_it(LOWINC) -0.0102 0.005 -1.982 0.048 -0.020 -0.000\n", "PERASIAN 0.0159 0.017 0.910 0.363 -0.018 0.050\n", "PERBLACK -0.0198 0.020 -1.004 0.316 -0.058 0.019\n", "PERHISP -0.0096 0.010 -0.951 0.341 -0.029 0.010\n", "PCTCHRT -0.0022 0.022 -0.103 0.918 -0.045 0.040\n", "PCTYRRND -0.0022 0.006 -0.348 0.728 -0.014 0.010\n", "PERMINTE 0.1068 0.787 0.136 0.892 -1.436 1.650\n", "AVYRSEXP -0.0411 1.176 -0.035 0.972 -2.346 2.264\n", "PERMINTE:AVYRSEXP -0.0031 0.054 -0.057 0.954 -0.108 0.102\n", "AVSALK 0.0131 0.295 0.044 0.965 -0.566 0.592\n", "PERMINTE:AVSALK -0.0019 0.013 -0.145 0.885 -0.028 0.024\n", "AVYRSEXP:AVSALK 0.0008 0.020 0.038 0.970 -0.039 0.041\n", "PERMINTE:AVYRSEXP:AVSALK 5.978e-05 0.001 0.068 0.946 -0.002 0.002\n", "PERSPENK -0.3097 4.233 -0.073 0.942 -8.606 7.987\n", "PTRATIO 0.0096 0.919 0.010 0.992 -1.792 1.811\n", "PERSPENK:PTRATIO 0.0066 0.206 0.032 0.974 -0.397 0.410\n", "PCTAF -0.0143 0.474 -0.030 0.976 -0.944 0.916\n", "PERSPENK:PCTAF 0.0105 0.098 0.107 0.915 -0.182 0.203\n", "PTRATIO:PCTAF -0.0001 0.022 -0.005 0.996 -0.044 0.044\n", "PERSPENK:PTRATIO:PCTAF -0.0002 0.005 -0.051 0.959 -0.010 0.009\n", "============================================================================================\n" ] } ], "source": [ "def double_it(x):\n", "    \"\"\"Return ``x`` multiplied by two.\"\"\"\n", "    return 2 * x\n", "\n", "\n", "# The formula calls ``double_it`` by name on the LOWINC column.\n", "formula = \"SUCCESS ~ double_it(LOWINC) + PERASIAN + PERBLACK + PERHISP + PCTCHRT + \\\n", " PCTYRRND + PERMINTE*AVYRSEXP*AVSALK + PERSPENK*PTRATIO*PCTAF\"\n", "mod2 = smf.glm(\n", "    formula=formula,\n", "    data=dta,\n", "    family=sm.families.Binomial(),\n", ").fit()\n", "print(mod2.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As expected, 
the coefficient for ``double_it(LOWINC)`` in the second model is half the size of the ``LOWINC`` coefficient from the first model:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:39.390460Z", "iopub.status.busy": "2023-12-14T14:44:39.390212Z", "iopub.status.idle": "2023-12-14T14:44:39.395377Z", "shell.execute_reply": "2023-12-14T14:44:39.394579Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.02039598715475645\n", "-0.020395987154756174\n" ] } ], "source": [ "# Select the coefficients by label: integer keys on a label-indexed Series\n", "# (``params[1]``) are deprecated in pandas and emit a FutureWarning.\n", "print(mod1.params[\"LOWINC\"])\n", "print(mod2.params[\"double_it(LOWINC)\"] * 2)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 4 }