{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Discrete Choice Models Overview" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:51.971890Z", "iopub.status.busy": "2023-12-14T14:44:51.971627Z", "iopub.status.idle": "2023-12-14T14:44:53.485374Z", "shell.execute_reply": "2023-12-14T14:44:53.484437Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "import numpy as np\n", "import statsmodels.api as sm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Data\n", "\n", "Load data from Spector and Mazzeo (1980). Examples follow Greene's Econometric Analysis Ch. 21 (5th Edition)." ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.492371Z", "iopub.status.busy": "2023-12-14T14:44:53.489896Z", "iopub.status.idle": "2023-12-14T14:44:53.511908Z", "shell.execute_reply": "2023-12-14T14:44:53.510849Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "spector_data = sm.datasets.spector.load()\n", "spector_data.exog = sm.add_constant(spector_data.exog, prepend=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Inspect the data:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.518707Z", "iopub.status.busy": "2023-12-14T14:44:53.516395Z", "iopub.status.idle": "2023-12-14T14:44:53.538924Z", "shell.execute_reply": "2023-12-14T14:44:53.538043Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " GPA TUCE PSI const\n", "0 2.66 20.0 0.0 1.0\n", "1 2.89 22.0 0.0 1.0\n", "2 3.28 24.0 0.0 1.0\n", "3 2.92 12.0 0.0 1.0\n", "4 4.00 21.0 0.0 1.0\n", "0 0.0\n", "1 0.0\n", "2 0.0\n", "3 0.0\n", "4 1.0\n", "Name: GRADE, dtype: float64\n" ] } ], "source": [ "print(spector_data.exog.head())\n", "print(spector_data.endog.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Linear Probability Model (OLS)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.545985Z", "iopub.status.busy": "2023-12-14T14:44:53.543455Z", "iopub.status.idle": "2023-12-14T14:44:53.558639Z", "shell.execute_reply": "2023-12-14T14:44:53.557975Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parameters: GPA 0.463852\n", "TUCE 0.010495\n", "PSI 0.378555\n", "dtype: float64\n" ] } ], "source": [ "lpm_mod = sm.OLS(spector_data.endog, spector_data.exog)\n", "lpm_res = lpm_mod.fit()\n", "print(\"Parameters: \", lpm_res.params[:-1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Logit Model" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.564302Z", "iopub.status.busy": "2023-12-14T14:44:53.562344Z", "iopub.status.idle": "2023-12-14T14:44:53.594384Z", "shell.execute_reply": "2023-12-14T14:44:53.593270Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Parameters: GPA 2.826113\n", "TUCE 0.095158\n", "PSI 2.378688\n", "const -13.021347\n", "dtype: float64\n" ] } ], "source": [ "logit_mod = sm.Logit(spector_data.endog, spector_data.exog)\n", "logit_res = logit_mod.fit(disp=0)\n", "print(\"Parameters: \", logit_res.params)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Marginal Effects" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.602254Z", "iopub.status.busy": "2023-12-14T14:44:53.599682Z", "iopub.status.idle": "2023-12-14T14:44:53.622008Z", "shell.execute_reply": "2023-12-14T14:44:53.621140Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Logit Marginal Effects \n", "=====================================\n", "Dep. Variable: GRADE\n", "Method: dydx\n", "At: overall\n", "==============================================================================\n", " dy/dx std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "GPA 0.3626 0.109 3.313 0.001 0.148 0.577\n", "TUCE 0.0122 0.018 0.686 0.493 -0.023 0.047\n", "PSI 0.3052 0.092 3.304 0.001 0.124 0.486\n", "==============================================================================\n" ] } ], "source": [ "margeff = logit_res.get_margeff()\n", "print(margeff.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "As in all the discrete data models presented below, we can print a nice summary of results:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.630866Z", "iopub.status.busy": "2023-12-14T14:44:53.626564Z", "iopub.status.idle": "2023-12-14T14:44:53.666208Z", "shell.execute_reply": "2023-12-14T14:44:53.665445Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Logit Regression Results \n", "==============================================================================\n", "Dep. Variable: GRADE No. Observations: 32\n", "Model: Logit Df Residuals: 28\n", "Method: MLE Df Model: 3\n", "Date: Thu, 14 Dec 2023 Pseudo R-squ.: 0.3740\n", "Time: 14:44:53 Log-Likelihood: -12.890\n", "converged: True LL-Null: -20.592\n", "Covariance Type: nonrobust LLR p-value: 0.001502\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "GPA 2.8261 1.263 2.238 0.025 0.351 5.301\n", "TUCE 0.0952 0.142 0.672 0.501 -0.182 0.373\n", "PSI 2.3787 1.065 2.234 0.025 0.292 4.465\n", "const -13.0213 4.931 -2.641 0.008 -22.687 -3.356\n", "==============================================================================\n" ] } ], "source": [ "print(logit_res.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Probit Model " ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.671870Z", "iopub.status.busy": "2023-12-14T14:44:53.669695Z", "iopub.status.idle": "2023-12-14T14:44:53.698278Z", "shell.execute_reply": "2023-12-14T14:44:53.697576Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 0.400588\n", " Iterations 6\n", "Parameters: GPA 1.625810\n", "TUCE 0.051729\n", "PSI 1.426332\n", "const -7.452320\n", "dtype: float64\n", "Marginal effects: \n", " Probit Marginal Effects \n", "=====================================\n", "Dep. Variable: GRADE\n", "Method: dydx\n", "At: overall\n", "==============================================================================\n", " dy/dx std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "GPA 0.3608 0.113 3.182 0.001 0.139 0.583\n", "TUCE 0.0115 0.018 0.624 0.533 -0.025 0.048\n", "PSI 0.3165 0.090 3.508 0.000 0.140 0.493\n", "==============================================================================\n" ] } ], "source": [ "probit_mod = sm.Probit(spector_data.endog, spector_data.exog)\n", "probit_res = probit_mod.fit()\n", "probit_margeff = probit_res.get_margeff()\n", "print(\"Parameters: \", probit_res.params)\n", "print(\"Marginal effects: \")\n", "print(probit_margeff.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Multinomial Logit" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load data from the American National Election Studies:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.704550Z", "iopub.status.busy": "2023-12-14T14:44:53.702709Z", "iopub.status.idle": "2023-12-14T14:44:53.745675Z", "shell.execute_reply": "2023-12-14T14:44:53.744903Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "anes_data = sm.datasets.anes96.load()\n", "anes_exog = anes_data.exog\n", "anes_exog = sm.add_constant(anes_exog)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Inspect the data:" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.752687Z", "iopub.status.busy": "2023-12-14T14:44:53.750490Z", "iopub.status.idle": "2023-12-14T14:44:53.768712Z", "shell.execute_reply": "2023-12-14T14:44:53.767944Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " logpopul selfLR age educ income\n", "0 -2.302585 7.0 36.0 3.0 1.0\n", "1 5.247550 3.0 20.0 4.0 1.0\n", "2 3.437208 2.0 24.0 6.0 1.0\n", "3 4.420045 3.0 28.0 6.0 1.0\n", "4 6.461624 5.0 68.0 6.0 1.0\n", "0 6.0\n", "1 1.0\n", "2 1.0\n", "3 1.0\n", "4 0.0\n", "Name: PID, dtype: float64\n" ] } ], "source": [ "print(anes_data.exog.head())\n", "print(anes_data.endog.head())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit MNL model:" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.776548Z", "iopub.status.busy": "2023-12-14T14:44:53.774316Z", "iopub.status.idle": "2023-12-14T14:44:53.840275Z", "shell.execute_reply": "2023-12-14T14:44:53.839541Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 1.548647\n", " Iterations 7\n", " 0 1 2 3 4 5\n", "const -0.373402 -2.250913 -3.665584 -7.613843 -7.060478 -12.105751\n", "logpopul -0.011536 -0.088751 -0.105967 -0.091557 -0.093285 -0.140881\n", "selfLR 0.297714 0.391669 0.573451 1.278772 1.346962 2.070080\n", "age -0.024945 -0.022898 -0.014851 -0.008681 -0.017904 -0.009433\n", "educ 0.082491 0.181043 -0.007152 0.199828 0.216939 0.321926\n", "income 0.005197 0.047874 0.057575 0.084498 0.080958 0.108894\n" ] } ], "source": [ "mlogit_mod = sm.MNLogit(anes_data.endog, anes_exog)\n", "mlogit_res = mlogit_mod.fit()\n", "print(mlogit_res.params)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Poisson\n", "\n", "Load the Rand data. Note that this example is similar to Cameron and Trivedi's `Microeconometrics` Table 20.5, but it is slightly different because of minor changes in the data. " ] }, { "cell_type": "code", "execution_count": 12, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.847210Z", "iopub.status.busy": "2023-12-14T14:44:53.844919Z", "iopub.status.idle": "2023-12-14T14:44:53.939532Z", "shell.execute_reply": "2023-12-14T14:44:53.938154Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [], "source": [ "rand_data = sm.datasets.randhie.load()\n", "rand_exog = rand_data.exog\n", "rand_exog = sm.add_constant(rand_exog, prepend=False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit Poisson model: " ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:53.945421Z", "iopub.status.busy": "2023-12-14T14:44:53.943976Z", "iopub.status.idle": "2023-12-14T14:44:54.154348Z", "shell.execute_reply": "2023-12-14T14:44:54.153602Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 3.091609\n", " Iterations 6\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Poisson Regression Results \n", "==============================================================================\n", "Dep. Variable: mdvis No. Observations: 20190\n", "Model: Poisson Df Residuals: 20180\n", "Method: MLE Df Model: 9\n", "Date: Thu, 14 Dec 2023 Pseudo R-squ.: 0.06343\n", "Time: 14:44:54 Log-Likelihood: -62420.\n", "converged: True LL-Null: -66647.\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "lncoins -0.0525 0.003 -18.216 0.000 -0.058 -0.047\n", "idp -0.2471 0.011 -23.272 0.000 -0.268 -0.226\n", "lpi 0.0353 0.002 19.302 0.000 0.032 0.039\n", "fmde -0.0346 0.002 -21.439 0.000 -0.038 -0.031\n", "physlm 0.2717 0.012 22.200 0.000 0.248 0.296\n", "disea 0.0339 0.001 60.098 0.000 0.033 0.035\n", "hlthg -0.0126 0.009 -1.366 0.172 -0.031 0.005\n", "hlthf 0.0541 0.015 3.531 0.000 0.024 0.084\n", "hlthp 0.2061 0.026 7.843 0.000 0.155 0.258\n", "const 0.7004 0.011 62.741 0.000 0.678 0.722\n", "==============================================================================\n" ] } ], "source": [ "poisson_mod = sm.Poisson(rand_data.endog, rand_exog)\n", "poisson_res = poisson_mod.fit(method=\"newton\")\n", "print(poisson_res.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Negative Binomial\n", "\n", "The negative binomial model gives slightly different results. " ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:54.159195Z", "iopub.status.busy": "2023-12-14T14:44:54.158010Z", "iopub.status.idle": "2023-12-14T14:44:55.557994Z", "shell.execute_reply": "2023-12-14T14:44:55.557239Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/hostedtoolcache/Python/3.10.13/x64/lib/python3.10/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n", " warnings.warn(\"Maximum Likelihood optimization failed to \"\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " NegativeBinomial Regression Results \n", "==============================================================================\n", "Dep. Variable: mdvis No. Observations: 20190\n", "Model: NegativeBinomial Df Residuals: 20180\n", "Method: MLE Df Model: 9\n", "Date: Thu, 14 Dec 2023 Pseudo R-squ.: 0.01845\n", "Time: 14:44:55 Log-Likelihood: -43384.\n", "converged: False LL-Null: -44199.\n", "Covariance Type: nonrobust LLR p-value: 0.000\n", "==============================================================================\n", " coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "lncoins -0.0579 0.006 -9.515 0.000 -0.070 -0.046\n", "idp -0.2678 0.023 -11.802 0.000 -0.312 -0.223\n", "lpi 0.0412 0.004 9.938 0.000 0.033 0.049\n", "fmde -0.0381 0.003 -11.216 0.000 -0.045 -0.031\n", "physlm 0.2691 0.030 8.985 0.000 0.210 0.328\n", "disea 0.0382 0.001 26.080 0.000 0.035 0.041\n", "hlthg -0.0441 0.020 -2.201 0.028 -0.083 -0.005\n", "hlthf 0.0173 0.036 0.478 0.632 -0.054 0.088\n", "hlthp 0.1782 0.074 2.399 0.016 0.033 0.324\n", "const 0.6635 0.025 26.786 0.000 0.615 0.712\n", "alpha 1.2930 0.019 69.477 0.000 1.256 1.329\n", "==============================================================================\n" ] } ], "source": [ "mod_nbin = sm.NegativeBinomial(rand_data.endog, rand_exog)\n", "res_nbin = mod_nbin.fit(disp=False)\n", "print(res_nbin.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Alternative solvers\n", "\n", "The default method for fitting discrete data MLE models is Newton-Raphson. You can use other solvers by using the ``method`` argument: " ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:44:55.563733Z", "iopub.status.busy": "2023-12-14T14:44:55.562093Z", "iopub.status.idle": "2023-12-14T14:44:56.432856Z", "shell.execute_reply": "2023-12-14T14:44:56.431475Z" }, "jupyter": { "outputs_hidden": false } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Optimization terminated successfully.\n", " Current function value: 1.548647\n", " Iterations: 111\n", " Function evaluations: 117\n", " Gradient evaluations: 117\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " MNLogit Regression Results \n", "==============================================================================\n", "Dep. Variable: PID No. Observations: 944\n", "Model: MNLogit Df Residuals: 908\n", "Method: MLE Df Model: 30\n", "Date: Thu, 14 Dec 2023 Pseudo R-squ.: 0.1648\n", "Time: 14:44:56 Log-Likelihood: -1461.9\n", "converged: True LL-Null: -1750.3\n", "Covariance Type: nonrobust LLR p-value: 1.822e-102\n", "==============================================================================\n", " PID=1 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -0.3734 0.630 -0.593 0.553 -1.608 0.861\n", "logpopul -0.0115 0.034 -0.337 0.736 -0.079 0.056\n", "selfLR 0.2977 0.094 3.180 0.001 0.114 0.481\n", "age -0.0249 0.007 -3.823 0.000 -0.038 -0.012\n", "educ 0.0825 0.074 1.121 0.262 -0.062 0.227\n", "income 0.0052 0.018 0.295 0.768 -0.029 0.040\n", "------------------------------------------------------------------------------\n", " PID=2 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -2.2509 0.763 -2.949 0.003 -3.747 -0.755\n", "logpopul -0.0888 0.039 -2.266 0.023 -0.166 -0.012\n", "selfLR 0.3917 0.108 3.619 0.000 0.180 0.604\n", "age -0.0229 0.008 -2.893 0.004 -0.038 -0.007\n", "educ 0.1810 0.085 2.123 0.034 0.014 0.348\n", "income 0.0479 0.022 2.149 0.032 0.004 0.092\n", "------------------------------------------------------------------------------\n", " PID=3 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -3.6656 1.157 -3.169 0.002 -5.932 -1.399\n", "logpopul -0.1060 0.057 -1.858 0.063 -0.218 0.006\n", "selfLR 0.5734 0.159 3.617 0.000 0.263 0.884\n", "age -0.0149 0.011 -1.311 0.190 -0.037 0.007\n", "educ -0.0072 0.126 -0.057 0.955 -0.255 0.240\n", "income 0.0576 0.034 1.713 0.087 -0.008 0.123\n", "------------------------------------------------------------------------------\n", " PID=4 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -7.6139 0.958 -7.951 0.000 -9.491 -5.737\n", "logpopul -0.0916 0.044 -2.091 0.037 -0.177 -0.006\n", "selfLR 1.2788 0.129 9.921 0.000 1.026 1.531\n", "age -0.0087 0.008 -1.031 0.302 -0.025 0.008\n", "educ 0.1998 0.094 2.123 0.034 0.015 0.384\n", "income 0.0845 0.026 3.226 0.001 0.033 0.136\n", "------------------------------------------------------------------------------\n", " PID=5 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -7.0605 0.844 -8.362 0.000 -8.715 -5.406\n", "logpopul -0.0933 0.039 -2.371 0.018 -0.170 -0.016\n", "selfLR 1.3470 0.117 11.494 0.000 1.117 1.577\n", "age -0.0179 0.008 -2.352 0.019 -0.033 -0.003\n", "educ 0.2169 0.085 2.552 0.011 0.050 0.384\n", "income 0.0810 0.023 3.524 0.000 0.036 0.126\n", "------------------------------------------------------------------------------\n", " PID=6 coef std err z P>|z| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const -12.1058 1.060 -11.421 0.000 -14.183 -10.028\n", "logpopul -0.1409 0.042 -3.343 0.001 -0.223 -0.058\n", "selfLR 2.0701 0.143 14.435 0.000 1.789 2.351\n", "age -0.0094 0.008 -1.160 0.246 -0.025 0.007\n", "educ 0.3219 0.091 3.534 0.000 0.143 0.500\n", "income 0.1089 0.025 4.304 0.000 0.059 0.158\n", "==============================================================================\n" ] } ], "source": [ "mlogit_res = mlogit_mod.fit(method=\"bfgs\", maxiter=250)\n", "print(mlogit_res.summary())" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 4 }