{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Prediction (out of sample)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:38.366238Z", "iopub.status.busy": "2023-12-14T14:39:38.365784Z", "iopub.status.idle": "2023-12-14T14:39:39.436056Z", "shell.execute_reply": "2023-12-14T14:39:39.435150Z" } }, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:39.441690Z", "iopub.status.busy": "2023-12-14T14:39:39.440188Z", "iopub.status.idle": "2023-12-14T14:39:40.659007Z", "shell.execute_reply": "2023-12-14T14:39:40.658011Z" } }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "import statsmodels.api as sm\n", "\n", "plt.rc(\"figure\", figsize=(16, 8))\n", "plt.rc(\"font\", size=14)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Artificial data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:40.663039Z", "iopub.status.busy": "2023-12-14T14:39:40.662639Z", "iopub.status.idle": "2023-12-14T14:39:40.669832Z", "shell.execute_reply": "2023-12-14T14:39:40.669018Z" } }, "outputs": [], "source": [ "nsample = 50\n", "sig = 0.25\n", "x1 = np.linspace(0, 20, nsample)\n", "X = np.column_stack((x1, np.sin(x1), (x1 - 5) ** 2))\n", "X = sm.add_constant(X)\n", "beta = [5.0, 0.5, 0.5, -0.02]\n", "y_true = np.dot(X, beta)\n", "y = y_true + sig * np.random.normal(size=nsample)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Estimation " ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:40.673207Z", "iopub.status.busy": "2023-12-14T14:39:40.672810Z", "iopub.status.idle": "2023-12-14T14:39:40.690855Z", "shell.execute_reply": "2023-12-14T14:39:40.690107Z" } }, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " OLS Regression Results \n", "==============================================================================\n", "Dep. Variable: y R-squared: 0.985\n", "Model: OLS Adj. R-squared: 0.984\n", "Method: Least Squares F-statistic: 1018.\n", "Date: Thu, 14 Dec 2023 Prob (F-statistic): 4.80e-42\n", "Time: 14:39:40 Log-Likelihood: 3.0746\n", "No. Observations: 50 AIC: 1.851\n", "Df Residuals: 46 BIC: 9.499\n", "Df Model: 3 \n", "Covariance Type: nonrobust \n", "==============================================================================\n", " coef std err t P>|t| [0.025 0.975]\n", "------------------------------------------------------------------------------\n", "const 4.9317 0.081 60.993 0.000 4.769 5.094\n", "x1 0.5066 0.012 40.625 0.000 0.482 0.532\n", "x2 0.5504 0.049 11.227 0.000 0.452 0.649\n", "x3 -0.0206 0.001 -18.810 0.000 -0.023 -0.018\n", "==============================================================================\n", "Omnibus: 2.839 Durbin-Watson: 2.081\n", "Prob(Omnibus): 0.242 Jarque-Bera (JB): 2.094\n", "Skew: -0.492 Prob(JB): 0.351\n", "Kurtosis: 3.190 Cond. No. 
221.\n", "==============================================================================\n", "\n", "Notes:\n", "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n" ] } ], "source": [ "olsmod = sm.OLS(y, X)\n", "olsres = olsmod.fit()\n", "print(olsres.summary())" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## In-sample prediction" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:40.782977Z", "iopub.status.busy": "2023-12-14T14:39:40.782621Z", "iopub.status.idle": "2023-12-14T14:39:40.787813Z", "shell.execute_reply": "2023-12-14T14:39:40.787046Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[ 4.41682151 4.92268515 5.38579485 5.77615583 6.07459828 6.27592688\n", " 6.38977441 6.43901906 6.45602567 6.47732824 6.53762749 6.66408968\n", " 6.8718832 7.1616865 7.51957661 7.91931663 8.32666638 8.705008\n", " 9.02136211 9.25180625 9.3854053 9.42600804 9.39161484 9.31142024\n", " 9.22101636 9.15654525 9.14876081 9.21797603 9.37072562 9.59869241\n", " 9.88007377 10.18316377 10.47156237 10.71015582 10.87088578 10.9373592\n", " 10.90754167 10.79409166 10.62228076 10.42584165 10.2414259 10.10258223\n", " 10.03424454 10.04863535 10.14325767 10.30130525 10.49442381 10.68736953\n", " 10.84379957 10.93224304]\n" ] } ], "source": [ "ypred = olsres.predict(X)\n", "print(ypred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create a new sample of explanatory variables Xnew, predict and plot" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:40.790712Z", "iopub.status.busy": "2023-12-14T14:39:40.790187Z", "iopub.status.idle": "2023-12-14T14:39:40.795393Z", "shell.execute_reply": "2023-12-14T14:39:40.794565Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[10.91763232 10.75840089 10.47613227 10.12001251 9.75478779 
9.44491225\n", " 9.23876741 9.15681664 9.18659505 9.28576134]\n" ] } ], "source": [ "x1n = np.linspace(20.5, 25, 10)\n", "Xnew = np.column_stack((x1n, np.sin(x1n), (x1n - 5) ** 2))\n", "Xnew = sm.add_constant(Xnew)\n", "ynewpred = olsres.predict(Xnew) # predict out of sample\n", "print(ynewpred)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Plot comparison" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:40.798070Z", "iopub.status.busy": "2023-12-14T14:39:40.797744Z", "iopub.status.idle": "2023-12-14T14:39:41.087194Z", "shell.execute_reply": "2023-12-14T14:39:41.086371Z" } }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "\n", "fig, ax = plt.subplots()\n", "ax.plot(x1, y, \"o\", label=\"Data\")\n", "ax.plot(x1, y_true, \"b-\", label=\"True\")\n", "ax.plot(np.hstack((x1, x1n)), np.hstack((ypred, ynewpred)), \"r\", label=\"OLS prediction\")\n", "ax.legend(loc=\"best\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Predicting with Formulas" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Using formulas can make both estimation and prediction a lot easier" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:41.090551Z", "iopub.status.busy": "2023-12-14T14:39:41.090071Z", "iopub.status.idle": "2023-12-14T14:39:41.097714Z", "shell.execute_reply": "2023-12-14T14:39:41.097139Z" } }, "outputs": [], "source": [ "from statsmodels.formula.api import ols\n", "\n", "data = {\"x1\": x1, \"y\": y}\n", "\n", "res = ols(\"y ~ x1 + np.sin(x1) + I((x1-5)**2)\", data=data).fit()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We use the `I` to indicate use of the Identity transform. 
I.e., we do not want any expansion magic from using `**2`." ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:41.100558Z", "iopub.status.busy": "2023-12-14T14:39:41.100203Z", "iopub.status.idle": "2023-12-14T14:39:41.106958Z", "shell.execute_reply": "2023-12-14T14:39:41.105858Z" } }, "outputs": [ { "data": { "text/plain": [ "Intercept 4.931706\n", "x1 0.506602\n", "np.sin(x1) 0.550372\n", "I((x1 - 5) ** 2) -0.020595\n", "dtype: float64" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.params" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now we only have to pass the single variable and we get the transformed right-hand side variables automatically." ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2023-12-14T14:39:41.110438Z", "iopub.status.busy": "2023-12-14T14:39:41.109890Z", "iopub.status.idle": "2023-12-14T14:39:41.117999Z", "shell.execute_reply": "2023-12-14T14:39:41.117381Z" } }, "outputs": [ { "data": { "text/plain": [ "0 10.917632\n", "1 10.758401\n", "2 10.476132\n", "3 10.120013\n", "4 9.754788\n", "5 9.444912\n", "6 9.238767\n", "7 9.156817\n", "8 9.186595\n", "9 9.285761\n", "dtype: float64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "res.predict(exog=dict(x1=x1n))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.13" } }, "nbformat": 4, "nbformat_minor": 4 }