Overview¶
- Artificial dataset
- Lasso
- Out of sample predictions
- Diebold-Mariano test
- Validation and cross-validation
- Ridge
- California housing data on Julius
Some toy data¶
- 100 predictive variables (features) and 1000 observations
- Only 10 of the 100 features are actually predictive
- The other 90 features are noise
- The 10 predictive features are the first 10 features
- The left-hand side (target) variable is a linear combination of the 10 predictive features plus noise.
import numpy as np
np.random.seed(0) # just so we all get the same results
X = np.random.normal(size=(1000, 100))
y = X[:, :10].sum(axis=1) + 10 * np.random.normal(size=1000)
# Try OLS
import statsmodels.api as sm
import pandas as pd
model = sm.OLS(endog=y, exog=sm.add_constant(X))
result = model.fit()
tvalues = pd.Series(result.tvalues)
# features with t-statistics greater than 2 (nominally significant)
tvalues[tvalues > 2]
1     3.893511
2     3.977730
3     2.846583
4     2.837924
5     3.382157
6     3.654197
7     2.517688
8     3.355748
9     2.768420
10    2.926688
33    2.259072
35    2.481969
95    2.350388
dtype: float64
# coefficients of first 20 features
result.params[1:21]
array([ 1.28003538,  1.29619223,  0.90471389,  0.91860977,  1.1069583 ,
        1.22942272,  0.82773064,  1.14865406,  0.89821728,  0.9811078 ,
        0.05494361, -0.30977471, -0.81276878, -0.18962315,  0.15576422,
       -0.19575052, -0.03620061,  0.43368138,  0.0599448 ,  0.20188705])
Goal of machine learning¶
- The goal is to make predictions on new data.
- The test of a model is how well it predicts out of sample.
- Can we predict with OLS?
Diebold-Mariano test¶
- Compute the mean squared prediction error of the model (a loss function other than squared error can also be used)
- Compute the mean squared prediction error of a benchmark forecast
- Do a one-sided $t$ test of the null hypothesis that the model is not better than the benchmark forecast
- Hope to reject the null hypothesis
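For reference, here is the statistic with squared-error loss (a sketch, ignoring the long-run variance correction that matters when forecast errors are autocorrelated). Let $e_{m,t}$ and $e_{b,t}$ be the model and benchmark forecast errors, and let $d_t = e_{m,t}^2 - e_{b,t}^2$ be the loss differential over the $T$ test observations. Then
$$DM = \frac{\bar{d}}{\sqrt{\hat{\sigma}_d^2 / T}}$$
is approximately standard normal under the null of equal forecast accuracy, so a significantly negative statistic (small $p$-value in the one-sided test) indicates that the model's losses are smaller than the benchmark's.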
# some new (test) data from the same data generating process
X_test = np.random.normal(size=(200, 100))
y_test = X_test[:, :10].sum(axis=1) + 10 * np.random.normal(size=200)
# Diebold-Mariano test
# !pip install dieboldmariano
from dieboldmariano import dm_test
# compare to a benchmark of zero
benchmark_predict = np.zeros_like(y_test)
model_predict = result.predict(sm.add_constant(X_test))
dm_test(y_test, model_predict, benchmark_predict, one_sided=True)
(np.float64(-0.5296622926168975), np.float64(0.29846825225075657))
# compare to a benchmark of the out-of-sample target mean
# unfair (it uses out-of-sample information) but common
benchmark_predict = np.repeat(y_test.mean(), len(y_test))
dm_test(y_test, model_predict, benchmark_predict, one_sided=True)
(np.float64(-0.5109180851771397), np.float64(0.30498738035359785))
Lasso¶
- OLS minimizes the mean squared error. Lasso is an example of penalized linear regression. It chooses coefficients to minimize
$$\frac{1}{2}\text{MSE} + \text{penalty} \times \sum_{j=1}^p |\beta_j|$$
The penalty is a hyperparameter. In scikit-learn it is called "alpha" (not to be confused with the regression intercept).
The larger the penalty, the smaller the estimated betas will be. For a large enough alpha, all of the estimated betas are zero.
Lasso is a way to do "automatic feature selection." Features are the variables used to predict. Dropping variables with zero lasso betas may be a reasonable thing to do in some settings.
scikit-learn's Lasso includes an intercept by default; we can turn it off. The intercept is returned as .intercept_, and the other coefficients as .coef_.
# fit Lasso
from sklearn.linear_model import Lasso
model = Lasso(alpha=1)
model.fit(X, y)
Lasso(alpha=1)
# first twenty coefficients (excluding intercept)
model.coef_[:20]
array([ 0.10226887,  0.16364445,  0.        ,  0.10361974,  0.03833025,
        0.04410687,  0.        ,  0.08786848,  0.        ,  0.        ,
        0.        , -0.        , -0.        , -0.        ,  0.        ,
       -0.        ,  0.        ,  0.        , -0.        ,  0.        ])
Out of sample $R^2$¶
$$R^2 = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2}$$
- where $\bar{y}$ is the mean of the out-of-sample data.
- This is returned by the score method of a model.
# compute model score on test data
model.score(X_test, y_test)
0.002859434185740306
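As a sanity check, the same quantity can be computed by hand. A minimal sketch, assuming the fitted model and the test data from above (it should reproduce the score):
# out-of-sample R2 computed directly from its definition
y_hat = model.predict(X_test)
ss_res = ((y_test - y_hat) ** 2).sum()
ss_tot = ((y_test - y_test.mean()) ** 2).sum()
1 - ss_res / ss_tot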
# compute out-of-sample R2's for a range of alphas
alphas = np.linspace(0.001, 1, 20)
scores = []
for alpha in alphas:
model = Lasso(alpha=alpha)
model.fit(X, y)
scores.append(model.score(X_test, y_test))
import matplotlib.pyplot as plt
plt.plot(alphas, scores)
[Figure: out-of-sample R² versus alpha]
# Diebold-Mariano test
best_alpha = alphas[np.argmax(scores)]
model = Lasso(alpha=best_alpha)
model.fit(X, y)
model_predict = model.predict(X_test)
dm_test(y_test, model_predict, benchmark_predict, one_sided=True)
(np.float64(-2.0765971405984662), np.float64(0.019561588615378156))
Train, Validate, and Test¶
Split the data into three parts.
- Train on the training data.
- Evaluate performance on the validation data (playing the role of the test data in our first example). Choose the best performing model.
- Test the performance of the chosen model on the test data (held out from training and validation).
# full hypothetical sample
X = np.random.normal(size=(1500, 100))
y = 100 + X[:, :10].sum(axis=1) + 10 * np.random.normal(size=1500)
# randomly split into training and test samples
from sklearn.model_selection import train_test_split
np.random.seed(0) # just so we all get the same results
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=300)
X_train0, X_val, y_train0, y_val = train_test_split(X_train, y_train, test_size=200)
# compute out-of-sample R2's on validation data for a range of alphas
alphas = np.linspace(0.001, 1, 20)
scores = []
for alpha in alphas:
model = Lasso(alpha=alpha)
model.fit(X_train0, y_train0)
scores.append(model.score(X_val, y_val))
import matplotlib.pyplot as plt
plt.plot(alphas, scores)
[Figure: validation R² versus alpha]
# Diebold-Mariano test
benchmark_predict = np.repeat(y_test.mean(), len(y_test))
best_alpha = alphas[np.argmax(scores)]
model = Lasso(alpha=best_alpha)
model.fit(X_train, y_train) # fit using all data other than test data
model_predict = model.predict(X_test)
dm_test(y_test, model_predict, benchmark_predict, one_sided=True)
(np.float64(-3.3433227041855127), np.float64(0.00046659134228845705))
Cross Validation¶
- Instead of splitting non-test observations into Train and Validate, cross-validation does the following.
- Split the 1200 points into, for example, 5 randomly chosen subsets $A, B, C, D$, and $E$.
- Use $A \cup B \cup C \cup D$ as training data and validate on $E$.
- Then use $B \cup C \cup D \cup E$ as training data and validate on $A$.
- Then, ..., until we have trained and validated 5 times.
- Average the 5 validation scores for each model. Choose the model with the highest average validation score. Then test it.
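A minimal sketch of this procedure using scikit-learn's cross_val_score, assuming the X_train, y_train, and alphas defined above:
# 5-fold cross-validation over the alpha grid
from sklearn.model_selection import cross_val_score
cv_scores = []
for alpha in alphas:
    model = Lasso(alpha=alpha)
    # mean R2 across the 5 validation folds
    cv_scores.append(cross_val_score(model, X_train, y_train, cv=5).mean())
best_alpha = alphas[np.argmax(cv_scores)]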
Ridge regression¶
In ridge regression, the coefficients are chosen to minimize $$\text{SSE} + \text{penalty} \times \sum_{j=1}^p \beta_j^2$$ Again, the penalty is called "alpha."
Lasso will often force some coefficients to zero. Ridge regression is unlikely to do so, because the marginal penalty goes to zero as the coefficient goes to zero.
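To see the contrast, here is a quick sketch fitting Ridge on the training data from above (the alpha value is only illustrative):
# fit Ridge and look at the first 20 coefficients
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=10)
ridge.fit(X_train, y_train)
ridge.coef_[:20]  # shrunken toward zero but typically not exactly zero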
Exercise¶
- Run train/validate using Ridge for a grid of alpha values.
- Run a Diebold-Mariano test of the best Ridge model.
Ask Julius¶
- to get the California house price data and describe it
- to build Lasso and Ridge models to predict house prices using some of the features
- to run train/test split and use cross-validation to find the best alpha values
- to report the scores on the test data for the best alpha values