import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
colors = sns.color_palette()
We have some data $x$ and $y$, and we want to use $x$ to predict $y$. First, we'll look at fitting a couple of models. We'll generate some random data and fit cubic smoothing splines to it with different smoothing parameters, which determine how many knots the spline uses.
# Generate data
np.random.seed(0)
x = np.linspace(-2, 2, 100)  # already sorted, as UnivariateSpline requires
y = 2.9 * np.sin(1.5 * x) + 2*np.random.normal(size=100)
# View data
plt.scatter(x, y)
plt.show()
# fit a cubic spline through every point
from scipy.interpolate import UnivariateSpline
f1 = UnivariateSpline(x, y, s=0)
plt.scatter(x, y)
plt.plot(x, f1(x), color=colors[1])
plt.show()
# allow more error in the fitting
f2 = UnivariateSpline(x, y, s=1000)
plt.scatter(x, y)
plt.plot(x, f1(x), color=colors[1])
plt.plot(x, f2(x), color=colors[2])
plt.show()
The parameter s is a hyperparameter. It is not fit during training. Instead, it is set a priori.
But we should still try to choose it optimally.
Which value for s should we choose (0 or 1,000)? We want the one that will work best on new data, i.e., out of sample. To determine this, we can make part of our existing data "out of sample," by fitting on only a subset of the data.
We'll train on a random subset and then test on the remainder.
# split into train and test
train_index = np.random.choice(range(100), 80, replace=False)
train_index = np.sort(train_index)
test_index = [i for i in range(100) if i not in train_index]
X_train = x[train_index]
y_train = y[train_index]
X_test = x[test_index]
y_test = y[test_index]
# train on the training data
f1 = UnivariateSpline(X_train, y_train, s=0)
f2 = UnivariateSpline(X_train, y_train, s=1000)
# test on the test data
y1 = f1(X_test)
y2 = f2(X_test)
MSE1 = ((y1-y_test)**2).mean()
MSE2 = ((y2-y_test)**2).mean()
Rsquared1 = 1 - MSE1 / ((y_test-y_test.mean())**2).mean()
Rsquared2 = 1 - MSE2 / ((y_test-y_test.mean())**2).mean()
print(f"MSE for s=0 is {MSE1:.2f} and MSE for s=1000 is {MSE2:.2f}")
print(f"Rsquared for s=0 is {Rsquared1:.2f} and Rsquared for s=1000 is {Rsquared2:.2f}")
MSE for s=0 is 10.39 and MSE for s=1000 is 3.25
Rsquared for s=0 is -0.14 and Rsquared for s=1000 is 0.64
What performance can we expect the model to deliver in the future? Is the MSE or Rsquared we calculated for the optimal s an unbiased estimate of performance?
No. The maximum over models of performance is an upward biased estimate of future performance.
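A quick toy simulation (not part of the original analysis) illustrates the bias: if several candidate models all have the same true score but each measured score is noisy, always reporting the best measured score overstates true performance.

```python
import numpy as np

# 5 candidate models, true out-of-sample score 0 for all of them,
# each measured score contaminated by standard normal noise.
rng = np.random.default_rng(0)
measured = rng.normal(loc=0.0, scale=1.0, size=(10_000, 5))

# Reporting the maximum measured score is biased upward.
best_of_five = measured.max(axis=1)
print(best_of_five.mean())  # well above the true score of 0 (about 1.16)
```

The more candidate models we compare, the larger this selection bias becomes, which is why the winning model needs fresh data for an honest performance estimate.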
To estimate future performance, we need to evaluate the model on new data - data that was not used either in training or in testing.
Split the data into three parts.
# split into train, validate, and test
train_index = np.random.choice(range(100), 50, replace=False)
train_index = np.sort(train_index)
remainder = [i for i in range(100) if i not in train_index]
validate_index = np.random.choice(remainder, 30, replace=False)
validate_index = np.sort(validate_index)
test_index = [i for i in remainder if i not in validate_index]
X_train = x[train_index]
X_validate = x[validate_index]
X_test = x[test_index]
y_train = y[train_index]
y_validate = y[validate_index]
y_test = y[test_index]
# train on the training data
f1 = UnivariateSpline(X_train, y_train, s=0)
f2 = UnivariateSpline(X_train, y_train, s=1000)
# evaluate on the validation data
y1 = f1(X_validate)
y2 = f2(X_validate)
MSE1 = ((y1-y_validate)**2).mean()
MSE2 = ((y2-y_validate)**2).mean()
s_star = 0 if MSE1 < MSE2 else 1000
f = f1 if MSE1 < MSE2 else f2
print(f"s_star = {s_star}")
s_star = 1000
# test on the test data
y_pred = f(X_test)
MSE = ((y_pred - y_test)**2).mean()
Instead of splitting the 80 non-test data points into Train and Validate, cross-validation does the following.
Split the 80 points into, for example, 5 randomly chosen subsets $A, B, C, D, E$. For each model, train on four of the subsets and validate on the fifth, rotating so that each subset serves once as the validation set. This gives 5 validation scores per model.
Average the 5 validation scores for each model. Choose the model with the highest average validation score. Then test it.
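The five-fold procedure can be sketched with scikit-learn's `KFold`. This is an illustrative sketch that regenerates data of the same form as above and reuses the two candidate values s=0 and s=1000:

```python
import numpy as np
from scipy.interpolate import UnivariateSpline
from sklearn.model_selection import KFold

# Regenerate data of the same form as above
np.random.seed(0)
x = np.linspace(-2, 2, 100)
y = 2.9 * np.sin(1.5 * x) + 2 * np.random.normal(size=100)

kf = KFold(n_splits=5, shuffle=True, random_state=0)
cv_mse = {0: [], 1000: []}
for train_idx, val_idx in kf.split(x):
    train_idx = np.sort(train_idx)  # UnivariateSpline needs increasing x
    for s in cv_mse:
        f = UnivariateSpline(x[train_idx], y[train_idx], s=s)
        cv_mse[s].append(((f(x[val_idx]) - y[val_idx]) ** 2).mean())

# Choose the s with the lowest average validation MSE
s_star = min(cv_mse, key=lambda s: np.mean(cv_mse[s]))
print(s_star)  # the smoother fit wins on this data
```

Each candidate value of s gets five validation scores, and the average decides; no single random split determines the choice.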
# Generate 50 features (predictors)
x = np.random.normal(size=(100, 50))
X = pd.DataFrame(x, columns = [f"feature_{i}" for i in range(50)])
# Generate target
y = X["feature_0"] + np.random.normal(size=100)
# Split into train and test (cross-validate on train)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2,
random_state=0
)
OLS minimizes the mean squared error. Lasso is an example of penalized linear regression. It chooses coefficients to minimize
$$\frac{1}{2}\text{MSE} + \text{penalty} \times \sum_{j=1}^p |\beta_j|$$
where $p$ is the number of features. The penalty is a hyperparameter. It is called "alpha" (not the regression intercept).
The larger the penalty, the smaller the estimated betas will be. For sufficiently large alpha, all of the estimated betas will be exactly zero.
Lasso is a way to do "automatic feature selection." Features are variables used to predict. Dropping variables with zero lasso betas may be a reasonable thing to do in some settings.
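A toy sketch (synthetic data, arbitrary alpha values, not from the original analysis) shows both effects: a moderate alpha keeps only a few features, and a large enough alpha zeroes everything.

```python
import numpy as np
from sklearn.linear_model import Lasso

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 50))
y = X[:, 0] + rng.normal(size=100)  # only feature 0 actually matters

small = Lasso(alpha=0.1).fit(X, y)
large = Lasso(alpha=100.0).fit(X, y)

print((small.coef_ != 0).sum())  # a handful of nonzero coefficients
print((large.coef_ != 0).sum())  # 0: every coefficient shrunk to exactly zero
```

At the moderate alpha, the coefficient on the one truly relevant feature survives while most noise features are dropped, which is the "automatic feature selection" at work.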
In ridge regression, the coefficients are chosen to minimize $$\text{SSE} + \text{penalty} \times \sum_{j=1}^p \beta_j^2$$ Again, the penalty is called "alpha."
Lasso will often force some coefficients to zero. Ridge regression is unlikely to do so, because the marginal penalty goes to zero as the coefficient goes to zero.
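The contrast is easy to see on synthetic data (a toy sketch; the alpha values are arbitrary):

```python
import numpy as np
from sklearn.linear_model import Lasso, Ridge

rng = np.random.default_rng(1)
X = rng.normal(size=(100, 50))
y = X[:, 0] + rng.normal(size=100)

lasso = Lasso(alpha=0.1).fit(X, y)
ridge = Ridge(alpha=10.0).fit(X, y)

print((lasso.coef_ == 0).sum())  # many coefficients exactly zero
print((ridge.coef_ == 0).sum())  # typically none exactly zero
```

Ridge shrinks every coefficient toward zero but almost never reaches zero exactly, so it does not perform feature selection the way lasso does.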
An elastic net combines the lasso and ridge penalties. It chooses coefficients to minimize $$\frac{1}{2} \text{MSE} + \text{alpha} \times \text{l1\_ratio} \times \sum_{j=1}^p |\beta_j| + \frac{1}{2} \times \text{alpha} \times (1 - \text{l1\_ratio}) \times \sum_{j=1}^p \beta_j^2$$
The sum of absolute values penalty is called an $\ell^1$ penalty. The sum of squares is called an $\ell^2$ penalty.
scikit-learn's GridSearchCV will do cross validation for hyperparameters on a grid that you choose.
After cross validating, it chooses the best hyperparameter combination and refits the model on all of the training data. It is then ready to test on the testing data.
We will do lasso regression as an example.
alphas = (0.01, 0.1, 1, 10)
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(
Lasso(),
param_grid = {"alpha": alphas}
)
cv.fit(X_train, y_train)
print(f"best alpha is {cv.best_params_}")
print(f"score on the test data is {cv.score(X_test, y_test)}")
best alpha is {'alpha': 0.1}
score on the test data is 0.5223433808001436
What does the score mean?
Let's fit the model directly on the training data, using the best hyperparameter value, and compute the R-squared on the test data.
model = Lasso(alpha=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
MSE = ((y_pred - y_test)**2).mean()
Rsquared = 1 - MSE / ((y_test - y_test.mean())**2).mean()
Rsquared
0.5223433808001436
Another way to get the R-squared on the test data from the fitted model:
model.score(X_test, y_test)
0.5223433808001436
And we can see the regression coefficients for the 50 features.
model.coef_
array([ 8.07659090e-01, 1.28241776e-02, -8.16207260e-02, -0.00000000e+00, 0.00000000e+00, -0.00000000e+00, 0.00000000e+00, 2.49309321e-02, -0.00000000e+00, 0.00000000e+00, 5.52442637e-02, 4.31431610e-02, -4.74039660e-02, -0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.51699519e-02, 0.00000000e+00, -0.00000000e+00, -3.15119743e-02, 9.51707801e-02, -7.81431547e-02, -0.00000000e+00, 2.64459040e-03, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, -0.00000000e+00, -1.14012069e-01, -3.21462206e-03, -7.80113187e-02, -2.90951829e-03, 6.41555559e-02, 0.00000000e+00, 0.00000000e+00, -0.00000000e+00, 1.58550020e-02, -1.42960471e-01, 1.35194834e-03, 1.38864658e-01, 0.00000000e+00, 1.54431721e-01, 0.00000000e+00, -5.51621435e-04, -0.00000000e+00, -0.00000000e+00, 1.27830398e-01, -0.00000000e+00, 0.00000000e+00, 0.00000000e+00])
model = Lasso(alpha=0.001)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(f"R-squared on the test data is {model.score(X_test, y_test)}")
model.coef_
R-squared on the test data is -0.11831357095359651
array([ 7.44095387e-01, 1.50574525e-01, -2.16406975e-01, -9.37848965e-02, -7.88442951e-03, -2.61448749e-01, -2.23115013e-01, 1.41213701e-01, -2.05727838e-01, 8.69720498e-02, 1.96430668e-01, 1.90863130e-01, -2.04630578e-01, 7.88961108e-02, 2.68283890e-01, 2.58803863e-03, 1.51014325e-01, -3.57379978e-02, 1.46412081e-02, -2.86623466e-01, 1.09352789e-01, -2.97383243e-01, -1.52117595e-01, 9.61046088e-03, 1.89262578e-01, 1.06889402e-01, 9.61902719e-02, 9.97602420e-03, -2.84166174e-01, -1.16065700e-01, -5.89673877e-02, -8.47104939e-02, 1.45569709e-01, -4.19782896e-02, 2.51649095e-01, -1.31626275e-01, 1.98304824e-01, -3.30684848e-01, 3.12918987e-01, 2.82472769e-01, -4.21487651e-02, 3.52603769e-01, -6.46157198e-04, 7.14541984e-02, -9.34123556e-02, -2.49428890e-01, 4.07065003e-01, -2.84174664e-01, -7.23148234e-03, 1.81110210e-01])
alphas = np.logspace(-3, 0, 20)
train_scores = []
test_scores = []
for alpha in alphas:
model = Lasso(alpha=alpha)
model.fit(X_train, y_train)
train_scores.append(model.score(X_train, y_train))
test_scores.append(model.score(X_test, y_test))
plt.plot(alphas, train_scores, label=r"in-sample $R^2$ (train data)")
plt.plot(alphas, test_scores, label=r"out-of-sample $R^2$ (test data)")
plt.xscale("log")  # the alphas are log-spaced
plt.xlabel("alpha (higher means more penalized)")
plt.legend()
plt.show()
Run GridSearchCV for ridge regression and find the out-of-sample $R^2$ for the best alpha value.
Import Ridge as
from sklearn.linear_model import Ridge
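One possible solution sketch, regenerating the same kind of synthetic data as above (the alpha grid here is an arbitrary choice):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, train_test_split

# Regenerate data of the same form as above
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(100, 50)),
                 columns=[f"feature_{i}" for i in range(50)])
y = X["feature_0"] + rng.normal(size=100)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Cross-validate alpha on the training data, then score out of sample
cv = GridSearchCV(Ridge(), param_grid={"alpha": np.logspace(-2, 3, 20)})
cv.fit(X_train, y_train)
print(f"best alpha is {cv.best_params_}")
print(f"out-of-sample R-squared is {cv.score(X_test, y_test)}")
```

As with the lasso example, GridSearchCV refits Ridge on all of the training data at the best alpha before scoring on the test set.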
This provides an example of searching over a two-dimensional grid of hyperparameter values.
from sklearn.linear_model import ElasticNet
param_grid = {
"alpha": [0.01, 0.1, 1, 10],
"l1_ratio": [0.1, 0.25, 0.5, 0.75, 0.9]
}
cv = GridSearchCV(
ElasticNet(),
param_grid = param_grid
)
cv.fit(X_train, y_train)
print(f"best parameters are {cv.best_params_}")
print(f"score on the test data is {cv.score(X_test, y_test)}")
best parameters are {'alpha': 0.1, 'l1_ratio': 0.75}
score on the test data is 0.49697698213848684
What are the regression coefficients when the model with the best hyperparameters is fit on the training data?