import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
Logistic regression chooses the parameters $\beta$ to maximize the fit of $$\frac{1}{1+e^{-\sum_i \beta_i x_i}}$$ to the dummy (0/1) variable.
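With an $\ell^1$ penalty, scikit-learn's LogisticRegression solves a penalized maximum-likelihood problem (per the scikit-learn documentation; the intercept is omitted here for brevity):
$$\min_{\beta}\ \|\beta\|_1 + C \sum_{i=1}^n \log\left(1 + e^{-y_i \beta^\top x_i}\right),$$
with labels coded $y_i \in \{-1, 1\}$. Note that larger $C$ means a *weaker* penalty, so smaller $C$ forces more coefficients to zero.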
# Generate 100 observations of 50 features (predictors)
np.random.seed(0)
X = pd.DataFrame(
np.random.normal(size=(100, 50))
)
# Some noise
noise = np.random.normal(size=100)
# Regression target
yr = X[0] + noise
# Binary target
yb = 1*(yr > 0)
# Multi-class target
ym = 1*(yr > -1) + 1*(yr > 1)
from sklearn.model_selection import train_test_split
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
X, yb,
test_size=0.2,
random_state=0
)
Xm_train, Xm_test, ym_train, ym_test = train_test_split(
X, ym,
test_size=0.2,
random_state=0
)
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
X, yr,
test_size=0.2,
random_state=0
)
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression(penalty="l1", C=0.1, solver="saga")
model1.fit(Xb_train, yb_train)
score_train = model1.score(Xb_train, yb_train)
score_test = model1.score(Xb_test, yb_test)
print("fraction of correct predictions on training data is", score_train)
print("fraction of correct predictions on test data is", score_test)
model1.coef_
fraction of correct predictions on training data is 0.775
fraction of correct predictions on test data is 0.75
array([[0.81212582, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ]])
model2 = LogisticRegression(penalty="l1", C=10, solver="saga")
model2.fit(Xb_train, yb_train)
score_train = model2.score(Xb_train, yb_train)
score_test = model2.score(Xb_test, yb_test)
print("fraction of correct predictions on training data is", score_train)
print("fraction of correct predictions on test data is", score_test)
model2.coef_
fraction of correct predictions on training data is 1.0
fraction of correct predictions on test data is 0.75
ConvergenceWarning: The max_iter was reached which means the coef_ did not converge
array([[ 4.39484914, 0.70602202, -0.00976995, -0.95366401, -0.56172954, 0.70037444, -0.68074406, -0.48295366, -0.29018539, 0.00463586, 0.55071787, -0.29725181, -1.15324176, 1.49022276, -0.1683152 , -0.14891732, -0.33110891, -0.07961488, -0.58525715, 0.08778821, -1.06723425, 0.13855838, 0.25251624, -0.3314787 , -1.05678255, -0.13192985, 0.03917435, 0.33361992, -1.19925872, 0.69104033, 0.89930758, 0.96306877, -0.64733895, -0.93188081, -0.39631996, 0.26691888, -0.04563392, -0.32488566, 0.87030211, 0.05369104, 1.91678787, 0.89243369, -0.43012215, 0.75254286, -1.15978466, -0.76464907, -0.7606832 , -0.04347331, -0.2423112 , -0.1362384 ]])
With the weaker penalty (C=10), every coefficient is nonzero and the model fits the training data perfectly, yet it predicts no better out of sample than the sparse model above.
model1.predict(Xb_test)
array([0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0])
model1.predict_proba(Xb_test)
array([[0.79577461, 0.20422539],
       [0.50674884, 0.49325116],
       [0.20686297, 0.79313703],
       [0.7164633 , 0.2835367 ],
       [0.6315142 , 0.3684858 ],
       [0.37458425, 0.62541575],
       [0.2767514 , 0.7232486 ],
       [0.48704699, 0.51295301],
       [0.85064424, 0.14935576],
       [0.74601464, 0.25398536],
       [0.4826039 , 0.5173961 ],
       [0.2213151 , 0.7786849 ],
       [0.68433994, 0.31566006],
       [0.48810558, 0.51189442],
       [0.69125581, 0.30874419],
       [0.54355521, 0.45644479],
       [0.74696746, 0.25303254],
       [0.7136915 , 0.2863085 ],
       [0.34993452, 0.65006548],
       [0.66186093, 0.33813907]])
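The first column is the predicted probability of class 0 and the second that of class 1; for a binary model, predict assigns class 1 exactly when the second column exceeds 0.5. A quick sanity check (a sketch, not part of the original notebook):
labels = (model1.predict_proba(Xb_test)[:, 1] > 0.5).astype(int)
np.array_equal(labels, model1.predict(Xb_test))  # expect True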
Run GridSearchCV on the training data to find the best $C$ in (0.001, 0.01, 0.1, 1, 10, 100) for logit with an $\ell^1$ penalty.
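A possible solution sketch (the 5-fold cross-validation and the raised max_iter are my choices, not specified by the exercise):
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(
    estimator=LogisticRegression(penalty="l1", solver="saga", max_iter=5000),
    param_grid={"C": (0.001, 0.01, 0.1, 1, 10, 100)},
    cv=5
)
cv.fit(Xb_train, yb_train)
print(cv.best_params_)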
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_estimator(
estimator=model1,
X=Xb_test,
y=yb_test
)
plt.show()
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(
estimator=model1,
X=Xb_test,
y=yb_test,
)
plt.show()
model = LogisticRegression(penalty="l1", C=0.1, solver="saga")
model.fit(Xm_train, ym_train)
score_train = model.score(Xm_train, ym_train)
score_test = model.score(Xm_test, ym_test)
print("fraction of correct predictions on training data is", score_train)
print("fraction of correct predictions on test data is", score_test)
fraction of correct predictions on training data is 0.5375
fraction of correct predictions on test data is 0.55
ConfusionMatrixDisplay.from_estimator(estimator=model, X=Xm_test, y=ym_test)
plt.show()
# Generate data
np.random.seed(0)
size = 100
x = np.linspace(-2, 2, size)
y = 2.9 * np.sin(1.5 * x) + 2*np.random.normal(size=size)
# View data
plt.scatter(x, y)
plt.show()
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree
model = DecisionTreeRegressor(max_depth=3)
model.fit(x.reshape(-1,1), y)
plot_tree(model)
plt.show()
yhat = model.predict(x.reshape(-1, 1))
plt.scatter(x, y, label="data")
plt.plot(x, 2.9 * np.sin(1.5 * x), c=sns.color_palette()[1], label="true")
plt.plot(x, yhat, c=sns.color_palette()[2], label="fitted")
plt.legend()
plt.show()
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
model = DecisionTreeClassifier(max_depth=3)
model.fit(Xb_train, yb_train)
plot_tree(model)
plt.show()
model.feature_importances_
array([0.38800201, 0. , 0. , 0. , 0. , 0. , 0. , 0.13804098, 0. , 0.10808874, 0. , 0. , 0.04526492, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.095504 , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0.0983576 , 0.12674176, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ])
Fit the decision tree classifier on the training data for max_depth in (2, 3, 4, 5, 6) and plot the train and test scores as a function of max_depth.
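A sketch of one approach (random_state=0 is my assumption, added for reproducibility):
depths = (2, 3, 4, 5, 6)
train_scores = []
test_scores = []
for depth in depths:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
    tree.fit(Xb_train, yb_train)
    train_scores.append(tree.score(Xb_train, yb_train))
    test_scores.append(tree.score(Xb_test, yb_test))
plt.plot(depths, train_scores, label="train")
plt.plot(depths, test_scores, label="test")
plt.xlabel("max_depth")
plt.ylabel("fraction correct")
plt.legend()
plt.show()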
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(
max_depth=3,
random_state=0
)
model.fit(Xr_train, yr_train)
score_train = model.score(Xr_train, yr_train)
score_test = model.score(Xr_test, yr_test)
print("R-squared on training data is", score_train)
print("R-squared on test data is", score_test)
R-squared on training data is 0.7841535694978208
R-squared on test data is -0.48772625535569736
The negative test R-squared means the tree predicts worse out of sample than simply using the mean of yr: with only one informative feature among 50, the tree largely fits noise.
A random forest fits many decision trees, each to a bootstrapped sample of the data with a random subset of features considered at each split, and then averages the prediction in each tree.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(
max_depth=3,
n_estimators=100,
random_state=0
)
model.fit(Xb_train, yb_train)
score_train = model.score(Xb_train, yb_train)
score_test = model.score(Xb_test, yb_test)
print("fraction correct on training data is", score_train)
print("fraction correct on test data is", score_test)
fraction correct on training data is 1.0
fraction correct on test data is 0.65
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(
max_depth=3,
n_estimators=100,
random_state=0
)
model.fit(Xr_train, yr_train)
score_train = model.score(Xr_train, yr_train)
score_test = model.score(Xr_test, yr_test)
print("R-squared on training data is", score_train)
print("R-squared on test data is", score_test)
R-squared on training data is 0.817628313823389
R-squared on test data is 0.41907924542639485
Run GridSearchCV on the training data for RandomForestClassifier to find the best max_depth in (2, 3, 4, 5, 6).
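A possible sketch, mirroring the logistic-regression grid search above (the 5-fold cross-validation and random_state=0 are my assumptions):
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(
    estimator=RandomForestClassifier(n_estimators=100, random_state=0),
    param_grid={"max_depth": (2, 3, 4, 5, 6)},
    cv=5
)
cv.fit(Xb_train, yb_train)
print(cv.best_params_)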
from sklearn.ensemble import (
GradientBoostingRegressor,
GradientBoostingClassifier,
AdaBoostRegressor,
AdaBoostClassifier
)
from xgboost import XGBRegressor, XGBClassifier
model = XGBRegressor(
max_depth=4,
learning_rate=0.01,
n_estimators=100
)
model.fit(Xr_train, yr_train)
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, device=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, feature_types=None, gamma=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.01, max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=4, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, multi_strategy=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, random_state=None, ...)
score_train = model.score(Xr_train, yr_train)
score_test = model.score(Xr_test, yr_test)
print("R-squared on training data is", score_train)
print("R-squared on test data is", score_test)
R-squared on training data is 0.7765080341299813
R-squared on test data is 0.2969226702834281
# binary
from sklearn.datasets import load_breast_cancer
X1, y1 = load_breast_cancer(return_X_y=True)
print(f"(numobs, numfeatures) = {X1.shape}")
print(f"target values are {np.unique(y1)}")
(numobs, numfeatures) = (569, 30)
target values are [0 1]
# multi-class
from sklearn.datasets import load_digits
X2, y2 = load_digits(return_X_y=True)
print(f"(numobs, numfeatures) = {X2.shape}")
print(f"target values are {np.unique(y2)}")
(numobs, numfeatures) = (1797, 64)
target values are [0 1 2 3 4 5 6 7 8 9]
# regression
from sklearn.datasets import load_diabetes
X3, y3 = load_diabetes(return_X_y=True)
print(f"(numobs, numfeatures) = {X3.shape}")
print(f"target values are {np.unique(y3)}")
(numobs, numfeatures) = (442, 10)
target values are [ 25. 31. 37. 39. 40. 42. 43. 44. 45. 47. 48. 49. 50. 51. 52. 53. 54. 55. 57. 58. 59. 60. 61. 63. 64. 65. 66. 67. 68. 69. 70. 71. 72. 73. 74. 75. 77. 78. 79. 80. 81. 83. 84. 85. 86. 87. 88. 89. 90. 91. 92. 93. 94. 95. 96. 97. 98. 99. 100. 101. 102. 103. 104. 107. 108. 109. 110. 111. 113. 114. 115. 116. 118. 120. 121. 122. 123. 124. 125. 126. 127. 128. 129. 131. 132. 134. 135. 136. 137. 138. 139. 140. 141. 142. 143. 144. 145. 146. 147. 148. 150. 151. 152. 153. 154. 155. 156. 158. 160. 161. 162. 163. 164. 166. 167. 168. 170. 171. 172. 173. 174. 175. 177. 178. 179. 180. 181. 182. 183. 184. 185. 186. 187. 189. 190. 191. 192. 195. 196. 197. 198. 199. 200. 201. 202. 206. 208. 209. 210. 212. 214. 215. 216. 217. 219. 220. 221. 222. 225. 229. 230. 232. 233. 235. 236. 237. 241. 242. 243. 244. 245. 246. 248. 249. 252. 253. 257. 258. 259. 261. 262. 263. 264. 265. 268. 270. 272. 273. 274. 275. 276. 277. 279. 280. 281. 283. 288. 292. 293. 295. 296. 297. 302. 303. 306. 308. 310. 311. 317. 321. 332. 336. 341. 346.]