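Scikit-learn's quantile transformer maps each feature through its empirical distribution to a normal distribution, which tames outliers: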
from sklearn.preprocessing import QuantileTransformer
transformer = QuantileTransformer(output_distribution="normal")
transformer.fit_transform(X_train)
An alternative to the quantile transform is standardization: subtract each feature's mean and divide by its standard deviation. This puts features on a common scale but doesn't really fix outliers. Scikit-learn's standard scaler:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit_transform(X_train)
We can add the pre-processing and the model to a pipeline and fit/predict in one step:
from sklearn.pipeline import Pipeline
transformer = ...
model = ...
pipe = Pipeline(
    steps=[("transformer", transformer), ("model", model)]
)
pipe.fit(X_train, y_train)
pipe.predict(X_test)
pipe.score(X_test, y_test)
We can also pass pipe to GridSearchCV.
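Hyperparameters of a pipeline step are addressed as `<step name>__<parameter name>`. A minimal sketch, assuming the "model" step exposes an `alpha` penalty (as the lasso used below does):

from sklearn.model_selection import GridSearchCV
# tune the "alpha" parameter of the "model" step via cross-validation
cv = GridSearchCV(pipe, param_grid={"model__alpha": [0.1, 1, 10]})
cv.fit(X_train, y_train)
cv.best_params_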
import pandas as pd
df = pd.read_csv("housing.data")
y = df.median_house_value
X = df.drop(columns=["median_house_value"])
X.head()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income | ocean_proximity |
|---|---|---|---|---|---|---|---|---|---|
| 0 | -122.23 | 37.88 | 41.0 | 880.0 | 129.0 | 322.0 | 126.0 | 8.3252 | NEAR BAY |
| 1 | -122.22 | 37.86 | 21.0 | 7099.0 | 1106.0 | 2401.0 | 1138.0 | 8.3014 | NEAR BAY |
| 2 | -122.24 | 37.85 | 52.0 | 1467.0 | 190.0 | 496.0 | 177.0 | 7.2574 | NEAR BAY |
| 3 | -122.25 | 37.85 | 52.0 | 1274.0 | 235.0 | 558.0 | 219.0 | 5.6431 | NEAR BAY |
| 4 | -122.25 | 37.85 | 52.0 | 1627.0 | 280.0 | 565.0 | 259.0 | 3.8462 | NEAR BAY |
X.ocean_proximity.unique()
array(['NEAR BAY', '<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'ISLAND'], dtype=object)
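ocean_proximity is the only categorical feature; the remaining columns are numeric.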
# describe() summarizes only the numeric columns; ocean_proximity is skipped automatically
X.describe()
| | longitude | latitude | housing_median_age | total_rooms | total_bedrooms | population | households | median_income |
|---|---|---|---|---|---|---|---|---|
| count | 20639.000000 | 20639.000000 | 20639.000000 | 20639.000000 | 20432.000000 | 20639.000000 | 20639.000000 | 20639.000000 |
| mean | -119.569624 | 35.631680 | 28.640099 | 2635.755851 | 537.866729 | 1425.478608 | 499.538204 | 3.870743 |
| std | 2.003547 | 2.135846 | 12.585555 | 2181.667858 | 421.395028 | 1132.489526 | 382.338957 | 1.899840 |
| min | -124.350000 | 32.540000 | 1.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 0.499900 |
| 25% | -121.800000 | 33.930000 | 18.000000 | 1447.500000 | 296.000000 | 787.000000 | 280.000000 | 2.563800 |
| 50% | -118.490000 | 34.260000 | 29.000000 | 2127.000000 | 435.000000 | 1166.000000 | 409.000000 | 3.534900 |
| 75% | -118.010000 | 37.710000 | 37.000000 | 3148.000000 | 647.000000 | 1725.000000 | 605.000000 | 4.743400 |
| max | -114.310000 | 41.950000 | 52.000000 | 39320.000000 | 6445.000000 | 35682.000000 | 6082.000000 | 15.000100 |
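The count row shows that total_bedrooms has missing values, so we drop the incomplete rows and rebuild y and X: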
df = df.dropna()
y = df.median_house_value
X = df.drop(columns=["median_house_value"])
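Next we hold out 20% of the observations as a test set: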
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=0.2,
random_state=0
)
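We quantile-transform the numeric columns and one-hot encode ocean_proximity, combining both steps with a column transformer: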
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
qt = QuantileTransformer(output_distribution="normal")
dummies = OneHotEncoder()
transformer = make_column_transformer(
    (qt, X.columns[:-1]),       # quantile-transform the numeric columns
    (dummies, [X.columns[-1]])  # one-hot encode ocean_proximity
)
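To inspect the transformer's output, we apply it to X and wrap the result in a data frame; the last five columns are the one-hot dummies for ocean_proximity: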
pd.DataFrame(transformer.fit_transform(X)).head()
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.152175 | 0.899534 | 0.912774 | -1.355132 | -1.694923 | -1.697037 | -1.639539 | 1.919214 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 1 | -1.137677 | 0.882739 | -0.479432 | 1.813497 | 1.466763 | 1.193058 | 1.623705 | 1.914455 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 2 | -1.169401 | 0.875354 | 5.199338 | -0.654930 | -1.334066 | -1.317949 | -1.324958 | 1.629912 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 3 | -1.189522 | 0.875354 | 5.199338 | -0.874727 | -1.059682 | -1.176890 | -1.068522 | 1.083588 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
| 4 | -1.189522 | 0.875354 | 5.199338 | -0.471004 | -0.773140 | -1.164444 | -0.810910 | 0.191633 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 |
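Now we combine the transformer with a lasso regression in a single pipeline and fit it on the training data: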
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
model = Lasso(alpha=10, fit_intercept=False)
pipe = Pipeline(
    steps=[("transformer", transformer), ("model", model)]
)
pipe.fit(X_train, y_train)
score_train = pipe.score(X_train, y_train)
score_test = pipe.score(X_test, y_test)
print("R-squared on training data is", score_train)
print("R-squared on test data is", score_test)
R-squared on training data is 0.617063711730554
R-squared on test data is 0.6264072535855956
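Finally, we tune the lasso penalty by passing the pipeline to GridSearchCV: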
from sklearn.model_selection import GridSearchCV
alphas = (0.1, 1, 10, 100)
cv = GridSearchCV(
    pipe,
    param_grid={"model__alpha": alphas}  # the "alpha" parameter of the "model" step
)
cv.fit(X_train, y_train)  # refits on all training data with the best alpha
print(f"best alpha is {cv.best_params_}")
print(f"score on the test data is {cv.score(X_test, y_test)}")
best alpha is {'model__alpha': 10}
score on the test data is 0.6264072535855956
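Cross-validation picks alpha = 10, the value we used above, so the test-set R-squared is unchanged.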