Midterm
Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
Importing Dataset
from sklearn.datasets import fetch_california_housing
# Load the California housing data with pandas-backed containers.
data = fetch_california_housing(as_frame=True)

# Feature names, later used as column labels for the feature matrix.
X_names = data.feature_names

# NumPy copies of the feature table and the target column.
X, y = np.array(data.data), np.array(data.target)

# Full frame (used for the correlation table in Question 15).
df = data.frame
Define KFold
def DoKFold(model, X, y, k, standardize=False, random_state=146):
    """Run shuffled k-fold cross-validation and collect per-fold metrics.

    Parameters
    ----------
    model : object with ``fit``, ``score`` and ``predict`` methods
        Refit in place on the training part of every fold.
    X : array-like, 2-D
        Feature matrix. NumPy arrays and pandas DataFrames are both accepted.
    y : array-like, 1-D
        Target values, one per row of ``X``.
    k : int
        Number of folds.
    standardize : bool, default False
        When true, a ``StandardScaler`` is fit on each training fold and
        then applied to both the training and the testing fold.
    random_state : int, default 146
        Seed handed to ``KFold`` for the shuffle.

    Returns
    -------
    train_scores, test_scores, train_mse, test_mse : list
        One entry per fold: the value of ``model.score`` on the training and
        testing part, and the mean squared error on the training and testing
        part.
    """
    # The folds are selected with positional indexing (X[idx, :]), which
    # pandas objects do not support; convert once so both kinds of input work.
    X = np.asarray(X)
    y = np.asarray(y)

    if standardize:
        from sklearn.preprocessing import StandardScaler as SS
        ss = SS()

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)

    train_scores = []
    test_scores = []
    train_mse = []
    test_mse = []

    for idxTrain, idxTest in kf.split(X):
        Xtrain = X[idxTrain, :]
        Xtest = X[idxTest, :]
        ytrain = y[idxTrain]
        ytest = y[idxTest]

        if standardize:
            # Fit the scaler on the training fold only, then reuse it for
            # the testing fold.
            Xtrain = ss.fit_transform(Xtrain)
            Xtest = ss.transform(Xtest)

        model.fit(Xtrain, ytrain)

        train_scores.append(model.score(Xtrain, ytrain))
        test_scores.append(model.score(Xtest, ytest))

        ytrain_pred = model.predict(Xtrain)
        ytest_pred = model.predict(Xtest)

        train_mse.append(np.mean((ytrain - ytrain_pred) ** 2))
        test_mse.append(np.mean((ytest - ytest_pred) ** 2))

    return train_scores, test_scores, train_mse, test_mse
Question 15
# Pairwise correlation table of all columns in df. This is a bare expression,
# so the table is only shown where the last value of a cell is displayed
# (e.g. a notebook).
df.corr()
We can see from the returned table that MedInc is most correlated with the target.
Question 16
from sklearn.preprocessing import StandardScaler as SS

# Standardize every feature column of X.
ss = SS()
X1 = ss.fit_transform(X)

# Build the table from the standardized features (X1), not from the raw X,
# so the correlations shown really are those of the standardized data.
df1 = pd.DataFrame(X1, columns=X_names)
df1['MedHouseVal'] = y
df1.corr()
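The result is the same as in Question 15: standardizing a feature only shifts and rescales it, and correlation coefficients are unchanged by that.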
Question 17
# Keep the single-feature table under its own name. Rebinding X here would
# replace the full feature matrix that the later questions pass to DoKFold.
X_medinc = data.data[['MedInc']]
lin_reg = LinearRegression()
np.round(lin_reg.fit(X_medinc, y).score(X_medinc, y), 2)
Result: 0.47
Question 18
Run the cell that defines DoKFold (see “Define KFold” above), then run the following code:
# 20-fold cross-validation of the linear model with standardization enabled.
k = 20
train_scores, test_scores, train_mse, test_mse = DoKFold(
    lin_reg, X, y, k, standardize=True
)

# First line: mean train/test score. Second line: mean train/test MSE.
for train_vals, test_vals in ((train_scores, test_scores), (train_mse, test_mse)):
    print(np.mean(train_vals), np.mean(test_vals))
Results:
0.6063019182717753 0.6019800920504694
0.524225284184374 0.5287980265178303
Question 19
from sklearn.linear_model import Ridge
# Ridge search: 101 evenly spaced alpha values from 20 to 30, each scored
# with 20-fold cross-validation and standardization enabled.
k = 20
rid_a_range = np.linspace(20, 30, 101)

rid_train, rid_test = [], []
rid_train_mse, rid_test_mse = [], []

for a in rid_a_range:
    mdl = Ridge(alpha=a)
    train, test, train_mse, test_mse = DoKFold(mdl, X, y, k, standardize=True)
    # Store the fold average of each metric for this alpha.
    for history, fold_values in (
        (rid_train, train),
        (rid_test, test),
        (rid_train_mse, train_mse),
        (rid_test_mse, test_mse),
    ):
        history.append(np.mean(fold_values))

# Report the alpha with the highest mean testing score.
idx = np.argmax(rid_test)
print('Optimal alpha value: ' + format(rid_a_range[idx], '.5f'))
print('Training score for this value: ' + format(rid_train[idx], '.5f'))
print('Testing score for this value: ' + format(rid_test[idx], '.5f'))
Result:
Optimal alpha value: 25.80000
Training score for this value: 0.60627
Testing score for this value: 0.60201
Question 20
Same procedure as Question 19, but with Lasso in place of Ridge and alpha searched between 0.001 and 0.003:
from sklearn.linear_model import Lasso
# Lasso search: 101 evenly spaced alpha values from 0.001 to 0.003, each
# scored with k-fold cross-validation and standardization enabled.
las_a_range = np.linspace(0.001, 0.003, 101)

las_train, las_test = [], []
las_train_mse, las_test_mse = [], []

for a in las_a_range:
    mdl = Lasso(alpha=a)
    train, test, train_mse, test_mse = DoKFold(mdl, X, y, k, standardize=True)
    # Store the fold average of each metric for this alpha.
    for history, fold_values in (
        (las_train, train),
        (las_test, test),
        (las_train_mse, train_mse),
        (las_test_mse, test_mse),
    ):
        history.append(np.mean(fold_values))

# Report the alpha with the highest mean testing score.
idx = np.argmax(las_test)
print('Optimal alpha value: ' + format(las_a_range[idx], '.5f'))
print('Training score for this value: ' + format(las_train[idx], '.5f'))
print('Testing score for this value: ' + format(las_test[idx], '.5f'))
Result:
Optimal alpha value: 0.00186
Training score for this value: 0.60616
Testing score for this value: 0.60213
Question 21
# Compare the coefficient of the feature at index 5 across the three models,
# each fit on X1 and y.
print(X_names[5])

lin = LinearRegression()
rid = Ridge(alpha=25.8)
las = Lasso(alpha=0.00186)

for estimator in (lin, rid, las):
    estimator.fit(X1, y)

(lin.coef_[5], rid.coef_[5], las.coef_[5])
Result:
AveOccup
(-0.039326266978148866, -0.039412573728940366, -0.03761823364553458)
Question 22
# Compare the coefficient of the feature at index 0 across the three models,
# each fit on X1 and y.
print(X_names[0])

lin, rid, las = LinearRegression(), Ridge(alpha=25.8), Lasso(alpha=0.00186)

for estimator in (lin, rid, las):
    estimator.fit(X1, y)

(lin.coef_[0], rid.coef_[0], las.coef_[0])
Result:
MedInc
(0.82961930428045, 0.8288892465528181, 0.8200140807502059)
Question 23
# Position of the Ridge alpha with the lowest mean testing MSE.
idx = np.argmin(rid_test_mse)

# Print alpha, train score, test score, train MSE and test MSE at that position.
print(*(
    series[idx]
    for series in (rid_a_range, rid_train, rid_test, rid_train_mse, rid_test_mse)
))
Result: the alpha with the lowest test MSE is 26.1, which differs from the 25.8 found in Question 19 (where the highest test score was used instead).
26.1 0.6062700593574847 0.6020111660228638 0.5242677048909694 0.5287556631434892
Question 24
Run the code for Question 20 first, then run the following code:
# Position of the Lasso alpha with the lowest mean testing MSE.
idx = np.argmin(las_test_mse)

# Print alpha, train score, test score, train MSE and test MSE at that position.
print(*(
    series[idx]
    for series in (las_a_range, las_train, las_test, las_train_mse, las_test_mse)
))
Results:
0.00186 0.6061563795668891 0.6021329052825213 0.524419071473502 0.5286007023316681
Reflection
In my original code, apart from a typo, I also didn’t standardize the data. Together, these two mistakes gave me an optimal alpha value of 0.00100. I immediately noticed that 0.00100 was a strange result, but I didn’t realize that I had skipped the standardization step. When I checked my code, all I found was the typo (in the last for loop, I mistakenly typed las_reg as rid_reg), and fixing it alone didn’t change the result. However, if standardization is added but the typo is not removed, the optimal alpha value would be 0.00244, so the typo is not insignificant either. I wrote the lines of code one after another, so it was quite difficult to locate the mistake after running everything at once. Breaking the code into small parts and running them one at a time would probably make errors easier to find. I also need to read the questions more carefully: checking the requirements of Question 20 would have reminded me about standardization.
I’ve attached the revised version of my original code below.
from sklearn.metrics import mean_squared_error
def DoKFold(model, X, y, k, standardize=False, random_state=146):
    """Return per-fold training and testing MSE from shuffled k-fold CV.

    ``model`` is refit on the training part of each of the ``k`` folds.
    When ``standardize`` is true, a ``StandardScaler`` is fit on the training
    part and applied to both parts before fitting. ``random_state`` seeds the
    fold shuffle.

    Returns a pair of lists ``(mse_train, mse_test)`` with one value per fold.
    """
    from sklearn.model_selection import KFold

    scaler = None
    if standardize:
        from sklearn.preprocessing import StandardScaler as SS
        scaler = SS()

    splitter = KFold(n_splits=k, shuffle=True, random_state=random_state)
    train_errors, test_errors = [], []

    for fit_rows, eval_rows in splitter.split(X):
        X_fit, X_eval = X[fit_rows, :], X[eval_rows, :]
        y_fit, y_eval = y[fit_rows], y[eval_rows]

        if scaler is not None:
            # Scaling parameters come from the training part only.
            X_fit = scaler.fit_transform(X_fit)
            X_eval = scaler.transform(X_eval)

        model.fit(X_fit, y_fit)

        train_errors.append(mean_squared_error(y_fit, model.predict(X_fit)))
        test_errors.append(mean_squared_error(y_eval, model.predict(X_eval)))

    return train_errors, test_errors
# Lasso search: 101 evenly spaced alpha values from 0.001 to 0.003, each
# evaluated with 20-fold cross-validation and standardization enabled.
a_range = np.linspace(0.001, 0.003, 101)
k = 20

avg_mse_train = []
avg_mse_test = []
for a in a_range:
    las_reg = Lasso(alpha=a)
    mse_train, mse_test = DoKFold(las_reg, X, y, k, standardize=True)
    # Keep the training average as well, so both lists line up with a_range.
    avg_mse_train.append(np.mean(mse_train))
    avg_mse_test.append(np.mean(mse_test))

# The optimal alpha is the one with the lowest mean testing MSE.
idx = np.argmin(avg_mse_test)
print('Optimal alpha value: ' + format(a_range[idx], '.5f'))