Source: View original notebook on GitHub
Category: Machine Learning / Learn ML
Linear Regression over Multivariate data
Let's work this time with the Boston housing dataset from sklearn.datasets.
- in single-feature datasets the hypothesis is a line: h(x) = theta0 + theta1*x
- in two-feature datasets it is a plane: h(x) = theta0 + theta1*x1 + theta2*x2
- with more than two features it is a hyperplane
# NOTE: sklearn.datasets.load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2, so fall back to fetching the identical dataset from OpenML
# when the convenience loader is unavailable.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
    X = boston.data
    Y = boston.target
except ImportError:
    from sklearn.datasets import fetch_openml
    boston = fetch_openml(name="boston", version=1, as_frame=False)
    X = boston.data
    Y = boston.target.astype(float)
print(X.shape) # see number of features are 13
print(Y.shape)
Output:
(506, 13)
(506,)
1. Using sklearn to do the calculation of theta values for us
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# 1. splitting data in training and test part
# hold out 20% of the samples for evaluation; random_state fixes the shuffle
# so the split (and the numbers printed below) are reproducible
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=101)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
Output:
(404, 13) (404,)
(102, 13) (102,)
# 1. creating an object of model
# NOTE: the `normalize` parameter was deprecated in scikit-learn 1.0 and removed
# in 1.2. For ordinary least squares it only affected the internal solve, not
# the predictions, so plain LinearRegression() is equivalent; scale the features
# explicitly with StandardScaler if normalization is wanted.
alg = LinearRegression()
# 2. training model on training data
alg.fit(X_train,Y_train)
Output:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
# 3. predicting the output
Y_pred = alg.predict(X_test)
%matplotlib inline
# 4. comparing Y_pred with Y_test using plotting to see how good our model is: graphically
import matplotlib.pyplot as plt
import numpy as np
# each point is (true value, predicted value); a perfect model would place
# every point exactly on the Y=X reference line
plt.scatter(Y_test,Y_pred)
ranger = np.arange(5,50)
# red Y=X line to judge visually how far predictions deviate from the truth
plt.plot(ranger,ranger,'r',label = 'Y=X')
plt.axis([0,60,0,60])
plt.legend()
plt.show()
Accuracy of our Model
Coefficient of Determination (R squared)
R^2 = 1-u/v
where
u is the residual sum of squares ((y_true - y_pred) ** 2).sum() and
v is the total sum of squares ((y_true - y_true.mean()) ** 2).sum().
Interpretation of R^2
- Basically we are measuring how good or bad our algorithm is relative to the baseline of predicting the mean target value for every sample in X_test
- 1.) if our algorithm predicts only as well as the mean value, then u == v and R^2 is zero
- 2.) if our algorithm predicts values close to the actual values, then u < v and R^2 moves toward one -> a higher score is a better score
- 3.) if our algorithm predicts values far from the actual values (worse than the mean baseline), then u > v and R^2 becomes negative
Result
The higher the score (coefficient of determination), the better our algorithm.
# score() internally predicts values for X_test using the fitted model and then
# compares them with Y_test (passed as the second argument) via R^2 = 1 - u/v
test_score = alg.score(X_test,Y_test)
train_score = alg.score(X_train,Y_train) # how well the model fits the data it was trained on
test_score, train_score
Output:
(0.7034623076515882, 0.7513747062933072)
Coefficients and intercept
print(len(alg.coef_)) # coef_ holds one weight per feature (13 here); with the separate intercept (theta0) that is 14 parameters in total
alg.coef_ # the thirteen feature weights theta1..theta13
Output:
13
array([-9.89913563e-02, 4.25047578e-02, 1.67390094e-02, 3.06437882e+00,
-1.56462528e+01, 4.01888422e+00, -8.31558474e-04, -1.44628034e+00,
2.67827281e-01, -1.04725485e-02, -8.88651815e-01, 8.25262805e-03,
-5.49367192e-01])
alg.intercept_ # theta0, the one remaining parameter, stored separately from coef_
Output:
32.906130908637024
2. Writing the above algorithm from scratch for multivariate data using gradient descent
# sanity-check the split shapes before implementing gradient descent from scratch
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
Output:
(404, 13) (404,)
(102, 13) (102,)
# theta is the full parameter vector np.array([theta0, theta1, ..., thetaN])
def error(X, Y, theta):
    """Squared-error cost 0.5 * sum_i (h(x_i) - y_i)^2 for parameters theta."""
    residuals = hypothesis(X, theta) - Y
    return 0.5 * np.sum(residuals ** 2)
def hypothesis(X, theta):
    """Return the prediction h(x_i) for every sample row of X.

    theta[0] is the intercept and theta[1:] are the per-feature weights,
    so the result is an array of length X.shape[0].
    """
    return X @ theta[1:] + theta[0]
def gradient(X, Y, theta):
    """Gradient of error() with respect to theta.

    Returns an array of length n_features + 1 where grad[0] is the partial
    derivative for the intercept, sum(hx - y), and grad[i] for i >= 1 is
    sum((hx - y) * X[:, i-1]).
    """
    grad = np.zeros(X.shape[1] + 1)  # n-featured data have (n+1) parameters.
    hx = hypothesis(X, theta)
    residual = hx - Y
    grad[0] = np.sum(residual)
    # d(cost)/d(theta[i]) = sum((hx - y) * x[i]); the original per-feature
    # Python loop is replaced by one vectorized matrix-vector product
    grad[1:] = X.T @ residual
    return grad
def gradientDescent_multivariate(X, Y, learning_rate=0.0001, tol=0.00001, max_iters=100000):
    """Batch gradient descent for multivariate linear regression.

    Parameters
    ----------
    X : (n_samples, n_features) feature matrix (should be standardized).
    Y : (n_samples,) target values.
    learning_rate : step size for each parameter update.
    tol : convergence threshold — stop once the absolute change in the cost
        between iterations falls below this (same 1e-5 cutoff as before).
    max_iters : safety cap on iterations. The original `while True` loop could
        run forever if the cost diverges (e.g. learning_rate too large); this
        bounds the run while leaving converging runs unaffected.

    Returns
    -------
    (theta, error_list) : learned parameter vector of length n_features + 1
        and the cost recorded after every update (including the initial cost).
    """
    theta = np.zeros(X.shape[1] + 1)
    error_list = [error(X, Y, theta)]
    for _ in range(max_iters):
        grad = gradient(X, Y, theta)
        theta = theta - learning_rate * grad
        err = error(X, Y, theta)
        error_change = abs(err - error_list[-1])
        error_list.append(err)
        if error_change < tol:
            break
    return theta, error_list
# using sklearn to do the scaling data
from sklearn.preprocessing import StandardScaler
sd = StandardScaler()
# NOTE(review): the scaler is fit on ALL of X before splitting, which leaks
# test-set statistics into training. Acceptable for this tutorial (it lets the
# scratch results match sklearn's above), but in practice fit on X_train only.
X = sd.fit_transform(X)
# or can be done using X = (X-X.mean(axis=0)) / X.std(axis=0)
# 1. splitting data in training and test part
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=101) # splitting exactly same as we did above
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)
Output:
(404, 13) (404,)
(102, 13) (102,)
# run the scratch gradient descent on the standardized training data
tht,error_list = gradientDescent_multivariate(X_train,Y_train)
# tht[0] is the intercept (theta0); tht[1:] are the 13 feature weights
tht
Output:
array([22.33658176, -0.85004138, 0.98937129, 0.1119804 , 0.77801762,
-1.81071759, 2.82152356, -0.02390664, -3.0425964 , 2.32256481,
-1.75532792, -1.92170291, 0.75262121, -3.9187913 ])
# predict on the test set with the learned parameters and compare graphically,
# just as we did for the sklearn model above
Y_pred = hypothesis(X_test,tht)
plt.scatter(Y_test,Y_pred)
ranger = np.arange(5,50)
plt.plot(ranger,ranger,'r',label = 'Y=X')
plt.axis([0,60,0,60])
plt.legend()
plt.show()
from sklearn.metrics import r2_score
# R^2 on the held-out test set for the scratch model
test_score = r2_score(Y_test,Y_pred)
Y_train_pred = hypothesis(X_train,tht)
train_score = r2_score(Y_train,Y_train_pred)
test_score,train_score # same as we got from sklearn
Output:
(0.7034337844873662, 0.7513745935685816)
Note -> how score is calculated using sklearn.metric
from sklearn.metrics import mean_absolute_error,mean_squared_error
# u: mean squared error of our predictions. R^2 is defined with the residual
# SUM of squares, but the 1/n factor cancels in the ratio u/v below, so MSE
# gives the identical score.
u = mean_squared_error(Y_train,Y_train_pred)
u
Output:
19.19822645644905
# v: mean squared error of the trivial baseline that always predicts the mean
Y_train_mean_arr = np.full(Y_train.shape, Y_train.mean())
v = mean_squared_error(Y_train,Y_train_mean_arr)
v
Output:
77.21747641162631
# R^2 = 1 - u/v, matching alg.score() and r2_score() above
score = 1 - u/v
score
Output:
0.7513745935685816
