Source: View original notebook on GitHub
Category: Machine Learning / Learn ML
Challenge - Hardwork Pays Off
About Walkatime: Since you are a student of Coding Blocks, you are expected to work hard and get better results than others. To track your time and expected performance in the Machine Learning Challenge, Prateek bhaiya has asked you to install Walkatime on your device — an efficient time-tracking tool that records your daily coding activity.
Challenge
In this challenge, Prateek bhaiya gives you Walkatime data of his past students and how they performed in the evaluation exam. Your task is to predict the score you will get given the amount of time you spend on coding daily. Input: You are given one feature corresponding to the time noted by Walkatime. Output: A scalar denoting the level of performance a student achieved by devoting the given time.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# 1. loading Dataset
# Training features, training targets, and the unlabeled test features are
# read from single-column CSVs (shapes printed below: 3750 train rows, 1 feature).
X = pd.read_csv('Datasets/Assignment1_x_train.csv')
Y = pd.read_csv('Datasets/Assignment1_y_train.csv')
test = pd.read_csv('Datasets/Assignment1_x_test.csv')
X.shape, Y.shape
Output:
((3750, 1), (3750, 1))
X.describe()  # summary stats for the single feature (mean ~0, std ~1)
Output:
x
count 3750.000000
mean -0.037795
std 0.992212
min -3.546299
25% -0.698443
50% -0.035028
75% 0.629425
max 4.091393
Y.describe() # continuous target values, so this is a regression problem
Output:
y
count 3750.000000
mean 0.683869
std 81.102629
min -286.959739
25% -54.036989
50% 0.180640
75% 54.695511
max 348.899461
plt.scatter(X,Y)  # visualize feature vs target to inspect the relationship
Output:
<matplotlib.collections.PathCollection at 0x13b8ebb0>
# 2. Data preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split BEFORE scaling so the scaler is fit on training rows only.
# Fitting the scaler on all of X (as before) leaks statistics from the
# held-out rows into preprocessing and inflates the validation score.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

s = StandardScaler()
X_train = s.fit_transform(X_train)   # fit on training portion only
X_test = s.transform(X_test)         # apply the same train-fitted transform
test = s.transform(test)             # ...and to the unlabeled test set
from sklearn.linear_model import LinearRegression

# NOTE: `normalize=` was deprecated in scikit-learn 0.24 and REMOVED in 1.2,
# so passing it crashes on current versions. The data is already standardized
# above, and the remaining arguments were the defaults anyway.
model = LinearRegression()
model.fit(X_train, Y_train)
Output:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
# model.predict(X_test)
model.coef_  # slope learned for the single feature (~80.03 per the output below)
Output:
array([[80.03060346]])
model.intercept_  # fitted bias term (~0.82 per the output below)
Output:
array([0.81652466])
model.score(X_test,Y_test) # R^2 on the held-out split -- ~0.97, a good fit
Output:
0.9698213937984322
# Predict on the unlabeled test set and write a one-column CSV with header 'y'.
output = model.predict(test)
np.savetxt('Datasets/Assignment1_result_output.csv',output, delimiter= ',',header='y',comments='')
# Read the file back to sanity-check what was written.
df = pd.read_csv('Datasets/Assignment1_result_output.csv')
df # result: scored 97% after submission
Output:
y
0 -147.627427
1 -66.238965
2 -200.243138
3 226.529781
4 51.541083
5 -100.650035
6 -12.012615
7 27.427134
8 -62.420167
9 -117.265198
10 -10.554614
11 9.689493
12 -36.842677
13 68.492889
14 11.801847
15 -9.333989
16 26.667465
17 108.026288
18 177.635113
19 23.262928
20 -30.996375
21 22.355121
22 -52.445547
23 -147.105662
24 69.788052
25 110.420358
26 155.665626
27 11.984797
28 34.833908
29 -92.007050
... ...
1220 -12.654623
1221 34.548221
1222 63.982002
1223 40.029062
1224 -90.407882
1225 -101.843792
1226 63.976112
1227 -101.386852
1228 56.849624
1229 -80.194740
1230 -85.748081
1231 25.527114
1232 -97.182339
1233 -0.121599
1234 -34.762941
1235 -71.215933
1236 -36.297917
1237 -25.084648
1238 91.807021
1239 76.414579
1240 -9.507748
1241 -0.688644
1242 89.517671
1243 31.870062
1244 -72.715199
1245 -9.691792
1246 79.067324
1247 14.191727
1248 -18.563751
1249 -51.024175
[1250 rows x 1 columns]
Thank You!!
Using code from Scratch
def hypothesis(X, theta):
    """Linear model prediction: theta[0] (intercept) + theta[1] (slope) * X."""
    intercept, slope = theta[0], theta[1]
    return slope * X + intercept
def gradient(X, Y, theta):
    """Gradient of the (un-averaged) squared-error cost w.r.t. theta.

    Returns a length-2 array:
    [d(cost)/d(theta0), d(cost)/d(theta1)].
    """
    # Per-sample prediction error for the line theta0 + theta1 * x.
    residual = (theta[0] + theta[1] * X) - Y
    return np.array([np.sum(residual), np.sum(residual * X)])
def error(X, Y, theta):
    """Total squared-error cost: (1/2) * sum of squared residuals for theta."""
    residual = (theta[0] + theta[1] * X) - Y
    return 0.5 * np.sum(residual ** 2)
def gradientDescent(X, Y, learning_rate=0.0001):
    """Fit a two-parameter linear model by batch gradient descent.

    Starts from theta = [0, 0] and runs a fixed budget of 1000 update
    steps, recording the cost before each step.

    Returns
    -------
    (theta, error_list) : the fitted [intercept, slope] and the cost history.
    """
    theta = np.zeros(2)
    max_itr = 1000  # fixed iteration count instead of a convergence test
    error_list = []
    for _ in range(max_itr):
        error_list.append(error(X, Y, theta))
        # Vectorized form of the per-component rule:
        #   theta_j <- theta_j - lr * d(cost)/d(theta_j)
        theta = theta - learning_rate * gradient(X, Y, theta)
    return theta, error_list
# Run the scratch gradient descent. Name the cost history `error_list` --
# the original `theta, error = ...` shadowed the error() function above,
# making it uncallable afterwards.
theta, error_list = gradientDescent(X_train, Y_train)
theta  # matches sklearn's fit: intercept ~0.817, slope ~80.03
Output:
array([ 0.81652466, 80.03060346])
# Predict on the (already scaled) test features with the scratch parameters.
output = hypothesis(test, theta)
output
Output:
array([[-147.62742695],
[ -66.23896503],
[-200.24313801],
...,
[ 14.19172725],
[ -18.56375107],
[ -51.0241749 ]])
# Write the scratch-model predictions as a one-column CSV with header 'y'.
np.savetxt('Datasets/Assignment1_result_output_scratch.csv',output, delimiter= ',',header='y',comments='')
# Read back the written predictions to verify the file contents.
df = pd.read_csv('Datasets/Assignment1_result_output_scratch.csv')
df # result: scored 97% after submission (identical to the sklearn model)
Output:
y
0 -147.627427
1 -66.238965
2 -200.243138
3 226.529781
4 51.541083
5 -100.650035
6 -12.012615
7 27.427134
8 -62.420167
9 -117.265198
10 -10.554614
11 9.689493
12 -36.842677
13 68.492889
14 11.801847
15 -9.333989
16 26.667465
17 108.026288
18 177.635113
19 23.262928
20 -30.996375
21 22.355121
22 -52.445547
23 -147.105662
24 69.788052
25 110.420358
26 155.665626
27 11.984797
28 34.833908
29 -92.007050
... ...
1220 -12.654623
1221 34.548221
1222 63.982002
1223 40.029062
1224 -90.407882
1225 -101.843792
1226 63.976112
1227 -101.386852
1228 56.849624
1229 -80.194740
1230 -85.748081
1231 25.527114
1232 -97.182339
1233 -0.121599
1234 -34.762941
1235 -71.215933
1236 -36.297917
1237 -25.084648
1238 91.807021
1239 76.414579
1240 -9.507748
1241 -0.688644
1242 89.517671
1243 31.870062
1244 -72.715199
1245 -9.691792
1246 79.067324
1247 14.191727
1248 -18.563751
1249 -51.024175
[1250 rows x 1 columns]
