Skip to main content

Using code from Scratch

· 6 min read
Shaurya Singhal

Source: View original notebook on GitHub

Category: Machine Learning / Learn ML

Challenge — Air Pollution (https://www.kaggle.com/c/air-pollution-prediction/overview)

It is winter in Delhi, so Cody decided to walk to the news stand, and on arriving he was surprised by the air quality index of Delhi reported in the newspaper. He therefore collected air samples from different locations and took them to his lab, where he extracted five features from each sample that can be used to predict the air quality index, and combined these with the index values published in the newspapers. You are provided with the data collected by Cody, and your job is to design a machine learning model that, given the features extracted by Cody, predicts the air quality index.

Submit a file in the sample_submission format:

Id,target
0,ans0
1,ans1
2,ans2
3,ans3
4,ans4
5,ans5
6,ans6
# loading Dataset: five numeric feature columns plus a continuous `target`
import pandas as pd
train = pd.read_csv('Datasets/Assignment2_Train.csv')
train.head()  # eyeball the first rows to sanity-check the parse

Output:

feature_1  feature_2  feature_3  feature_4  feature_5     target
0 0.293416 -0.945599 -0.421105 0.406816 0.525662 -82.154667
1 -0.836084 -0.189228 -0.776403 -1.053831 0.597997 -48.897960
2 0.236425 0.132836 -0.147723 0.699854 -0.187364 77.270371
3 0.175312 0.143194 -0.581111 -0.122107 -1.292168 -2.988581
4 -1.693011 0.542712 -2.798729 -0.686723 1.244077 -37.596722
# (rows, columns): 1600 samples, 6 columns
train.shape # 5 are features; 1 is target

Output:

(1600, 6)
train.columns  # confirm column names before slicing features/target apart

Output:

Index(['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
'target'],
dtype='object')
# Feature matrix: every column except the last (`target`).
X = train.iloc[:, :-1]
X.head()

Output:

feature_1  feature_2  feature_3  feature_4  feature_5
0 0.293416 -0.945599 -0.421105 0.406816 0.525662
1 -0.836084 -0.189228 -0.776403 -1.053831 0.597997
2 0.236425 0.132836 -0.147723 0.699854 -0.187364
3 0.175312 0.143194 -0.581111 -0.122107 -1.292168
4 -1.693011 0.542712 -2.798729 -0.686723 1.244077
# Target vector: the last column, as a Series.
Y = train.iloc[:, -1]
Y.head()

Output:

0   -82.154667
1 -48.897960
2 77.270371
3 -2.988581
4 -37.596722
Name: target, dtype: float64
X.shape, Y.shape  # X is (1600, 5); Y is (1600,)

Output:

((1600, 5), (1600,))
type(X)  # still a pandas DataFrame

Output:

pandas.core.frame.DataFrame
import pandas as pd
# preprocessing 
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
# Fit the scaler on the training features only, then reuse the same
# train-derived mean/std on the test file (no leakage of test statistics).
X = pd.DataFrame(s.fit_transform(X))
# output for test file
test = pd.read_csv('Datasets/Assignment2_Test.csv')
# .values strips column names so transform() receives a plain ndarray
test = s.transform(test.values)
type(X)

Output:

pandas.core.frame.DataFrame
X.describe()  # after scaling: mean ~0, std ~1 per column

Output:

0             1             2             3             4
count 1.600000e+03 1.600000e+03 1.600000e+03 1.600000e+03 1.600000e+03
mean -2.331468e-17 -1.776357e-17 -1.776357e-17 2.331468e-17 8.881784e-18
std 1.000313e+00 1.000313e+00 1.000313e+00 1.000313e+00 1.000313e+00
min -3.394334e+00 -3.218189e+00 -3.073464e+00 -3.154539e+00 -2.927091e+00
25% -6.532217e-01 -6.631960e-01 -6.544315e-01 -6.560276e-01 -6.417809e-01
50% -4.487509e-03 -1.582564e-02 3.151454e-03 1.244233e-02 -2.609701e-02
75% 6.800261e-01 6.589081e-01 6.758504e-01 6.772709e-01 6.522049e-01
max 3.292885e+00 3.393682e+00 3.223719e+00 2.977582e+00 3.383015e+00
Y.describe()  # target left unscaled; spans roughly -380..338

Output:

count    1600.000000
mean 0.318835
std 110.741562
min -379.829794
25% -71.897040
50% -0.610665
75% 71.226603
max 337.643014
Name: target, dtype: float64
from sklearn.model_selection import train_test_split
# Hold out 25% for validation; fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=101)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

Output:

((1200, 5), (400, 5), (1200,), (400,))
from sklearn.linear_model import LinearRegression
# Ordinary least squares baseline; defaults fit an intercept.
model = LinearRegression()
model.fit(X_train, Y_train)

Output:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
model.coef_  # one learned weight per scaled feature

Output:

array([29.85207419, 94.83165412,  8.05996353, 45.23534964,  2.34253763])
model.intercept_  # bias term

Output:

0.6828984772412013
Y_pred = model.predict(X_test)
model.score(X_test,Y_test)  # R^2 on the held-out split

Output:

0.9682476621478511
from sklearn.metrics import r2_score
r2_score(Y_test,Y_pred)  # same value as model.score above, by definition

Output:

0.9682476621478511
%matplotlib inline
import matplotlib.pyplot as plt 
# Predictions vs. ground truth: points hugging a straight line mean a good fit.
plt.scatter(Y_test,Y_pred) # approximately linear

Output

Output:

<matplotlib.collections.PathCollection at 0x1286ca30>
# output for test file
# Wrap the scaled ndarray back into a DataFrame for inspection/prediction.
test = pd.DataFrame(test)
test.head()

Output:

0         1         2         3         4
0 1.014156 2.059621 -0.219462 -2.336264 -1.014474
1 -0.380266 0.960186 0.663580 0.734423 -0.375628
2 -1.031635 0.969787 -0.359367 -1.788688 0.453005
3 -2.501395 0.560382 0.638368 -0.555246 -1.355645
4 -0.389949 -0.789089 1.387127 -0.497137 0.408771
test.describe()  # roughly standardized, as expected from the train-fit scaler

Output:

0           1           2           3           4
count 400.000000 400.000000 400.000000 400.000000 400.000000
mean -0.025996 0.014487 0.033092 0.007419 0.019432
std 0.982084 1.000121 1.030062 1.017343 0.958216
min -2.576064 -2.902958 -2.692526 -2.949381 -2.560900
25% -0.791048 -0.600358 -0.727477 -0.676770 -0.614510
50% -0.008899 -0.048541 -0.064106 0.001629 0.036103
75% 0.679685 0.749801 0.789482 0.687178 0.587112
max 3.250423 2.564306 2.920374 2.659714 2.921637
# Predict air quality for the unlabeled test set.
output = model.predict(test)
output.shape

Output:

(400,)
type(output)  # ndarray returned by model.predict

Output:

numpy.ndarray
output.dtype  # float predictions

Output:

dtype('float64')
# Submission frame: the default RangeIndex already yields Ids 0..n-1,
# so hand-building the index with a list comprehension is unnecessary.
df = pd.DataFrame({'target': output})
df.head()

Output:

target
0 116.447831
1 118.077583
2 -20.894286
3 -43.994147
4 -96.138932
# Name the index `Id` so to_csv emits the Id,target layout the challenge expects.
df.index.name = 'Id'
df.head()

Output:

target
Id
0 116.447831
1 118.077583
2 -20.894286
3 -43.994147
4 -96.138932
# Write the submission file (the named index becomes the `Id` column).
df.to_csv('Datasets/Assignment2_output.csv') # 33 rank with score of 0.96802

Using code from Scratch

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# theta is np.array([theta0, theta1, ..., thetaN]): intercept first, then one
# weight per feature.


def hypothesis(X, theta):
    """Return h(Xi) for each sample row of X as a 1-D prediction array.

    X : (m, n) ndarray of features; theta : (n + 1,) parameter vector.
    """
    return np.sum(X * theta[1:], axis=1) + theta[0]


def error(X, Y, theta):
    """Half the sum of squared residuals (the OLS cost) for parameters theta."""
    hx = hypothesis(X, theta)
    return 0.5 * np.sum((hx - Y) ** 2)


def gradient(X, Y, theta):
    """Gradient of error() w.r.t. theta; n-featured data has n + 1 parameters."""
    grad = np.zeros(X.shape[1] + 1)
    residual = hypothesis(X, theta) - Y
    # d(cost)/d(theta0) = sum(hx - y); d(cost)/d(theta[i]) = sum((hx - y) * x[i])
    grad[0] = np.sum(residual)
    grad[1:] = residual @ X  # vectorized form of the per-column sums
    return grad


def gradientDescent_multivariate(X, Y, learning_rate=0.0001, tol=0.00001,
                                 max_iters=100000):
    """Batch gradient descent for multivariate linear regression.

    Starts from theta = 0 and iterates until the absolute change in cost
    drops below `tol`. `max_iters` caps the loop so a too-large learning
    rate (diverging cost) cannot spin forever, which the original
    `while True` loop would. Returns (theta, error_list).
    """
    theta = np.zeros(X.shape[1] + 1)
    error_list = [error(X, Y, theta)]

    for _ in range(max_iters):
        theta = theta - learning_rate * gradient(X, Y, theta)
        err = error(X, Y, theta)
        error_change = abs(err - error_list[-1])
        error_list.append(err)
        if error_change < tol:
            break

    return theta, error_list
# NOTE: the original unpacked the cost history into `error`, clobbering the
# error() function defined above; use a distinct name for the history.
theta, error_history = gradientDescent_multivariate(X_train.values, Y_train.values)
theta

Output:

array([ 0.68287854, 29.85199475, 94.83151151,  8.05984701, 45.23524455,
2.34249223])
# prediction
# prediction
test.shape  # 400 unlabeled samples, 5 scaled features

Output:

(400, 5)
# Manual h(x) = theta0 + sum(theta_i * feature_i) over the 5 columns;
# equivalent to hypothesis(test.values, theta) from the scratch code above.
hypo = theta[0] + theta[1]*test[0] + theta[2]*test[1] + theta[3]*test[2] + theta[4]*test[3] + theta[5]*test[4]
hypo.shape

Output:

(400,)
output = hypo.values  # Series -> ndarray, matching the sklearn path earlier
type(output)

Output:

numpy.ndarray
output.dtype  # float predictions

Output:

dtype('float64')
# Submission frame: the default RangeIndex already yields Ids 0..n-1,
# so hand-building the index with a list comprehension is unnecessary.
df = pd.DataFrame({'target': output})
df.head()

Output:

target
0 116.447754
1 118.077318
2 -20.894153
3 -43.994003
4 -96.138936
# Name the index `Id` so to_csv emits the Id,target layout the challenge expects.
df.index.name = 'Id'
df.head()

Output:

target
Id
0 116.447754
1 118.077318
2 -20.894153
3 -43.994003
4 -96.138936
# Scratch-model submission; scores essentially the same as the sklearn model.
df.to_csv('Datasets/Assignment2_output_scratch.csv') # 33 rank with score of 0.96802

Thank you!