Skip to main content

Using code from Scratch

· 6 min read
Shaurya Singhal

Source: View original notebook on GitHub

Category: Machine Learning / Learn ML

Challenge — Air Pollution (https://www.kaggle.com/c/air-pollution-prediction/overview)

It is winter in Delhi, so Cody decided to walk to the news stand, and on arriving he was surprised by the air quality index of Delhi reported in the newspaper. He therefore collected air samples from different locations and took them to his lab, where he extracted five features from each sample that can be used to predict the air quality index, and combined these with the index values published in the newspapers. You are provided with the data collected by Cody, and your job is to design a machine learning model that, given the features extracted by Cody, predicts the air quality index.

Submit a file in the sample_submission format:

Id,target
0,ans0
1,ans1
2,ans2
3,ans3
4,ans4
5,ans5
6,ans6
# loading Dataset: five numeric feature columns plus a continuous `target`
import pandas as pd
train = pd.read_csv('Datasets/Assignment2_Train.csv')
train.head()  # eyeball the first rows to sanity-check the parse

Output:

feature_1  feature_2  feature_3  feature_4  feature_5     target
0 0.293416 -0.945599 -0.421105 0.406816 0.525662 -82.154667
1 -0.836084 -0.189228 -0.776403 -1.053831 0.597997 -48.897960
2 0.236425 0.132836 -0.147723 0.699854 -0.187364 77.270371
3 0.175312 0.143194 -0.581111 -0.122107 -1.292168 -2.988581
4 -1.693011 0.542712 -2.798729 -0.686723 1.244077 -37.596722
# (rows, columns): 1600 samples, 6 columns
train.shape # 5 are features; 1 is target

Output:

(1600, 6)
train.columns  # confirm column names before slicing features/target apart

Output:

Index(['feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
'target'],
dtype='object')
# Feature matrix: every column except the last (`target`).
X = train.iloc[:, :-1]
X.head()

Output:

feature_1  feature_2  feature_3  feature_4  feature_5
0 0.293416 -0.945599 -0.421105 0.406816 0.525662
1 -0.836084 -0.189228 -0.776403 -1.053831 0.597997
2 0.236425 0.132836 -0.147723 0.699854 -0.187364
3 0.175312 0.143194 -0.581111 -0.122107 -1.292168
4 -1.693011 0.542712 -2.798729 -0.686723 1.244077
# Target vector: the last column, as a Series.
Y = train.iloc[:, -1]
Y.head()

Output:

0   -82.154667
1 -48.897960
2 77.270371
3 -2.988581
4 -37.596722
Name: target, dtype: float64
X.shape, Y.shape  # X is (1600, 5); Y is (1600,)

Output:

((1600, 5), (1600,))
type(X)  # still a pandas DataFrame

Output:

pandas.core.frame.DataFrame
import pandas as pd
# preprocessing 
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
# Fit the scaler on the training features only, then reuse the same
# train-derived mean/std on the test file (no leakage of test statistics).
X = pd.DataFrame(s.fit_transform(X))
# output for test file
test = pd.read_csv('Datasets/Assignment2_Test.csv')
# .values strips column names so transform() receives a plain ndarray
test = s.transform(test.values)
type(X)

Output:

pandas.core.frame.DataFrame
X.describe()  # after scaling: mean ~0, std ~1 per column

Output:

0             1             2             3             4
count 1.600000e+03 1.600000e+03 1.600000e+03 1.600000e+03 1.600000e+03
mean -2.331468e-17 -1.776357e-17 -1.776357e-17 2.331468e-17 8.881784e-18
std 1.000313e+00 1.000313e+00 1.000313e+00 1.000313e+00 1.000313e+00
min -3.394334e+00 -3.218189e+00 -3.073464e+00 -3.154539e+00 -2.927091e+00
25% -6.532217e-01 -6.631960e-01 -6.544315e-01 -6.560276e-01 -6.417809e-01
50% -4.487509e-03 -1.582564e-02 3.151454e-03 1.244233e-02 -2.609701e-02
75% 6.800261e-01 6.589081e-01 6.758504e-01 6.772709e-01 6.522049e-01
max 3.292885e+00 3.393682e+00 3.223719e+00 2.977582e+00 3.383015e+00
Y.describe()  # target left unscaled; spans roughly -380..338

Output:

count    1600.000000
mean 0.318835
std 110.741562
min -379.829794
25% -71.897040
50% -0.610665
75% 71.226603
max 337.643014
Name: target, dtype: float64
from sklearn.model_selection import train_test_split
# Hold out 25% for validation; fixed random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25,random_state=101)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

Output:

((1200, 5), (400, 5), (1200,), (400,))
from sklearn.linear_model import LinearRegression
# Ordinary least squares baseline; defaults fit an intercept.
model = LinearRegression()
model.fit(X_train, Y_train)

Output:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
model.coef_  # one learned weight per scaled feature

Output:

array([29.85207419, 94.83165412,  8.05996353, 45.23534964,  2.34253763])
model.intercept_  # bias term

Output:

0.6828984772412013
Y_pred = model.predict(X_test)
model.score(X_test,Y_test)  # R^2 on the held-out split

Output:

0.9682476621478511
from sklearn.metrics import r2_score
r2_score(Y_test,Y_pred)  # same value as model.score above, by definition

Output:

0.9682476621478511
%matplotlib inline
import matplotlib.pyplot as plt 
# Predictions vs. ground truth: points hugging a straight line mean a good fit.
plt.scatter(Y_test,Y_pred) # approximately linear

Output

Output:

<matplotlib.collections.PathCollection at 0x1286ca30>
# output for test file
# Wrap the scaled ndarray back into a DataFrame for inspection/prediction.
test = pd.DataFrame(test)
test.head()

Output:

0         1         2         3         4
0 1.014156 2.059621 -0.219462 -2.336264 -1.014474
1 -0.380266 0.960186 0.663580 0.734423 -0.375628
2 -1.031635 0.969787 -0.359367 -1.788688 0.453005
3 -2.501395 0.560382 0.638368 -0.555246 -1.355645
4 -0.389949 -0.789089 1.387127 -0.497137 0.408771
test.describe()  # roughly standardized, as expected from the train-fit scaler

Output:

0           1           2           3           4
count 400.000000 400.000000 400.000000 400.000000 400.000000
mean -0.025996 0.014487 0.033092 0.007419 0.019432
std 0.982084 1.000121 1.030062 1.017343 0.958216
min -2.576064 -2.902958 -2.692526 -2.949381 -2.560900
25% -0.791048 -0.600358 -0.727477 -0.676770 -0.614510
50% -0.008899 -0.048541 -0.064106 0.001629 0.036103
75% 0.679685 0.749801 0.789482 0.687178 0.587112
max 3.250423 2.564306 2.920374 2.659714 2.921637
# Predict air quality for the unlabeled test set.
output = model.predict(test)
output.shape

Output:

(400,)
type(output)  # ndarray returned by model.predict

Output:

numpy.ndarray
output.dtype  # float predictions

Output:

dtype('float64')
# Submission frame: the default RangeIndex already yields Ids 0..n-1,
# so hand-building the index with a list comprehension is unnecessary.
df = pd.DataFrame({'target': output})
df.head()

Output:

target
0 116.447831
1 118.077583
2 -20.894286
3 -43.994147
4 -96.138932
# Name the index `Id` so to_csv emits the Id,target layout the challenge expects.
df.index.name = 'Id'
df.head()

Output:

target
Id
0 116.447831
1 118.077583
2 -20.894286
3 -43.994147
4 -96.138932
# Write the submission file (the named index becomes the `Id` column).
df.to_csv('Datasets/Assignment2_output.csv') # 33 rank with score of 0.96802

Using code from Scratch

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# theta is np.array([theta0, theta1, ..., thetaN]): intercept first, then one
# weight per feature.


def hypothesis(X, theta):
    """Return h(Xi) for each sample row of X as a 1-D prediction array.

    X : (m, n) ndarray of features; theta : (n + 1,) parameter vector.
    """
    return np.sum(X * theta[1:], axis=1) + theta[0]


def error(X, Y, theta):
    """Half the sum of squared residuals (the OLS cost) for parameters theta."""
    hx = hypothesis(X, theta)
    return 0.5 * np.sum((hx - Y) ** 2)


def gradient(X, Y, theta):
    """Gradient of error() w.r.t. theta; n-featured data has n + 1 parameters."""
    grad = np.zeros(X.shape[1] + 1)
    residual = hypothesis(X, theta) - Y
    # d(cost)/d(theta0) = sum(hx - y); d(cost)/d(theta[i]) = sum((hx - y) * x[i])
    grad[0] = np.sum(residual)
    grad[1:] = residual @ X  # vectorized form of the per-column sums
    return grad


def gradientDescent_multivariate(X, Y, learning_rate=0.0001, tol=0.00001,
                                 max_iters=100000):
    """Batch gradient descent for multivariate linear regression.

    Starts from theta = 0 and iterates until the absolute change in cost
    drops below `tol`. `max_iters` caps the loop so a too-large learning
    rate (diverging cost) cannot spin forever, which the original
    `while True` loop would. Returns (theta, error_list).
    """
    theta = np.zeros(X.shape[1] + 1)
    error_list = [error(X, Y, theta)]

    for _ in range(max_iters):
        theta = theta - learning_rate * gradient(X, Y, theta)
        err = error(X, Y, theta)
        error_change = abs(err - error_list[-1])
        error_list.append(err)
        if error_change < tol:
            break

    return theta, error_list
# NOTE: the original unpacked the cost history into `error`, clobbering the
# error() function defined above; use a distinct name for the history.
theta, error_history = gradientDescent_multivariate(X_train.values, Y_train.values)
theta

Output:

array([ 0.68287854, 29.85199475, 94.83151151,  8.05984701, 45.23524455,
2.34249223])
# prediction
# prediction
test.shape  # 400 unlabeled samples, 5 scaled features

Output:

(400, 5)
# Manual h(x) = theta0 + sum(theta_i * feature_i) over the 5 columns;
# equivalent to hypothesis(test.values, theta) from the scratch code above.
hypo = theta[0] + theta[1]*test[0] + theta[2]*test[1] + theta[3]*test[2] + theta[4]*test[3] + theta[5]*test[4]
hypo.shape

Output:

(400,)
output = hypo.values  # Series -> ndarray, matching the sklearn path earlier
type(output)

Output:

numpy.ndarray
output.dtype  # float predictions

Output:

dtype('float64')
# Submission frame: the default RangeIndex already yields Ids 0..n-1,
# so hand-building the index with a list comprehension is unnecessary.
df = pd.DataFrame({'target': output})
df.head()

Output:

target
0 116.447754
1 118.077318
2 -20.894153
3 -43.994003
4 -96.138936
# Name the index `Id` so to_csv emits the Id,target layout the challenge expects.
df.index.name = 'Id'
df.head()

Output:

target
Id
0 116.447754
1 118.077318
2 -20.894153
3 -43.994003
4 -96.138936
# Scratch-model submission; scores essentially the same as the sklearn model.
df.to_csv('Datasets/Assignment2_output_scratch.csv') # 33 rank with score of 0.96802

Thank you!