Skip to main content

1. Data Generation for Supervised Learning (using the SciPy stack: NumPy, pandas, Matplotlib)

· 4 min read
Shaurya Singhal

Source: View original notebook on GitHub

Category: Machine Learning / Learn ML

1. Data Generation for Supervised Learning (using the SciPy stack: NumPy, pandas, Matplotlib)

Regression Data

import numpy as np
# Draw 300 feature values from the standard normal distribution N(0, 1).
x = np.random.randn(300)
x.shape  # -> (300,)

Output:

(300,)
def hypothesis(x):
    """Ground-truth linear target y = 6*x + 5 (vectorized over numpy arrays)."""
    slope, intercept = 6, 5
    return slope * x + intercept
# Apply the ground-truth line to every sample at once (vectorized).
y = hypothesis(x)
y.shape  # -> (300,), one target per sample

Output:

(300,)
import matplotlib.pyplot as plt
# Noise-free data: all points fall exactly on the line y = 6x + 5.
plt.scatter(x,y)

Output

Output:

<matplotlib.collections.PathCollection at 0xf94f510>
def addnoise(y, scale=2.0):
    """Return Gaussian noise matched to the length of `y`.

    Only y's leading dimension is used -- its values are ignored; the
    caller adds the returned noise to the clean targets.

    Parameters
    ----------
    y : np.ndarray
        Array whose first dimension sets the number of noise samples.
    scale : float, default 2.0
        Noise standard deviation (generalizes the original hard-coded 2,
        which doubled the unit-variance noise).

    Returns
    -------
    np.ndarray of shape (y.shape[0],) with N(0, scale**2) samples.
    """
    return scale * np.random.randn(y.shape[0])
# Noisy targets (orange points) against the noise-free line (default-color line).
plt.scatter(x,y+addnoise(y),color ='orange')
plt.plot(x,hypothesis(x))
plt.show()

Output

Here is the regression data scattered along the line y = 6x + 5.

Generating Classification data

# Generate multivariate (two-feature: x1, x2) Gaussian data for binary
# classification.
# Signature reminder: np.random.multivariate_normal(mean, cov[, size, ...])

# Class 1 (label 0): 300 samples centered at (2, 4).
mean1 = np.array([2, 4])
cov1 = np.array([[0.5, 0.4],
                 [0.4, 0.7]])
class1_arr = np.random.multivariate_normal(mean1, cov1, 300)

# Class 2 (label 1): 400 samples centered at (-1, 2).
mean2 = np.array([-1, 2])
cov2 = np.array([[1, 0.4],
                 [0.4, 1]])
class2_arr = np.random.multivariate_normal(mean2, cov2, 400)

# Stack features row-wise, then build matching labels:
# 300 zeros (class 1) followed by 400 ones (class 2).
x = np.concatenate((class1_arr, class2_arr), axis=0)
y = np.hstack((np.zeros(300), np.ones(400)))
y.shape  # -> (700,)

Output:

(700,)
# Fuse features and labels into one (700, 3) array so each row stays
# aligned with its label while shuffling; column 2 holds the class label.
data  = np.column_stack((x,y))
np.random.shuffle(data)  # in-place shuffle of the rows only
data
# here is my classification data

Output:

array([[-1.20215667,  1.81619577,  1.        ],
[-2.18227822, 1.0554847 , 1. ],
[ 0.30946201, 2.3828793 , 1. ],
...,
[-0.38716961, 2.23936831, 1. ],
[ 0.25692066, 2.31069276, 1. ],
[ 2.04968868, 4.48527163, 0. ]])
import pandas as pd
# Wrap the shuffled array in a DataFrame for readable, labeled inspection.
df = pd.DataFrame(data,columns = ['x1','x2','class'])
df.head()  # preview the first five rows

Output:

x1        x2  class
0 -1.202157 1.816196 1.0
1 -2.182278 1.055485 1.0
2 0.309462 2.382879 1.0
3 1.876904 4.091469 0.0
4 -0.772710 2.171740 1.0

Plotting classification data

Hard way:
def colorplotting(x, y):
    """Scatter-plot two-feature points colored by their binary label.

    Points with label 1 are drawn green; everything else red.
    Vectorized: two scatter calls via boolean masks instead of one
    scatter call per sample (the original looped over all 700 points,
    issuing a separate scatter call for each).

    Parameters
    ----------
    x : np.ndarray, shape (n, 2) -- feature columns x1, x2.
    y : np.ndarray, shape (n,)   -- labels; 1 -> green, otherwise red.
    """
    mask = (y == 1)
    plt.scatter(x[mask, 0], x[mask, 1], color='green')
    plt.scatter(x[~mask, 0], x[~mask, 1], color='red')
colorplotting(x,y)

Output

Shortcut:
plt.scatter(x[:,0],x[:,1], c= y )# passing y as `c` lets matplotlib choose a color per label value; works for categorical data

Output

Output:

<matplotlib.collections.PathCollection at 0x11e06b70>

2. Making data using scikit learn

(2a)Regression Data(make_regression)

from sklearn.datasets import make_regression
# try running to see the documentation 
# make_regression?
# 400 samples with a single (informative) feature; noise=25.1 is the
# standard deviation of Gaussian noise added to the targets, and
# random_state pins the RNG so the dataset is reproducible.
XR,YR = make_regression(n_samples=400, n_features = 1,n_informative=1,n_targets=1,noise=25.1,random_state=21,shuffle=False)
XR.shape  # -> (400, 1)

Output:

(400, 1)
YR.shape

Output:

(400,)
import matplotlib.pyplot as plt
# Visualize the generated regression data (feature vs. noisy target).
plt.scatter(XR,YR)
plt.show()

Output

(2b)Classification Data(make_classification)

from sklearn.datasets import make_classification # Generate a random n-class classification problem
# 3 classes with one Gaussian cluster each; both features are informative
# and none redundant (make_classification requires
# n_informative + n_redundant <= n_features).
XC,YC = make_classification(n_samples=400,n_features=2,n_classes=3,n_clusters_per_class=1,n_informative=2,
                            n_redundant=0
                            )
print(XC.shape)
print(YC.shape)

Output:

(400, 2)
(400,)
plt.scatter(XC[:,0],XC[:,1],c=YC)

Output

Output:

<matplotlib.collections.PathCollection at 0x14146870>

(2c)more Classification data(make_blobs)

from sklearn.datasets import make_blobs
# 400 two-feature points spread over 5 isotropic Gaussian blobs.
XB,YB = make_blobs(n_samples=400,n_features=2,centers=5)
XB.shape  # -> (400, 2)

Output:

(400, 2)
YB.shape

Output:

(400,)
plt.scatter(XB[:,0], XB[:,1],c =YB)

Output

Output:

<matplotlib.collections.PathCollection at 0x1410a910>

(2d) more (make_moons)

from sklearn.datasets import make_moons
# Two interleaving half-circles -- classic non-linearly-separable toy data.
XM,YM = make_moons(n_samples=50)
XM.shape  # -> (50, 2)

Output:

(50, 2)
YM.shape

Output:

(50,)
plt.scatter(XM[:,0],XM[:,1],c=YM)
plt.show()

Output

3. Subplots combining all four datasets above

import matplotlib.pyplot as plt
import numpy as np
# way 1 - object oriented way: plt.subplots() returns the figure
# ("canvas") plus a 2x2 array of Axes; draw on each Axes directly.
canvas, axes = plt.subplots(nrows=2,ncols=2,figsize=(15,10))

axes[0][0].scatter(XR,YR)
axes[0][0].set_title('Regression DATA')

axes[0][1].scatter(XC[:,0], XC[:,1],c=YC)
axes[0][1].set_title('Classification DATA')

axes[1][0].scatter(XB[:,0], XB[:,1],c=YB)
axes[1][0].set_title('Blobs DATA')

axes[1][1].scatter(XM[:,0], XM[:,1],c=YM)
axes[1][1].set_title('Moon DATA')

Output

Output:

Text(0.5, 1.0, 'Moon DATA')
# canvas # all subplots are stored in canvas
# way 2 - Functional WAY: plt.subplot(RCP) selects cell P of an R x C
# grid as the active axes; subsequent pyplot calls draw into it.
fig = plt.figure(figsize=(15,10))

plt.subplot(221)
plt.scatter(XR,YR)
plt.title('Regression DATA')

plt.subplot(222)
plt.scatter(XC[:,0], XC[:,1],c=YC)
plt.title('Classification DATA')

plt.subplot(223)
plt.scatter(XB[:,0], XB[:,1],c=YB)
plt.title('Blobs DATA')

plt.subplot(224)
plt.scatter(XM[:,0], XM[:,1],c=YM)
plt.title('Moon DATA')

Output

Output:

Text(0.5, 1.0, 'Moon DATA')
# fig
# way3 (optional): subplot2grid places each plot on a 10x10 grid cell
# with explicit row/column spans -- useful for unevenly sized panels.
fig = plt.figure(figsize=(15,15))

plt.subplot2grid((10,10),(0,0),rowspan=4,colspan=4)
plt.scatter(XR,YR)
plt.title('Regression DATA')

plt.subplot2grid((10,10),(0,5),rowspan=4,colspan=4)
plt.scatter(XC[:,0], XC[:,1],c=YC)
plt.title('Classification DATA')

plt.subplot2grid((10,10),(5,0),rowspan=4,colspan=4)
plt.scatter(XB[:,0], XB[:,1],c=YB)
plt.title('Blobs DATA')

plt.subplot2grid((10,10),(5,5),rowspan=4,colspan=4)
plt.scatter(XM[:,0], XM[:,1],c=YM)
plt.title('Moon DATA')

Output

Output:

Text(0.5, 1.0, 'Moon DATA')