%pylab inline
import pandas as pd
import numpy as np
Populating the interactive namespace from numpy and matplotlib
# Load the dataset from 'adultdata.csv'. Later cells read its Age, Race, Sex,
# Income and Country columns.
data = pd.read_csv('adultdata.csv')
# Bare expression: displays the frame when this is run as a notebook cell.
data
def calc_error(y_arr, x_arr, w, train=False, *, learning_rate=None):
    """Count perceptron misclassifications, optionally updating the weights.

    Sample ``j`` is predicted as ``+1`` when ``x_arr[j].dot(w) >= 0`` and as
    ``-1`` otherwise; the prediction is then compared with ``y_arr[j]``.

    Args:
        y_arr: Sequence of target labels (compared against +1 / -1).
        x_arr: Sequence of feature vectors; ``x_arr[j]`` must support
            ``.dot(w)`` and is paired with ``y_arr[j]``.
        w: Weight vector. Modified in place when ``train`` is true.
        train: When true, apply ``w += rate * (target - prediction) * x``
            after every sample.
        learning_rate: Step size for the update. When ``None`` (the default)
            the module-level ``eta`` is used, as before.

    Returns:
        The number of misclassified samples, as a float.
    """
    err_sum = 0.0
    for j, target in enumerate(y_arr):
        features = x_arr[j]
        # Threshold the weighted sum at zero to get a +1 / -1 prediction.
        y_out = 1 if features.dot(w) >= 0.0 else -1
        # err is zero for a correct prediction, non-zero otherwise.
        err = target - y_out
        err_sum += 1 if err != 0 else 0
        if train:
            # Resolve the step size lazily so evaluation never needs ``eta``.
            rate = eta if learning_rate is None else learning_rate
            # In-place update: the caller's weight array is what gets trained.
            w += rate * err * features
    return err_sum
def train(y, x, w):
    """Run one training pass over the samples and return the error count.

    Delegates to ``calc_error`` with ``train=True``, which adjusts ``w`` in
    place while counting misclassifications.

    Args:
        y: Target labels for the samples.
        x: Feature vectors paired with ``y``.
        w: Weight vector to train.

    Returns:
        The value returned by ``calc_error`` (number of misclassified samples).
    """
    # Pass the ``x`` argument through; the previous ``xnew`` was undefined.
    return calc_error(y, x, w, train=True)
def test(y, x, w):
    """Return the misclassification count for ``(y, x)`` under weights ``w``.

    Evaluation only: ``calc_error`` is called in its non-training mode.
    """
    n_errors = calc_error(y, x, w, train=False)
    return n_errors
def _stripped(column):
    """Return the column's values with surrounding whitespace removed."""
    return np.array([value.strip() for value in column.values])


# Race: strip whitespace, then encode 'White' as 2 and everything else as 1.
data["Race"] = _stripped(data["Race"])
data
data["Race"] = np.where(data["Race"].values == 'White', 2, 1)
data

# Sex: strip whitespace, then encode 'Male' as 2 and everything else as 1.
data["Sex"] = _stripped(data["Sex"])
data
data["Sex"] = np.where(data["Sex"].values == 'Male', 2, 1)
data

# Income: only stripped here; it is compared against '>50K' later.
data["Income"] = _stripped(data["Income"])
data

# Country: strip whitespace, then encode 'United-States' as 2, others as 1.
data["Country"] = _stripped(data["Country"])
data
data["Country"] = np.where(data["Country"].values == 'United-States', 2, 1)
data
# Work with the rows labelled 0..98 of the frame.
sample_rows = range(0, 99)

# One feature vector per row: [constant 1 (multiplies w[0]),
# Age * Race * Country, Sex].
Trainingdata = np.array([
    [1, data['Age'][row] * data['Race'][row] * data['Country'][row], data['Sex'][row]]
    for row in sample_rows
])

# One target per row: +1 when Income is '>50K', otherwise -1.
Traininganswers = np.array([
    1 if data['Income'][row] == '>50K' else -1
    for row in sample_rows
])

print(Trainingdata)
print(Traininganswers)
[[ 1 200 2]
[ 1 152 2]
[ 1 106 2]
[ 1 28 1]
[ 1 148 1]
[ 1 49 1]
[ 1 208 2]
[ 1 124 1]
[ 1 168 2]
[ 1 74 2]
[ 1 30 2]
[ 1 92 1]
[ 1 64 2]
[ 1 80 2]
[ 1 34 2]
[ 1 100 2]
[ 1 128 2]
[ 1 152 2]
[ 1 172 1]
[ 1 160 2]
[ 1 108 1]
[ 1 70 2]
[ 1 172 2]
[ 1 236 1]
[ 1 224 2]
[ 1 76 2]
[ 1 54 2]
[ 1 156 2]
[ 1 196 2]
[ 1 92 2]
[ 1 40 2]
[ 1 180 2]
[ 1 120 2]
[ 1 44 2]
[ 1 96 2]
[ 1 84 2]
[ 1 76 1]
[ 1 124 2]
[ 1 192 2]
[ 1 124 2]
[ 1 212 2]
[ 1 96 2]
[ 1 196 1]
[ 1 100 2]
[ 1 114 2]
[ 1 212 2]
[ 1 176 1]
[ 1 228 2]
[ 1 160 2]
[ 1 50 1]
[ 1 72 1]
[ 1 94 1]
[ 1 200 2]
[ 1 188 2]
[ 1 172 2]
[ 1 92 2]
[ 1 70 2]
[ 1 164 2]
[ 1 120 2]
[ 1 120 2]
[ 1 128 2]
[ 1 192 2]
[ 1 168 2]
[ 1 144 2]
[ 1 112 1]
[ 1 212 1]
[ 1 196 2]
[ 1 100 2]
[ 1 76 2]
[ 1 62 1]
[ 1 116 2]
[ 1 92 2]
[ 1 224 2]
[ 1 316 2]
[ 1 54 2]
[ 1 160 2]
[ 1 268 2]
[ 1 72 1]
[ 1 124 2]
[ 1 72 2]
[ 1 104 2]
[ 1 160 2]
[ 1 160 2]
[ 1 184 1]
[ 1 236 2]
[ 1 176 1]
[ 1 212 1]
[ 1 196 2]
[ 1 132 2]
[ 1 120 2]
[ 1 172 1]
[ 1 228 2]
[ 1 148 1]
[ 1 56 1]
[ 1 136 2]
[ 1 116 2]
[ 1 192 2]
[ 1 148 2]
[ 1 96 1]]
[ 1 1 1 -1 1 -1 1 1 1 -1 -1 -1 -1 1 -1 1 1 1 1 1 -1 -1 1 -1
1 -1 1 -1 -1 -1 -1 1 -1 -1 -1 -1 -1 1 -1 -1 1 -1 1 -1 1 1 -1 1
1 -1 -1 1 1 -1 1 -1 -1 1 1 1 1 1 1 1 -1 1 1 -1 -1 -1 1 -1
1 -1 -1 1 1 -1 -1 -1 -1 1 1 1 1 1 -1 1 -1 -1 1 1 -1 -1 1 -1
1 1 -1]
# Scatter plot of the sample: combined Age*Race*Country feature on x, encoded
# Sex on y, coloured by income class.

# Size the x-axis from the data so no sample is clipped (a fixed upper limit
# of 300 hid samples whose combined feature exceeds it).
xlim(0, Trainingdata[:, 1].max() + 10)
ylim(0, 3)  # Sex is encoded as 1 or 2, so 0..3 leaves a margin on both sides

# Boolean masks over the samples. The names are kept because later cells
# reuse them; the classes are Income '>50K' (+1) versus everything else (-1).
greaterthan5k = (Traininganswers == 1)
lessthanorequal5k = invert(greaterthan5k)

title("Income data")
xlabel("Age x Race x Country")
ylabel("Gender (M(2), F(1))")
plot(Trainingdata[greaterthan5k, 1], Trainingdata[greaterthan5k, 2],
     'b.', label="Earns more than 50K")
plot(Trainingdata[lessthanorequal5k, 1], Trainingdata[lessthanorequal5k, 2],
     'r.', label="Earns 50K or less")
legend()
grid()
def calc_error(y_arr, x_arr, w, train=False, *, learning_rate=None):
    """Count perceptron misclassifications, optionally updating the weights.

    Sample ``j`` is predicted as ``+1`` when ``x_arr[j].dot(w) >= 0`` and as
    ``-1`` otherwise; the prediction is then compared with ``y_arr[j]``.

    Args:
        y_arr: Sequence of target labels (compared against +1 / -1).
        x_arr: Sequence of feature vectors; ``x_arr[j]`` must support
            ``.dot(w)`` and is paired with ``y_arr[j]``.
        w: Weight vector. Modified in place when ``train`` is true.
        train: When true, apply ``w += rate * (target - prediction) * x``
            after every sample.
        learning_rate: Step size for the update. When ``None`` (the default)
            the module-level ``eta`` is used, as before.

    Returns:
        The number of misclassified samples, as a float.
    """
    err_sum = 0.0
    for j, target in enumerate(y_arr):
        features = x_arr[j]
        # Threshold the weighted sum at zero to get a +1 / -1 prediction.
        y_out = 1 if features.dot(w) >= 0.0 else -1
        # err is zero for a correct prediction, non-zero otherwise.
        err = target - y_out
        err_sum += 1 if err != 0 else 0
        if train:
            # Resolve the step size lazily so evaluation never needs ``eta``.
            rate = eta if learning_rate is None else learning_rate
            # In-place update: the caller's weight array is what gets trained.
            w += rate * err * features
    return err_sum
def train(y, x, w):
    """Run one training pass over ``(y, x)`` and return the error count.

    ``calc_error`` is invoked in training mode, so it adjusts ``w`` in place
    while counting the misclassified samples.
    """
    n_errors = calc_error(y, x, w, train=True)
    return n_errors
def test(y, x, w):
    """Return the misclassification count for ``(y, x)`` under weights ``w``.

    Evaluation only: ``calc_error`` is called in its non-training mode.
    """
    n_errors = calc_error(y, x, w, train=False)
    return n_errors
# Step size used by calc_error when it nudges the weights.
eta = 0.1
# Start from small random weights, one per feature column.
w = 0.05 * rand(Trainingdata.shape[1])

err_test = []
err_train = []
train_iters = 100
print(Traininganswers[:20])

# The first 10 samples are used for training, the remainder for testing.
train_y, train_x = Traininganswers[:10], Trainingdata[:10]
test_y, test_x = Traininganswers[10:], Trainingdata[10:]

for epoch in range(train_iters):
    # Train first (updates w in place), then score the held-out samples.
    err_train.append(train(train_y, train_x, w))
    err_test.append(test(test_y, test_x, w))

# Error count per iteration for both splits.
iterations = range(train_iters)
title("Training and testing error.")
plot(iterations, err_train, 'b', label="Train")
plot(iterations, err_test, 'r', label="Test")
xlabel("Iteration")
ylabel("Error")
legend()
grid()
[ 1 1 1 -1 1 -1 1 1 1 -1 -1 -1 -1 1 -1 1 1 1 1 1]
# Decision regions of the trained weights with the samples drawn on top.

# 100 x 100 grid spanning both plotted features, padded by 1 on each side.
xx, yy = meshgrid(
    linspace(Trainingdata[:, 1].min() - 1, Trainingdata[:, 1].max() + 1, 100),
    linspace(Trainingdata[:, 2].min() - 1, Trainingdata[:, 2].max() + 1, 100),
)
# Weighted sum over the grid; w[0] multiplies the constant-1 feature.
z = w[0] + xx * w[1] + yy * w[2]
# The zero level is the decision boundary. extend='both' also fills the areas
# where z lies outside [-10, 10]; without it those areas are left blank.
contourf(xx, yy, z, levels=[-10, 0, +10], extend='both')

title("Income data")
xlabel("Age x Race x Country")
ylabel("Gender (M(2), F(1))")
# The mask names are defined in the earlier scatter-plot cell; the classes are
# Income '>50K' versus everything else.
plot(Trainingdata[greaterthan5k, 1], Trainingdata[greaterthan5k, 2],
     'b.', label="Earns more than 50K")
plot(Trainingdata[lessthanorequal5k, 1], Trainingdata[lessthanorequal5k, 2],
     'r.', label="Earns 50K or less")
legend()
grid()