Machine Learning Assignment 02
Question 1
Question 2
Q2.1
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
Cleaning data
# Load the raw data set from disk.
df = pd.read_csv('data.csv')
df.head(10)
# Raise the display limit so that a frame of this length is shown in full.
pd.set_option('display.max_rows', len(df) + 1)
# Print the dtype of every column. A column that is purely integer-typed is
# assumed to contain no erroneous entries.
print(df.dtypes)
Users object
Addthis int64
Bebo int64
Blogcatalog int64
Blogger int64
Buddymedia int64
Cnet int64
Conduit int64
Customerlobby int64
Delicious int64
Digg int64
Diigo int64
Docs int64
Docstoc int64
Download int64
Dropbox int64
Drupal int64
Epinions int64
Evernote int64
Facebook int64
Faves int64
Feedburner int64
Flickr int64
Foursquare int64
Friendfeed int64
Hootsuite int64
Joomla int64
Jumptags int64
Kaboodle int64
Kickapps int64
Linkedin int64
Lithium int64
Livejournal int64
Mashable int64
Meetup int64
Metacafe int64
Mixx int64
Mouthshut int64
Multiply int64
Mybloglog int64
Myspace int64
Netvibes int64
Newsvine int64
Ning int64
Orkut int64
Photobucket int64
Ping int64
Pinterest int64
Plaxo int64
Plurk int64
Posterous int64
Propeller int64
Radian6 int64
Reddit int64
Screencast int64
Scribd int64
Sharethis int64
Slashdot int64
Sliderocket int64
Slideshare int64
Squidoo int64
Startaid int64
Stumbleupon int64
Sysomos int64
Technorati int64
Thisnext int64
Tumblr int64
Tweetdeck int64
Twine int64
Twitter int64
Typepad int64
Ubertwitter int64
Viadeo int64
Vimeo int64
Vocus int64
Wetpaint int64
Wordpress int64
Xanga int64
Yelp int64
Yfrog int64
Youtube int64
Yuku int64
Click int64
dtype: object
# Give the unnamed first column a descriptive name.
df = df.rename(columns={"Unnamed: 0": "Users"})
df.groupby(by='Diigo').count()
# Replace the textual error marker in 'Diigo' with 0, then cast the column
# from object to int (a single cast is sufficient).
diigo_clean = df['Diigo'].replace(['Error: value not found'], 0)
df['Diigo'] = diigo_clean.astype(int)
df.groupby(by='Diigo').count()
df.describe()
# One value in column 'Newsvine' is very high, therefore every value above
# the threshold is reset to 0.
newsvine_outlier_limit = 1000
big_ind = df[df['Newsvine'] > newsvine_outlier_limit].index
df.loc[big_ind, 'Newsvine'] = 0
df.loc[big_ind]
Support Vector Machine classifier
# Split the frame into the label ('Click') and the feature columns. 'Users' is
# an identifier, not a feature, so it is dropped together with the label.
# (The former `df.set_index('Users')` call was removed: its result was never
# assigned, so it had no effect on `df`.)
target = df['Click']
data = df.drop(columns=['Click', 'Users'])
Xtrain, Xtest, ytrain, ytest = train_test_split(
    data, target, test_size=0.2, random_state=42, shuffle=True
)
# Grid-search the SVM kernel and the regularisation parameter C.
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svc = SVC()
clf = GridSearchCV(svc, parameters)
clf.fit(Xtrain, ytrain)
clf.cv_results_
clf.best_estimator_
# In the original run the best parameters for the SVM were C=10 with a linear
# kernel; the refitted best estimator is used for evaluation.
model = clf.best_estimator_
yfit = model.predict(Xtest)
print('SVM:\n', classification_report(ytest, yfit))
SVM:
precision recall f1-score support
0 0.98 0.99 0.99 1761
1 0.90 0.88 0.89 239
accuracy 0.97 2000
macro avg 0.94 0.93 0.94 2000
weighted avg 0.97 0.97 0.97 2000
Random Forest model
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
# Fit a 1000-tree random forest on the same training split as the SVM.
# random_state is pinned (same seed as the train/test split) so that the
# reported metrics are reproducible from run to run.
rf_model = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_model.fit(Xtrain, ytrain)
ypred = rf_model.predict(Xtest)
# Print both reports so the two models can be compared directly.
print('Random Forest: \n', classification_report(ytest, ypred))
print('\n\nSVM:\n', classification_report(ytest, yfit))
Random Forest:
precision recall f1-score support
0 0.88 1.00 0.94 1761
1 1.00 0.03 0.05 239
accuracy 0.88 2000
macro avg 0.94 0.51 0.49 2000
weighted avg 0.90 0.88 0.83 2000
SVM:
precision recall f1-score support
0 0.98 0.99 0.99 1761
1 0.90 0.88 0.89 239
accuracy 0.97 2000
macro avg 0.94 0.93 0.94 2000
weighted avg 0.97 0.97 0.97 2000
# Draw one confusion matrix per fitted model on the held-out test split and
# title each figure right after it is created.
# NOTE(review): plot_confusion_matrix was removed from recent scikit-learn
# releases (ConfusionMatrixDisplay.from_estimator is the replacement) -
# confirm against the installed version.
for fitted_model, figure_title in (
    (model, 'SVM model confusion matrix'),
    (rf_model, 'Random Forest Confusion matrix'),
):
    plot_confusion_matrix(fitted_model, Xtest, ytest)
    plt.title(figure_title)
plt.show()
Q2.2 - Feature Selection
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np
# Feature matrix (all columns except the identifier and the label) and label.
df_data = df.drop(columns=['Users', 'Click'])
df_target = df['Click']
# Fit the chi-squared selector once and reuse it for the reduced matrix
# instead of fitting a second, identical selector.
X_new = SelectKBest(chi2, k=5).fit(df_data, df_target)
X_5b = X_new.transform(df_data)
# Score each feature as -log10(p-value). A p-value of exactly 0 yields +inf;
# that is expected here, so the divide-by-zero warning is silenced explicitly.
with np.errstate(divide='ignore'):
    X_scores = -np.log10(X_new.pvalues_)
# Positional index of every feature column, aligned with X_scores.
X_indices = np.arange(df_data.shape[-1])
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:8: RuntimeWarning: divide by zero encountered in log10
# Shape of the matrix reduced to the five best features.
X_5b.shape
# Make sure the index and score arrays have the same shape.
for checked_array in (X_scores, X_indices):
    print(checked_array.shape)
(81,)
(81,)
from operator import itemgetter
# Pair every feature's column index with its score: [[index, score], ...].
lst = [[feature_idx, score] for feature_idx, score in zip(X_indices, X_scores)]
# Full ranking by score, highest first (non-finite scores included).
sorted_list = sorted(lst, key=itemgetter(1), reverse=True)
# Some scores are infinite or NaN. Rank only the finite ones rather than
# assuming exactly three non-finite entries sit at the front of the list.
# NOTE(review): an infinite score comes from a p-value of exactly 0 - confirm
# that excluding those features from the top five is intended.
finite_ranked = sorted(
    (pair for pair in lst if np.isfinite(pair[1])),
    key=itemgetter(1),
    reverse=True,
)
# Top five finite entries as [index, score]; the outer list wrapper is kept so
# the structure of `top_five` is unchanged.
top_five = [finite_ranked[:5]]
print(top_five)
[[[31, 278.3462330867876], [69, 229.00854417311433], [5, 198.78161183128535], [77, 144.47215081094944], [2, 98.43412231046212]]]
# Bar chart of every feature's score (X_scores) against its column index
# (X_indices). A label is set on the bars, but no legend is requested here.
plt.bar(X_indices, X_scores, label='Feature selection')
plt.title('Plot of feature selection')
# Called without arguments, so no tick positions or labels are passed in.
plt.yticks()
plt.axis('tight')
plt.show()
Q2.3 - Recommendations
Based on the list `top_five`, the two best marketing channels to recommend are the columns at indices 31 and 69, which correspond to:
# Take the two best-ranked column indices from top_five (a one-element list
# wrapping the [index, score] pairs) instead of hard-coding them.
best_idx1, best_idx2 = (int(pair[0]) for pair in top_five[0][:2])
colname1 = df_data.columns[best_idx1]
colname2 = df_data.columns[best_idx2]
# Last entry of the full ranking.
col_l = sorted_list[-1]
print(f'Col{best_idx1}:', colname1, f'\nCol{best_idx2}:', colname2)
Col31: Livejournal
Col69: Typepad
Q2.4 - Cost per impression
# Column-wise totals: df_data.sum() adds up each column, i.e. the total number
# of impressions recorded for each column (not for each customer).
x_impsum = df_data.sum()
# Convert the resulting Series to a NumPy array.
impsum_lst = x_impsum.to_list()
impsum_arr = np.array(impsum_lst)
# NumPy array of the [index, score] pairs.
info_arr = np.array(lst)
# Impression score: feature score divided by the column's impression total.
imp_score = info_arr[:, 1] / impsum_arr
# Multiply the impression score by the value weight in DKK and pair it with
# the column index it belongs to.
dkk_weight = 0.1
imp_score_dkk = list(zip(info_arr[:, 0], imp_score * dkk_weight))
# zip() yields tuples; convert each pair to a list.
lst2 = [list(pair) for pair in imp_score_dkk]
# Full ranking by DKK value per impression, highest first.
sorted_list_dkk = sorted(lst2, key=itemgetter(1), reverse=True)
# Rank only the finite values rather than assuming exactly three non-finite
# entries sit at the front of the sorted list.
finite_dkk = sorted(
    (pair for pair in lst2 if np.isfinite(pair[1])),
    key=itemgetter(1),
    reverse=True,
)
top3_valdkk = finite_dkk[:3]
# Showcase the top 3 indices with their respective values.
top3_valdkk
# Look up the column names from the ranked indices (stored as floats in
# info_arr, hence the int() conversion) instead of hard-coding them.
top3_idx = [int(pair[0]) for pair in top3_valdkk]
col1, col2, col3 = (df_data.columns[column_idx] for column_idx in top3_idx)
print(f'Col{top3_idx[0]}:', col1, top3_valdkk[0][1],
      f'\nCol{top3_idx[1]}:', col2, top3_valdkk[1][1],
      f'\nCol{top3_idx[2]}:', col3, top3_valdkk[2][1])
Col7: Customerlobby 0.189294838854117
Col11: Docs 0.00431669133607607
Col64: Thisnext 0.003547698536362109
# How did these three columns fare in the earlier feature-score ranking?
# Collect "<index>, <column name>" together with the position of each of the
# top-3 value-per-impression columns inside sorted_list.
tracked_indices = {int(pair[0]) for pair in top3_valdkk}
pos = [
    [str(entry[0]) + ', ' + str(df_data.columns[entry[0]]), rank]
    for rank, entry in enumerate(sorted_list)
    if int(entry[0]) in tracked_indices
]
print(pos)
[['64, Thisnext', 12], ['7, Customerlobby', 60], ['11, Docs', 74]]
# Show how often each distinct value occurs in the three columns of interest.
thisnext_counts = df_data['Thisnext'].value_counts()
customerlobby_counts = df_data['Customerlobby'].value_counts()
docs_counts = df_data['Docs'].value_counts()
print(
    'Thisnext value_counts:\n', thisnext_counts, '\n',
    '\nCustomerlobby value_counts:\n', customerlobby_counts, '\n',
    '\nDocs value_counts:\n', docs_counts,
)
Thisnext value_counts:
0 8170
1 1642
2 187
3 1
Name: Thisnext, dtype: int64
Customerlobby value_counts:
0 9998
1 2
Name: Customerlobby, dtype: int64
Docs value_counts:
0 9967
1 33
Name: Docs, dtype: int64
Question 3
Q3.1 - Compute forward pass error
#Creating a sigmoid function to work as an activation function
import numpy as np
def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)) of *x*.

    Accepts a real scalar or array-like. A scalar input gives a NumPy scalar,
    an array-like input gives an array of the same shape.

    The exponential is always taken of a non-positive number, so large
    negative inputs do not overflow; for x >= 0 the expression is exactly
    1 / (1 + exp(-x)).
    """
    values = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(values))  # lies in (0, 1], cannot overflow
    result = np.where(values >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with an empty tuple unwraps a 0-d array into a scalar and
    # leaves arrays of higher rank untouched.
    return result[()]
# Inputs, weights, biases and target outputs of the small network.
x1, x2 = 0.5, 0.2
# w1..w4 weight the inputs in the hidden net sums below.
w1, w2, w3, w4 = 2.0, 0.1, 0.15, 1.5
# b1 is added to both hidden net sums below.
b1, b2 = 0.5, 0.6
w5, w6, w7, w8 = 0.2, 0.25, 0.35, 0.15
# Target (actual) outputs.
y1, y2 = 0.01, 0.99
# Net input of each hidden neuron: weighted sum of the inputs plus the bias.
net_h1 = x1 * w1 + x2 * w2 + b1
net_h2 = x1 * w3 + x2 * w4 + b1
print(net_h1, net_h2)
1.52 0.875
# Squash each hidden net input with the sigmoid activation function.
squashed_h1, squashed_h2 = (sigmoid(net_input) for net_input in (net_h1, net_h2))
print(squashed_h1, squashed_h2)
0.8205384805926733 0.7057850278370112
# Net input of each output neuron: weighted sum of the hidden activations
# plus the bias b2.
out1 = squashed_h1 * w5 + squashed_h2 * w6 + b2
out2 = squashed_h1 * w7 + squashed_h2 * w8 + b2
# Apply the activation function to the output layer.
sq_out1, sq_out2 = sigmoid(out1), sigmoid(out2)
# Error of each output neuron: half the squared difference to the target.
err1 = 0.5 * (y1 - sq_out1) ** 2
err2 = 0.5 * (y2 - sq_out2) ** 2
print('error 1:', err1, 'error2:', err2)
error 1: 0.25149050407474033 error2: 0.03388034591637203
Q3.2 - Backpropagation for output weights
# Each output-layer weight is updated with the reduced derivative:
#   updated_w = w - lr * (-(y - sq_out) * sq_out * (1 - sq_out)) * squashed_h
# where y / sq_out belong to the output neuron the weight feeds into and
# squashed_h is the hidden activation the weight is applied to.
# Learning rate of 0.5. It determines how drastic the weight changes are;
# 0.5 is quite high.
lr = 0.5
# Error term of each output neuron (the part of the derivative that does not
# depend on the hidden activation).
delta_out1 = -(y1 - sq_out1) * sq_out1 * (1 - sq_out1)
delta_out2 = -(y2 - sq_out2) * sq_out2 * (1 - sq_out2)
# Gradient step for every output-layer weight.
u_w5 = w5 - lr * (delta_out1 * squashed_h1)
u_w6 = w6 - lr * (delta_out1 * squashed_h2)
u_w7 = w7 - lr * (delta_out2 * squashed_h1)
u_w8 = w8 - lr * (delta_out2 * squashed_h2)
# Report every weight before and after the update.
for weight_name, old_weight, new_weight in (
    ('W5', w5, u_w5),
    ('W6', w6, u_w6),
    ('W7', w7, u_w7),
    ('W8', w8, u_w8),
):
    print(f'Old {weight_name}:', old_weight, f', updated {weight_name}:', new_weight)
Old W5: 0.2 , updated W5: 0.14124015343718643
Old W6: 0.25 , updated W6: 0.1994578000631014
Old W7: 0.35 , updated W7: 0.3710647934971505
Old W8: 0.15 , updated W8: 0.1681188527002764