DAT490 - Assignment 2
Question 1
Imports and reading the data frame
# Several imports which are used throughout the assignment
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from sklearn.datasets import load_iris
import seaborn as sns #seaborn is a package for nice-looking graphics
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
# Load the housing data set; the file is expected in the current working directory
HOUSES_CSV = "houses_data.csv"
df1 = pd.read_csv(HOUSES_CSV)
i)
# Regression inputs: 'Area' as a single-feature column matrix (scikit-learn expects 2-D input), 'Price' as a flat target vector
x = df1[['Area']].to_numpy()
y = df1['Price'].to_numpy()
# Fit an ordinary least-squares line Price ~ Area
model = LinearRegression()
model.fit(x, y)
# Evaluate the fitted line on 1000 evenly spaced areas between 0 and 300 so it can be drawn
xfit = np.linspace(0, 300, num=1000)
yfit = model.predict(xfit.reshape(-1, 1))
# Scatter plot of the observations with the fitted regression line drawn on top
plt.scatter(x, y)
plt.plot(xfit, yfit)
plt.show()
# Report the fitted slope (coef_) followed by the intercept
print(model.coef_)
print(model.intercept_)
ii)
# Predicted price for three living areas (in square meters), one prediction printed per area
for area in (100, 150, 200):
    print(model.predict([[area]]))
iii)
# Residual plot for Price against Area: seaborn regresses y on x itself and scatters the residuals
sns.residplot(data=df1, x="Area", y="Price")
plt.show()
iv)
Question 2
# Load the iris data set bundled with scikit-learn
data = load_iris()
# Hold out 25% of the samples as a test set; random_state=0 makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=0,
)
# One-vs-rest logistic regression using the liblinear solver
# NOTE(review): the `multi_class` argument is deprecated in recent scikit-learn releases - confirm against the installed version
logisticRegr = LogisticRegression(multi_class='ovr', solver='liblinear')
# Train on the training split only
logisticRegr.fit(x_train, y_train)
# Predict the classes of the held-out test samples
predictions = logisticRegr.predict(x_test)
# Mean accuracy on the test split, i.e. correct predictions / number of test samples
score = logisticRegr.score(x_test, y_test)
# Build the confusion matrix (rows: actual class, columns: predicted class) and draw it as a heatmap
cm = metrics.confusion_matrix(y_test, predictions)
class_names = list(data.target_names)
plt.figure(figsize=(9, 9))
ax = sns.heatmap(cm, annot=True, linewidths=.5, square=True, cmap='Reds_r')
plt.ylabel('Actual label')
plt.xlabel('\nPredicted label')
plt.title('Accuracy Score: {}'.format(score), size=15)
# Label both axes with the iris species names instead of the numeric class ids
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)
# Save before show() so the written file contains the finished figure
plt.savefig('iris_cm.png')
plt.show()
Question 3
# K-nearest-neighbours classifiers for several values of K, each with uniform and
# distance-based neighbour weighting. The fit / predict / report steps are identical
# for every configuration, so they live in one helper instead of being repeated.
def _fit_and_report_knn(k, weights):
    """Train a KNN classifier on the Question 2 training split and report its test accuracy.

    Reads the module-level x_train, y_train, x_test and y_test created in Question 2.

    Parameters:
        k: number of neighbours (n_neighbors).
        weights: 'uniform' or 'distance'; also used as the label in the printed line.

    Returns:
        A (fitted classifier, predictions for x_test) tuple.
    """
    classifier = KNeighborsClassifier(n_neighbors=k, weights=weights)
    classifier.fit(x_train, y_train)
    test_predictions = classifier.predict(x_test)
    print("Accuracy for K = {}, {}-based: ".format(k, weights),
          metrics.accuracy_score(y_test, test_predictions))
    return classifier, test_predictions


# 'uniform' is KNeighborsClassifier's default weighting, so passing it explicitly
# builds the same model as omitting the argument.
# K = 1
knn1_u, y_pred1_u = _fit_and_report_knn(1, 'uniform')
knn1_d, y_pred1_d = _fit_and_report_knn(1, 'distance')
# K = 3
knn3_u, y_pred3_u = _fit_and_report_knn(3, 'uniform')
knn3_d, y_pred3_d = _fit_and_report_knn(3, 'distance')
# K = 21
knn21_u, y_pred21_u = _fit_and_report_knn(21, 'uniform')
knn21_d, y_pred21_d = _fit_and_report_knn(21, 'distance')
# K = 35
knn35_u, y_pred35_u = _fit_and_report_knn(35, 'uniform')
knn35_d, y_pred35_d = _fit_and_report_knn(35, 'distance')
# K = 50
knn50_u, y_pred50_u = _fit_and_report_knn(50, 'uniform')
knn50_d, y_pred50_d = _fit_and_report_knn(50, 'distance')
# K = 100
knn100_u, y_pred100_u = _fit_and_report_knn(100, 'uniform')
knn100_d, y_pred100_d = _fit_and_report_knn(100, 'distance')
Question 4
# Confusion matrices for the uniform-weighted KNN classifiers from Question 3
# (rows: actual class, columns: predicted class).
# The K = 1 matrix was also presented as representative of the distance-based models,
# on the grounds that they all reported the same accuracy score.
cmknn1 = metrics.confusion_matrix(y_test, y_pred1_u)
cmknn3 = metrics.confusion_matrix(y_test, y_pred3_u)
cmknn21 = metrics.confusion_matrix(y_test, y_pred21_u)
cmknn35 = metrics.confusion_matrix(y_test, y_pred35_u)
cmknn50 = metrics.confusion_matrix(y_test, y_pred50_u)
cmknn100 = metrics.confusion_matrix(y_test, y_pred100_u)

# Axis tick labels: the iris species names instead of the numeric class ids
knn_class_names = list(data.target_names)

# Draw one heatmap per K; the plotting steps are identical apart from the matrix and the title
for knn_k, knn_cm in (
    (1, cmknn1),
    (3, cmknn3),
    (21, cmknn21),
    (35, cmknn35),
    (50, cmknn50),
    (100, cmknn100),
):
    plt.figure(figsize=(9, 9))
    ax = sns.heatmap(knn_cm, annot=True, linewidths=.5, square=True, cmap='Reds_r')
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    plt.title('K = {}'.format(knn_k), size=15)
    ax.xaxis.set_ticklabels(knn_class_names)
    ax.yaxis.set_ticklabels(knn_class_names)
    # Write each figure to its own file: saving every plot as 'iris_cm.png' made each
    # figure overwrite the previous one (and the logistic-regression figure from Question 2)
    plt.savefig('iris_cm_knn_k{}.png'.format(knn_k))
    plt.show()