# DAT405 - Assignment 2

%matplotlib inline import matplotlib.pyplot as plt import pandas as pd from sklearn.linear_model import LinearRegression from sklearn.model_selection import train_test_split import numpy as np import matplotlib.ticker as mtick

# Load the housing data set from the CSV file.
df_org = pd.read_csv("/work/data/data_assignment2.csv")

# Feature column (living area) and target column (selling price).
train_x = df_org['Living_area']
train_y = df_org['Selling_price']

# Y-axis tick formatter: whole numbers with thousands separators and a
# ' kr' suffix.
fmt1 = '{x:,.0f} kr'
tick1 = mtick.StrMethodFormatter(fmt1)

# Scatter plot of selling price against living area.
fig, axs = plt.subplots(figsize=(10, 6))
axs.scatter(train_x, train_y)
axs.set_title('Living area VS Housing prices')
axs.set_xlabel('Living area ($m^2$)')
axs.yaxis.set_major_formatter(tick1)
fig.show()

# LinearRegression expects 2-D inputs, so turn each column into an
# (n_samples, 1) array.
train_x = train_x.to_numpy().reshape(-1, 1)
train_y = train_y.to_numpy().reshape(-1, 1)

# Fit an ordinary least-squares line: Selling_price ~ Living_area.
reg = LinearRegression()
reg.fit(train_x, train_y)

# With a 2-D target, coef_ has shape (1, 1) and intercept_ has shape (1,).
slope = reg.coef_[0][0]
intercept = reg.intercept_[0]

# Report the parameters of the fitted line.
print(f"Slope of the fitted line is: {slope!s}")
print(f"Interception of the fitted line is: {intercept!s}")

# Model predictions for every training sample.
predicts_y = reg.predict(train_x)

# Two side-by-side panels: fitted line over the data (left) and the
# predictions for selected living areas (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 7))

# Draw the regression line on an evenly spaced grid of living areas; this
# gives a cleaner line than plotting predicts_y against the unsorted
# training points.
x = np.arange(60, 215)
y = reg.coef_[0][0] * x + reg.intercept_[0]
for ax in axs:
    ax.plot(x, y, color='r', zorder=1)

axs[0].scatter(train_x, train_y)
axs[0].set_title('Fitted line on Living Area VS Housing Prices\nusing linear regression')

# Living areas (m^2) whose selling price we want to predict.
predicts_x = [100, 150, 200]
# Build a plain (n_samples, 1) ndarray for the model. np.matrix is
# discouraged by NumPy and rejected by recent scikit-learn versions.
predicts_set = reg.predict(np.asarray(predicts_x).reshape(-1, 1))

# Mark the three predicted prices on top of the regression line.
axs[1].scatter(predicts_x, predicts_set, zorder=2, marker=(4, 0, 45), s=150)
axs[1].set_title('Predict housing prices for livings areas\nof 100, 150, 200 $m^2$')

# Shared formatting for both panels.
for ax in axs:
    ax.set_xlabel('Living area ($m^2$)')
    ax.yaxis.set_major_formatter(tick1)
    ax.set_ylim([1500000, 7000000])

plt.tight_layout()

fig, axs = plt.subplots(1, 1, figsize=(10, 6))

# Residual = observed selling price minus the price predicted by the fitted
# line. Both arrays have one row per training sample, so the subtraction is
# done element-wise in a single vectorised step.
residuals = (train_y - predicts_y).ravel()
axs.scatter(train_x, residuals)

# Formatting
axs.set_ylim([-4500000, 4500000])
# Reference line at zero residual, i.e. where observation and prediction agree.
axs.axhline(y=0, color='k', linestyle='--')
axs.yaxis.set_major_formatter(tick1)
axs.set_title('Residual plot')
axs.set_xlabel('Living area ($m^2$)')
fig.show()

from sklearn.datasets import load_iris from sklearn.linear_model import LogisticRegression import seaborn as sn import matplotlib.pyplot as plt from sklearn.metrics import confusion_matrix from sklearn.metrics import accuracy_score from sklearn.neighbors import KNeighborsClassifier

# Load the iris data set as a pandas DataFrame.
data = load_iris(as_frame=True)
iris_df = data.frame

# The first four columns are used as features, the fifth as the class label.
features = iris_df.iloc[:, :4]
labels = iris_df.iloc[:, 4]

# Split into a training set and a validation set (3:1).
# The split is random but reproducible thanks to the fixed seed 41.
# Note: the split is not stratified, so the three classes will not
# necessarily be evenly represented in each part.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=41,
)

# Fit a multinomial logistic regression (mlogit) model on the training data.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train)

# Classify the held-out test samples.
pred = clf.predict(X_test)

# Confusion matrix of true vs. predicted classes, titled with the test accuracy.
conf_matrix = confusion_matrix(y_test, pred)
mlogit_acc = round(clf.score(X_test, y_test), 4)
# NOTE(review): plot_conf_matrix is not defined in the visible part of this
# file; it must be defined before this cell runs.
plot_conf_matrix(conf_matrix, title=f'Mlogit acc:{mlogit_acc}')

# Values of k (number of neighbours) to compare.
nn = [1, 3, 5, 10, 15, 25, 100]

# One figure per k, each with two panels: uniform weights on the left,
# distance weights on the right.
figs = [plt.subplots(1, 2, figsize=(11, 4)) for _ in nn]

# NOTE(review): plot_conf_matrix is not defined in the visible part of this
# file; it must be defined before this cell runs.
for k, (fig, ax) in zip(nn, figs):
    # KNN classifier where all k neighbours get an equal vote.
    KNN_uni = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    KNN_uni.fit(X_train, y_train)
    pred = KNN_uni.predict(X_test)
    conf_matrix = confusion_matrix(y_test, pred)
    uniform_acc = round(accuracy_score(y_test, pred), 4)
    plot_conf_matrix(conf_matrix, fig=fig, axs=ax[0],
                     title=f'Uniform, n: {k} acc:{uniform_acc}')

    # KNN classifier where each neighbour's vote is weighted by 1/distance.
    KNN_dis = KNeighborsClassifier(n_neighbors=k, weights='distance')
    KNN_dis.fit(X_train, y_train)
    pred = KNN_dis.predict(X_test)
    conf_matrix = confusion_matrix(y_test, pred)
    distance_acc = round(accuracy_score(y_test, pred), 4)
    plot_conf_matrix(conf_matrix, fig=fig, axs=ax[1],
                     title=f'Distance, n: {k} acc: {distance_acc}')