%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.ticker as mtick
# Load the housing data set used by the regression part of the script.
df_org = pd.read_csv("/work/data/data_assignment2.csv")

# Feature (living area) and target (selling price) columns.
train_x = df_org['Living_area']
train_y = df_org['Selling_price']

# Tick formatter that renders prices like "1,500,000 kr"; the later
# regression and residual plots reuse the same formatter object.
fmt1 = '{x:,.0f} kr'
tick1 = mtick.StrMethodFormatter(fmt1)

# Raw scatter of living area against selling price.
fig, axs = plt.subplots(figsize=(10, 6))
axs.scatter(train_x, train_y)
axs.set_title('Living area VS Housing prices')
axs.set_xlabel('Living area ($m^2$)')
axs.yaxis.set_major_formatter(tick1)
fig.show()
# scikit-learn estimators take 2-D inputs, so turn each column into an
# (n_samples, 1) array.
train_x = train_x.values[:, np.newaxis]
train_y = train_y.values[:, np.newaxis]

# Least-squares fit of selling price on living area.
reg = LinearRegression()
reg.fit(train_x, train_y)

# Report the fitted line. Because the target is 2-D, the slope is read
# from coef_[0][0] and the intercept from intercept_[0].
print(f"Slope of the fitted line is: {reg.coef_[0][0]!s}")
print(f"Interception of the fitted line is: {reg.intercept_[0]!s}")

# Model output for every training point (the residual plot needs it).
predicts_y = reg.predict(train_x)
# Two side-by-side panels: the fitted line over the training data (left)
# and the predicted prices for a few chosen living areas (right).
fig, axs = plt.subplots(1, 2, figsize=(10, 7))

# Evaluate the fitted line on an evenly spaced grid of living areas
# instead of joining the (unsorted) training predictions, which gives a
# cleaner line.
x = np.arange(60, 215)
y = reg.coef_[0][0] * x + reg.intercept_[0]
for ax in axs:
    ax.plot(x, y, color='r', zorder=1)

# Left panel: training data underneath the fitted line.
axs[0].scatter(train_x, train_y)
axs[0].set_title('Fitted line on Living Area VS Housing Prices\nusing linear regression')

# Living areas whose selling price we want to predict.
predicts_x = [100, 150, 200]
# Pass a plain (n_samples, 1) ndarray to the model. The previous
# np.matrix input is discouraged by NumPy and is rejected by the input
# validation of recent scikit-learn releases.
predicts_set = reg.predict(np.array(predicts_x).reshape(-1, 1))

# Right panel: the three predictions, drawn above the line (zorder=2).
axs[1].scatter(predicts_x, predicts_set, zorder=2, marker=(4, 0, 45), s=150)
axs[1].set_title('Predict housing prices for livings areas\nof 100, 150, 200 $m^2$')

# Formatting shared by both panels.
for ax in axs:
    ax.set_xlabel('Living area ($m^2$)')
    ax.yaxis.set_major_formatter(tick1)
    ax.set_ylim([1500000, 7000000])

plt.tight_layout()
# Residual plot: observed selling price minus the price given by the
# fitted line, plotted against living area.
fig, axs = plt.subplots(1, 1, figsize=(10, 6))

# train_y and predicts_y are both (n_samples, 1) arrays, so the residuals
# can be computed in one vectorised subtraction.
residuals = train_y - predicts_y
axs.scatter(train_x, residuals)

# Formatting
axs.set_ylim([-4500000, 4500000])
# Reference line at zero residual, i.e. where a point would lie exactly
# on the fitted line (previously drawn at y=1.0).
axs.axhline(y=0, color='k', linestyle='--')
axs.yaxis.set_major_formatter(tick1)
axs.set_title('Residual plot')
axs.set_xlabel('Living area ($m^2$)')
fig.show()
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
# Load the Iris data set as a DataFrame.
data = load_iris(as_frame=True)
iris_df = data.frame

# The first four columns are used as features, the fifth as the label.
features = iris_df.iloc[:, :4]
labels = iris_df.iloc[:, 4]

# Split the data into a training set and a validation set (3:1).
# The split is random but reproducible through random_state=41.
# Note: no stratification is requested, so the three classes will not
# necessarily be evenly represented in each part.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=41
)

# Fit the multinomial logistic regression on the training data.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train, y_train)

# Predict the held-out samples and summarise them in a confusion matrix.
pred = clf.predict(X_test)
conf_matrix = confusion_matrix(y_test, pred)

# NOTE(review): plot_conf_matrix is not defined in the visible part of
# this file; presumably it is provided elsewhere - confirm.
mlogit_acc = round(clf.score(X_test, y_test), 4)
plot_conf_matrix(conf_matrix, title=f'Mlogit acc:{mlogit_acc}')
# Values of k (number of neighbours) to compare.
nn = [1, 3, 5, 10, 15, 25, 100]

# One figure per k with two panels: uniform weights on the left,
# distance weights on the right.
figs = [plt.subplots(1, 2, figsize=(11, 4)) for _ in nn]

# Pair every k directly with its figure instead of indexing nn by position.
# NOTE(review): plot_conf_matrix is not defined in the visible part of
# this file; presumably it is provided elsewhere - confirm.
for k, (fig, ax) in zip(nn, figs):
    # KNN classifier with uniform weights (the estimator's default).
    KNN_uni = KNeighborsClassifier(n_neighbors=k)
    KNN_uni.fit(X_train, y_train)
    pred = KNN_uni.predict(X_test)
    conf_matrix = confusion_matrix(y_test, pred)
    # Plot its confusion matrix in the left panel.
    plot_conf_matrix(conf_matrix, fig=fig, axs=ax[0], title=f'Uniform, n: {k} acc:{round(accuracy_score(y_test, pred), 4)}')

    # KNN classifier where each neighbour is weighted by 1/distance.
    KNN_dis = KNeighborsClassifier(n_neighbors=k, weights='distance')
    KNN_dis.fit(X_train, y_train)
    pred = KNN_dis.predict(X_test)
    conf_matrix = confusion_matrix(y_test, pred)
    # Plot its confusion matrix in the right panel.
    plot_conf_matrix(conf_matrix, fig=fig, axs=ax[1], title=f'Distance, n: {k} acc: {round(accuracy_score(y_test, pred),4)}')