## Import all the necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
## Load the dataset
# NOTE: the CSV location below is machine-specific; change the path as necessary.
data = pd.read_csv('/home/seetha/data/salary_data.csv')
# Columns are selected by position: first column -> X, second column -> Y.
X, Y = data.iloc[:, 0], data.iloc[:, 1]
# Show the shape of X and then of Y, one per line.
print(X.shape, Y.shape, sep='\n')
# Output (captured notebook error): FileNotFoundError: [Errno 2] No such file or directory: '/home/seetha/data/salary_data.csv'
# Scatter plot of the raw observations (Y against X)
plt.scatter(X, Y, color='blue')
# Title and axis labels are applied to the current axes in a single call.
plt.gca().set(
    title='Salary VS Experience (Observations)',
    xlabel='Year of Experience',
    ylabel='Salary',
)
plt.show()
# Build the model: fit Y_pred = theta1 * X + theta0 with batch gradient descent
theta1 = 0  # slope, initialised to zero
theta0 = 0  # intercept, initialised to zero
alpha = 0.0001  # Learning Rate
epochs = 10000  # Number of iterations to perform gradient descent
m = float(len(X))  # Number of elements in X
cost_history = []  # cost at the start of each iteration (before that iteration's update)

# Performing Gradient Descent
for _ in range(epochs):
    Y_pred = theta1 * X + theta0
    residual = Y - Y_pred  # computed once, reused by both gradients and the cost

    # Partial derivatives of J = 1/(2m) * sum(residual**2) w.r.t. theta1 and theta0
    grad_theta1 = (-1 / m) * (X * residual).sum()
    grad_theta0 = (-1 / m) * residual.sum()

    # The denominator must be parenthesised: `1 / 2 * m` evaluates to m / 2,
    # which scales the cost by m**2 instead of dividing by 2m.
    cost = (1 / (2 * m)) * (residual ** 2).sum()
    cost_history.append(cost)

    # Simultaneous update: both gradients were taken at the same (theta1, theta0)
    theta1 = theta1 - alpha * grad_theta1
    theta0 = theta0 - alpha * grad_theta0
# Report the fitted coefficients: theta1 first, then theta0
for label, value in (('Theta1 = ', theta1), ('Theta0 = ', theta0)):
    print(label, value)
# Output (captured from the notebook run):
#   Theta1 = 12316.1630541413
#   Theta0 = 6477.634197171642
# Predict the values for the given X
Y_pred = theta1 * X + theta0
# Visualize the dataset and plot the residuals
fig, ax = plt.subplots()
ax.scatter(X, Y, color='blue')  # observed values
ax.scatter(X, Y_pred, color='green')  # predicted values
ax.vlines(X, Y, Y_pred, color='red')  # residual lines
# Regression line: evaluate the model at both ends of the X range.  Pairing
# min(X)/max(X) with min(Y_pred)/max(Y_pred) is only right for a non-negative
# slope; computing the endpoints from the model is right for any slope.
x_ends = [min(X), max(X)]
ax.plot(x_ends, [theta1 * x + theta0 for x in x_ends], color='black')
ax.set_title('Salary VS Experience')
ax.set_xlabel('Year of Experience')
ax.set_ylabel('Salary')
plt.show()
# Line plot of the cost recorded at each gradient-descent iteration
plt.plot(cost_history)
# Title and axis labels are applied to the current axes in a single call.
plt.gca().set(
    title='Cost Function using Gradient Descent',
    xlabel="Number of iterations",
    ylabel="Cost",
)
plt.show()
from sklearn.metrics import mean_squared_error, r2_score
# Score the predictions against the observed Y values
mse = mean_squared_error(Y, Y_pred)
r2 = r2_score(Y, Y_pred)  # coefficient of determination (R^2); 1 is a perfect fit
# Print both metrics rounded to two decimals
print("Mean squared error = %.2f" % mse)
print('Variance score = %.2f' % r2)
# Output (captured from the notebook run):
#   Mean squared error = 111918031.56
#   Variance score = 0.85