#Import libraries
import numpy as np
import matplotlib.pyplot as plt

#Unpack data
# ex1data1.txt: univariate training set; judging from the plot labels below,
# column 0 is city population and column 1 is food-truck profit.
# ex1data2.txt: multivariate training set with 3 columns; the last column is
# used as the target y in Task C. Both files are comma-separated.
# NOTE(review): exact column meanings of ex1data2.txt are not shown in this
# file — confirm against the assignment description.
data_1 = np.loadtxt('ex1data1.txt', delimiter=',')
data_2 = np.loadtxt('ex1data2.txt', delimiter=',')

#Task A
# Training data for Task A: first column is the city population,
# second column is the food-truck profit.
task_A_X_population, task_A_Y_profit = data_1.T
# Visualise the raw training data as a scatter plot.
plt.scatter(task_A_X_population, task_A_Y_profit)
plt.xlabel("Population in city (in 10,000s)")
plt.ylabel("Profit of a food truck (in $10,000s)")
plt.show()
# Observation: the bare scatter is hard to read for any information —
# there is no obvious trend line yet that would summarise the data.

# Set up the training environment for batch gradient descent.
learning_rate = 0.01              # alpha: step size of each parameter update
epoch = 1500                      # number of gradient-descent iterations
m = len(task_A_X_population)      # number of training examples
thethas = [0, 0]                  # [theta0, theta1], initialised to zero
cost_function_result = []         # J(theta) history, one value per epoch
# What is the learning rate? A step coefficient controlling how fast the
# thetas increase/decrease on each update.
# What happens for small or large alpha? A small learning rate makes the
# calculations and convergence slow; one that is too big can keep the cost
# J(theta) from ever converging.
# What is the epoch count? The number of iterations for which gradient
# descent runs to find the best thetas for the hypothesis.
# What happens for too many / too few epochs? Too many give inefficient
# extra computation that no longer improves the result; too few stop the
# search before a desirable result is reached.
# Batch gradient descent for univariate linear regression.
# Hypothesis: h(x) = theta0 + theta1 * x.
# Fix: loop-body indentation restored; Python's builtin sum() over NumPy
# arrays replaced with np.sum (same result, proper vectorized reduction).
for _ in range(epoch):
    # Predictions for every sample under the current parameters.
    Y_pred = thethas[0] + thethas[1] * task_A_X_population
    # Mean-squared-error cost J(theta) = (1 / 2m) * sum((y - h(x))^2).
    cost_function = (1 / (2 * m)) * np.sum(np.square(task_A_Y_profit - Y_pred))
    cost_function_result.append(cost_function)
    # Partial derivatives of J with respect to theta0 and theta1.
    gradient_0 = (-1 / m) * np.sum(task_A_Y_profit - Y_pred)
    gradient_1 = (-1 / m) * np.sum((task_A_Y_profit - Y_pred) * task_A_X_population)
    # Simultaneous update of both parameters.
    thethas[0] = thethas[0] - (learning_rate * gradient_0)
    thethas[1] = thethas[1] - (learning_rate * gradient_1)
print(thethas)

# Learning curve: J(theta) per epoch should decrease for a suitable alpha.
plt.plot(list(range(epoch)), cost_function_result)
plt.xlabel("epochs")
plt.ylabel("J(thetha0, thetha1)")
plt.show()

# Fitted regression line drawn over the training-data scatter.
# NOTE(review): drawing the segment from (min x, min h) to (max x, max h)
# is only correct for a non-negative slope — confirm it holds for this data.
plt.scatter(task_A_X_population, task_A_Y_profit)
plt.plot(
    (min(task_A_X_population), max(task_A_X_population)),
    (min(Y_pred), max(Y_pred)),
    color='red'
)
plt.xlabel("Population in city (in 10,000s)")
plt.ylabel("Prediction of profit of a food truck (in $10,000s)")
plt.show()

#Task B
# The same univariate regression, now in fully vectorized (matrix) form.
number_of_rows = len(task_A_X_population)
task_A_X_population = task_A_X_population.reshape((number_of_rows, 1))
task_A_Y_profit = task_A_Y_profit.reshape((number_of_rows, 1))
# A column of ones is prepended because theta0 (the intercept) must take
# part in the hypothesis as a plain term: it is always multiplied by 1,
# so h = X @ theta covers every parameter with one matrix product.
task_A_X_population = np.hstack(
    (np.ones((number_of_rows, 1)), task_A_X_population)
)
thethas = np.zeros((task_A_X_population.shape[1], 1))
print(task_A_X_population.shape)
print(task_A_Y_profit.shape)
print(thethas.shape)
for _ in range(epoch):
    # Gradient of J in matrix form: (1/m) * X^T (X theta - y).
    G = task_A_X_population.T @ (task_A_X_population @ thethas - task_A_Y_profit) * (1 / m)
    thethas = thethas - (learning_rate * G)
print(thethas)
# Dimension of X is (m, number of features + 1) because of the added ones;
# y is (m, 1), a single target column; theta is (number of features + 1, 1).
# The result closely matches the one from Task A.

#Task C
# Multivariate linear regression, loop form, on the second dataset.
y = data_2[:, 2]
#Pointer for Standart Scaler
# Standard-score normalization of every feature column: (x - mean) / std.
# Feature normalization is used because the features are not on the same
# scale — some are in the thousands while others are single digits — and
# scaling them to one common range helps gradient descent converge.
for i in range(data_2.shape[1] - 1):
    data_2[:, i] = (data_2[:, i] - data_2[:, i].mean()) / (data_2[:, i].std())
x1 = data_2[:, 0]
x2 = data_2[:, 1]
# Bug fix: the original reused m from Task A (the first dataset's sample
# count); this dataset has its own number of rows, so use it here.
m_2 = len(y)
thetas = [0, 0, 0]
cost_function_result = []
for _ in range(epoch):
    # Hypothesis with two features: h(x) = theta0 + theta1*x1 + theta2*x2.
    Y_pred = thetas[0] + thetas[1] * x1 + thetas[2] * x2
    cost_function = (1 / (2 * m_2)) * np.sum(np.square(y - Y_pred))
    cost_function_result.append(cost_function)
    # Partial derivatives of J for each of the three parameters.
    gradient_0 = (-1 / m_2) * np.sum(y - Y_pred)
    gradient_1 = (-1 / m_2) * np.sum((y - Y_pred) * x1)
    gradient_2 = (-1 / m_2) * np.sum((y - Y_pred) * x2)
    # Simultaneous update.
    thetas[0] = thetas[0] - (learning_rate * gradient_0)
    thetas[1] = thetas[1] - (learning_rate * gradient_1)
    thetas[2] = thetas[2] - (learning_rate * gradient_2)
print(thetas)
plt.plot(list(range(epoch)), cost_function_result)
plt.xlabel("epochs")
# Label fix: this model has three parameters, not two.
plt.ylabel("J(thetha0, thetha1, thetha2)")
plt.show()
# The cost clearly decreases during training, so by the latest epochs the
# cost function is approaching its minimum.

#Task D
# Vectorized gradient descent for the multivariate problem.
X = data_2[:, :2]
y = y.reshape((len(data_2), 1))
# Feature scaling was already applied above — see the Standart Scaler pointer.
# Prepend the intercept column of ones, as in Task B.
X = np.insert(X, 0, np.ones((1, len(data_2))), axis=1)
tetas = np.zeros((3, 1))
# Bug fix: use this dataset's own sample count instead of m from Task A.
m_2 = len(data_2)
for _ in range(epoch):
    # Gradient in matrix form: (1/m) * X^T (X theta - y).
    G = X.T.dot(X.dot(tetas) - y) * (1 / m_2)
    tetas = tetas - (learning_rate * G)
print(tetas)
# The parameters are close to the same as the loop-based Task C result.

# Closed-form solution via the normal equation: theta = (X^T X)^-1 X^T y.
# Fixes: removed the dead `tetas = np.zeros((3, 1))` assignment that was
# immediately overwritten, and replaced the explicit matrix inverse with
# np.linalg.solve, which is numerically more stable for solving the system.
tetas = np.linalg.solve(X.T.dot(X), X.T.dot(y))
print(tetas)
# There is a little difference versus gradient descent (which stopped after
# a finite number of epochs), but the tetas are close to the same.