import numpy as np
# Reward grid: the centre cell pays 10, every other cell pays 0.
reward_matrix = np.array(
    [
        [0, 0, 0],
        [0, 10, 0],
        [0, 0, 0],
    ],
    dtype=float,
)
# Value estimates start at zero, with the same shape and dtype as the rewards.
initial_matrix = np.zeros_like(reward_matrix)
# Grid dimensions, read from the reward grid itself.
nr_of_rows, nr_of_columns = reward_matrix.shape
# Convergence threshold: iteration stops once the total change in one sweep
# is no larger than this.
eps = 0.0001
# Discount factor (gamma) applied to the value of the next state.
discount = 0.9
# Probability that the chosen move succeeds; otherwise the agent stays put.
action = 0.8
no_action = 1 - action
# Initial "change" value; anything above eps so the first sweep runs.
value = 10
# Function that calculates the reward for each direction and returns the highest reward for the given coordinate
def calculate_reward(i, j, reward_matrix, initial_matrix, discount, action_prob=None):
    """Return the best one-step expected value reachable from cell ``(i, j)``.

    For every neighbour that lies inside the grid, the expected value of
    trying to move there is::

        p * (reward[neighbour] + values[neighbour] * discount)
        + q * (reward[i, j] + values[i, j] * discount)

    where ``p`` is the probability that the move succeeds and ``q`` the
    probability that the agent stays where it is. The maximum over all
    in-bounds neighbours is returned.

    Args:
        i: Row index (first axis) of the current cell.
        j: Column index (second axis) of the current cell.
        reward_matrix: 2-D NumPy array of immediate rewards.
        initial_matrix: 2-D NumPy array of current value estimates, indexed
            the same way as ``reward_matrix``.
        discount: Discount factor applied to the value estimates.
        action_prob: Optional probability that a move succeeds. When given,
            the stay probability is ``1 - action_prob``. When ``None``
            (default) the module-level ``action`` and ``no_action`` are used.

    Returns:
        The highest expected value among the in-bounds moves. If the cell has
        no in-bounds neighbour (a 1x1 grid), the value of staying in place.
    """
    if action_prob is None:
        # Default: use the probabilities configured at module level.
        move_prob, stay_prob = action, no_action
    else:
        move_prob, stay_prob = action_prob, 1 - action_prob

    n_rows, n_cols = reward_matrix.shape

    # Outcome when the move fails and the agent remains in (i, j); it is the
    # same for every direction, so compute it once.
    stay_value = reward_matrix[i, j] + initial_matrix[i, j] * discount

    # (row offset, column offset) with [row, column] indexing.
    offsets = (
        (0, 1),   # east
        (1, 0),   # south
        (-1, 0),  # north
        (0, -1),  # west
    )

    reward_for_each_direction = []
    for d_row, d_col in offsets:
        row, col = i + d_row, j + d_col
        # Explicit bounds check: NumPy does not raise IndexError for -1, it
        # wraps around to the last row/column, so relying on the exception
        # would treat the opposite edge of the grid as a neighbour.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            continue
        move_value = reward_matrix[row, col] + initial_matrix[row, col] * discount
        reward_for_each_direction.append(move_prob * move_value + stay_prob * stay_value)

    if not reward_for_each_direction:
        # No neighbour exists, so the agent can only stay where it is.
        return stay_value

    return np.max(reward_for_each_direction)
# Value iteration: keep sweeping the grid until the total change between two
# consecutive sweeps is no larger than eps.
while value > eps:
    # Work on a copy so every cell of this sweep is computed from the values
    # of the previous sweep only.
    new_matrix = initial_matrix.copy()
    # i walks the rows (first axis) and j the columns (second axis), matching
    # the [i, j] indexing used below and inside calculate_reward. With the
    # ranges the other way round the loop only works on square grids.
    for i in range(nr_of_rows):
        for j in range(nr_of_columns):
            # Best expected value reachable from the current cell
            reward = calculate_reward(i, j, reward_matrix, initial_matrix, discount)
            new_matrix[i, j] = reward
    # Sum of absolute differences between this sweep and the previous one
    value = np.abs(new_matrix - initial_matrix).sum()
    initial_matrix = new_matrix
# Print the converged values, one grid row per line
for row in initial_matrix:
    print(row)
# From these values we can read off the optimal policy for each cell (directions as seen in the printed grid):
# |E/S|S|S/W|
# |E|N/E/S/W|W|
# |N/E|N|N/W|