import copy
#Function that returns the available actions (action space) for a state on the 3x3 grid
def getActionSpace(state):
    a = ["N", "E", "S", "W"]
    row = state[0]
    col = state[1]
    if row == 0:        #If at the bottom
        a.remove("S")   #Can't go down
    if row == 2:        #If at the top
        a.remove("N")   #Can't go up
    if col == 0:        #If at the left edge
        a.remove("W")   #Can't go left
    if col == 2:        #If at the right edge
        a.remove("E")   #Can't go right
    return a
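#A quick sanity check of getActionSpace one could run here (illustrative only, not part of
#the original cell): a corner state has two legal moves, the centre state has all four.
print(getActionSpace([0, 0]))  #Expected: ['N', 'E']
print(getActionSpace([1, 1]))  #Expected: ['N', 'E', 'S', 'W']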
epsilon = 1
disc_f = 0.9
#Recursive function that stops once the change in value for every state is less than epsilon
def iterate(r, V, itNr):
    """
    Parameters
    ----------
    r : array
        Reward function.
    V : array
        Value function.
    itNr : int
        Iteration counter.

    Returns
    -------
    Pi : array, V : array
        The optimal policy and the converged optimal value function.
    """
    V_prev = copy.deepcopy(V)
    #List storing the policy for each state at its position
    Pi = [["", "", ""], ["", "", ""], ["", "", ""]]
    #Loop through every state
    for row in range(3):
        for col in range(3):
            s = [row, col]
            #Get the available actions for state s
            action_space = getActionSpace(s)
            max_value = 0
            curr_pi = ""
            #Each action moves one coordinate up/down/left/right
            for a in action_space:
                if a == "N":
                    s_new = [s[0]+1, s[1]]
                elif a == "E":
                    s_new = [s[0], s[1]+1]
                elif a == "S":
                    s_new = [s[0]-1, s[1]]
                elif a == "W":
                    s_new = [s[0], s[1]-1]
                #Reward and previous value for staying in s
                r_old = r[s[0]][s[1]]
                V_s_old = V_prev[s[0]][s[1]]
                #Reward and previous value for the intended successor s_new
                r_curr = r[s_new[0]][s_new[1]]
                V_s_new = V_prev[s_new[0]][s_new[1]]
                #Expected value of taking action a in state s:
                #the move succeeds with probability 0.8, otherwise the agent stays put
                curr_value = 0.8*(r_curr + disc_f * V_s_new) + 0.2*(r_old + disc_f * V_s_old)
                #Keep track of the highest expected value at this state
                if curr_value >= max_value:
                    curr_pi = a
                    max_value = curr_value
            #Update the value matrix with the highest expected value at this state
            V[row][col] = round(max_value, 2)
            #Update the policy matrix with the optimal action at this state
            Pi[row][col] = curr_pi
    #Print V and the policy for each state
    print("Iteration nr", itNr, ":")
    for v in V:
        print(v)
    for pi in Pi:
        print(pi)
    print("\n")
    for row in range(3):
        for col in range(3):
            #If any state changed by epsilon or more, run another iteration
            if abs(V_prev[row][col] - V[row][col]) >= epsilon:
                itNr += 1
                return iterate(r, V, itNr)
    return Pi, V
#Reward matrix
r = [[0,0,0],
[0,10,0],
[0,0,0]]
#Value matrix
V = [[0,0,0],
[0,0,0],
[0,0,0]]
itNr = 1
Pi, V = iterate(r, V, itNr) #Run value iteration until the values converge
print("Method returns: ", Pi, V)
Iteration nr 1 :
[0.0, 8.0, 0.0]
[8.0, 2.0, 8.0]
[0.0, 8.0, 0.0]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 2 :
[5.76, 10.88, 5.76]
[10.88, 8.12, 10.88]
[5.76, 10.88, 5.76]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 3 :
[8.87, 15.8, 8.87]
[15.8, 11.3, 15.8]
[8.87, 15.8, 8.87]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 4 :
[12.97, 18.98, 12.97]
[18.98, 15.41, 18.98]
[12.97, 18.98, 12.97]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 5 :
[16.0, 22.51, 16.0]
[22.51, 18.44, 22.51]
[16.0, 22.51, 16.0]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 6 :
[19.09, 25.33, 19.09]
[25.33, 21.53, 25.33]
[19.09, 25.33, 19.09]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 7 :
[21.67, 28.06, 21.67]
[28.06, 24.11, 28.06]
[21.67, 28.06, 21.67]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 8 :
[24.1, 30.41, 24.1]
[30.41, 26.54, 30.41]
[24.1, 30.41, 24.1]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 9 :
[26.23, 32.58, 26.23]
[32.58, 28.67, 32.58]
[26.23, 32.58, 26.23]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 10 :
[28.18, 34.51, 28.18]
[34.51, 30.62, 34.51]
[28.18, 34.51, 28.18]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 11 :
[29.92, 36.26, 29.92]
[36.26, 32.36, 36.26]
[29.92, 36.26, 29.92]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 12 :
[31.49, 37.83, 31.49]
[37.83, 33.93, 37.83]
[31.49, 37.83, 31.49]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 13 :
[32.91, 39.24, 32.91]
[39.24, 35.34, 39.24]
[32.91, 39.24, 32.91]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 14 :
[34.18, 40.51, 34.18]
[40.51, 36.61, 40.51]
[34.18, 40.51, 34.18]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 15 :
[35.32, 41.65, 35.32]
[41.65, 37.76, 41.65]
[35.32, 41.65, 35.32]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 16 :
[36.35, 42.68, 36.35]
[42.68, 38.78, 42.68]
[36.35, 42.68, 36.35]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Iteration nr 17 :
[37.27, 43.6, 37.27]
[43.6, 39.71, 43.6]
[37.27, 43.6, 37.27]
['E', 'N', 'W']
['E', 'W', 'W']
['S', 'S', 'W']
Method returns: [['E', 'N', 'W'], ['E', 'W', 'W'], ['S', 'S', 'W']] [[37.27, 43.6, 37.27], [43.6, 39.71, 43.6], [37.27, 43.6, 37.27]]
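#Worked check of iteration 1 above (illustrative only, not part of the original cells):
#a state next to the centre aims at the centre cell (reward 10, V_prev = 0), while the centre
#itself can at best rely on the 20% chance of staying put on the reward cell.
assert round(0.8*(10 + 0.9*0) + 0.2*(0 + 0.9*0), 2) == 8.0   #states adjacent to the centre
assert round(0.8*(0 + 0.9*0) + 0.2*(10 + 0.9*0), 2) == 2.0   #the centre state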
import gym
import gym_toytext
import numpy as np
import random
import math
#Creating the environment
env = gym.make("NChain-v0")
#Setting the parameters
num_episodes = 1000 #10000
gamma = 0.95 #given in question
learning_rate = 0.1 #given in question
epsilon = 0.3
# initialize the Q table
Q = np.zeros([5, 2])
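#Sanity check of the environment's dimensions (illustrative; its output is not shown below).
#NChain has 5 states and 2 actions, which is why Q is initialised as a 5 x 2 table.
print(env.observation_space, env.action_space)  #Expected: Discrete(5) Discrete(2)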
#Q-learning algorithm for building the Q-matrix
for episode in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        # First we select an action:
        if random.uniform(0, 1) < epsilon:        # Flip a skewed coin
            action = env.action_space.sample()    # Explore the action space
        else:
            action = np.argmax(Q[state, :])       # Exploit learned values
        # Then we perform the action and receive the feedback from the environment
        new_state, reward, done, info = env.step(action)
        # Finally we learn from the experience by updating the Q-value of the selected action
        prediction_error = reward + gamma*np.max(Q[new_state, :]) - Q[state, action]
        Q[state, action] += learning_rate*prediction_error
        state = new_state
    #Print the first and last episodes to see the values plateau (convergence)
    if episode < 10 or episode > num_episodes - 10:
        print("Episode:", episode)
        print(Q)
        print()
Episode: 0
[[19.92459035 23.65257468]
[ 9.82543522 22.1185096 ]
[13.22734691 9.41498023]
[ 4.06920233 18.48398242]
[33.27639427 9.51956127]]
Episode: 1
[[27.22826953 29.74240955]
[24.98893741 27.84863945]
[27.07800875 16.9050052 ]
[17.8567401 29.26034517]
[40.81157802 14.00004009]]
Episode: 2
[[29.26685158 31.02117077]
[28.95849906 30.75816923]
[31.49603304 24.33372163]
[28.08811302 33.98962491]
[53.15012165 18.45638464]]
Episode: 3
[[30.85643843 32.02368447]
[31.79677294 32.49451895]
[33.51639667 30.0193245 ]
[33.48752642 35.6566544 ]
[52.54369801 20.94639542]]
Episode: 4
[[33.74105999 35.06589162]
[36.88561774 34.02128448]
[40.51156764 32.69681629]
[44.14774459 35.44865598]
[57.28970764 31.34450418]]
Episode: 5
[[42.9244702 42.97878048]
[44.66478954 41.37816085]
[47.69049181 40.71440895]
[54.4593157 37.36002429]
[68.56371762 49.2407836 ]]
Episode: 6
[[55.09103539 49.86704303]
[60.51855875 48.76058704]
[67.20416744 48.3011053 ]
[69.07563122 51.36443419]
[73.6651332 62.56430072]]
Episode: 7
[[57.90890252 57.3010934 ]
[60.24221498 57.18920047]
[63.93542125 59.25729382]
[67.73904637 58.08331319]
[77.47061978 67.43855583]]
Episode: 8
[[54.36776125 54.14284319]
[57.71001995 55.28921642]
[61.98360506 57.30603128]
[68.10074021 55.80506127]
[71.25121004 62.59670785]]
Episode: 9
[[58.0189586 56.71709151]
[60.86590496 58.48395717]
[65.04079223 59.16972492]
[69.47711248 59.96433838]
[80.27262205 58.04731972]]
Episode: 991
[[63.59100561 60.79463775]
[66.95638134 61.6397458 ]
[70.67852563 63.12679068]
[80.55536179 63.91752397]
[85.34299679 67.32361426]]
Episode: 992
[[58.08164907 57.73547222]
[61.32086777 59.03651091]
[65.93564564 59.39522292]
[70.88308361 63.25787774]
[75.88679417 62.93480298]]
Episode: 993
[[62.51498512 61.81415464]
[65.58943252 62.68590447]
[70.62515966 63.87957956]
[75.94798599 66.76193397]
[88.53867255 69.16401698]]
Episode: 994
[[53.29254322 53.2440336 ]
[55.93674093 55.60252199]
[60.06602285 57.17048995]
[65.89212958 59.66874795]
[70.66250764 58.94168497]]
Episode: 995
[[57.79646376 57.02464692]
[61.41064576 57.39035618]
[66.9265079 59.01273308]
[78.75474175 58.49867991]
[87.76712544 66.94584719]]
Episode: 996
[[67.01870943 64.39311817]
[71.13766494 64.19932956]
[74.13647157 63.42195877]
[83.29367625 62.73239138]
[94.96086151 65.5615279 ]]
Episode: 997
[[70.99772537 69.029626 ]
[75.58916925 70.53369936]
[78.55495311 70.03322482]
[83.84347271 68.84588982]
[85.97879898 75.56066488]]
Episode: 998
[[64.5432941 64.31748235]
[67.16699953 65.03519061]
[71.05742708 67.20596457]
[75.66746695 68.09161718]
[81.81316297 68.47782262]]
Episode: 999
[[57.2151085 56.9175277 ]
[60.54696689 57.74516172]
[64.77472533 59.27772027]
[70.41804281 60.52605094]
[85.51077501 62.41358329]]
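#A minimal sketch (not part of the original cells) of reading the learned behaviour out of Q:
#the greedy action in each state is the argmax over the two columns. For the episode-999
#Q-table printed above this picks action 0 ("forward") in every state.
greedy_policy = np.argmax(Q, axis=1)
print("Greedy policy per state:", greedy_policy)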
import copy
#Defining the MDP
S = [0, 1, 2, 3, 4]    #State space
A = [0, 1]             #Same for all states: 0 = forward, 1 = back to start
P = 0.8                #Transition probability: 80% chance of reaching state s' given action a
R = [2, 0, 0, 0, 10]   #Reward function
V = [0, 0, 0, 0, 0]    #Initial value function, V_0
#Value iteration to calculate the value function (gamma = 0.95, reused from the cell above)
#Varying the number of iterations showed that the values converge long before 1000 iterations
for _ in range(1000):
    V_prev = copy.deepcopy(V)
    s_backward = S[0]            #Slipping takes the agent back to the start state
    for s in S:
        max_value = 0
        if s <= 3:
            s_forward = S[s+1]   #Moving forward along the chain
        else:
            s_forward = S[4]     #Standing still at the end of the chain
        for a in A:
            if a == 0:
                p = 0.8          #Probability of moving forward when choosing "forward"
            else:
                p = 0.2          #Probability of moving forward when choosing "back to start"
            #Expected value for the given state and action
            curr_value = p*(R[s_forward] + gamma*V_prev[s_forward]) + (1-p)*(R[s_backward] + gamma*V_prev[s_backward])
            #Keep the highest expected value at this state
            if curr_value >= max_value:
                max_value = curr_value
        #Update the value function with the highest expected value at this state
        V[s] = round(max_value, 2)
print("Q:", Q)
print()
print("V:", V)
Q: [[57.2151085 56.9175277 ]
[60.54696689 57.74516172]
[64.77472533 59.27772027]
[70.41804281 60.52605094]
[85.51077501 62.41358329]]
V: [78.15, 82.77, 88.85, 96.85, 96.85]
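#Two quick checks one could append here (illustrative only; V_from_Q and bellman_s4 are
#names introduced for this sketch, not part of the original code):
#1) reduce the learned Q-table to a value estimate and set it next to the value-iteration result,
#2) verify that the printed V is (approximately) a fixed point of the Bellman update used above.
V_from_Q = np.max(Q, axis=1)
print("max_a Q(s, a):      ", np.round(V_from_Q, 2))
print("V (value iteration):", V)
#Fixed-point check for the last state: "forward" keeps it in state 4 with p = 0.8,
#otherwise it slips back to the start state (reward 2).
bellman_s4 = 0.8*(R[4] + gamma*V[4]) + 0.2*(R[0] + gamma*V[0])
assert abs(bellman_s4 - V[4]) < 0.05
#The Q-learning estimates are noisier and lower than the value-iteration fixed point,
#but both agree that the values grow toward the large-reward end of the chain.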