#Task 2
import numpy as np
#return list of possible next positions (neighbouring cells) from (X_pos, Y_pos)
def getActionsMatrix(State, X_pos, Y_pos):
    #array of possible actions, 0 = WEST, 1 = NORTH, 2 = EAST, 3 = SOUTH
    nextPos = []
    if X_pos - 1 >= 0:
        nextPos.append([X_pos - 1, Y_pos])
    if X_pos + 1 <= State.shape[0] - 1:
        nextPos.append([X_pos + 1, Y_pos])
    if Y_pos - 1 >= 0:
        nextPos.append([X_pos, Y_pos - 1])
    if Y_pos + 1 <= State.shape[1] - 1:
        nextPos.append([X_pos, Y_pos + 1])
    return nextPos
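# Quick illustration (assumed 3x3 grid, not part of the original assignment):
# in a corner only two neighbours exist, e.g.
#   getActionsMatrix(np.zeros((3, 3)), 0, 0)  ->  [[1, 0], [0, 1]]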
#takes the environment and a position, returns the best reward at the current state for this iteration
def getBestReward(state, rState, X_pos, Y_pos, gamma, moveProb, firstMove, getActions):
    stayProb = 1 - moveProb
    posReward = state[X_pos, Y_pos]
    bestReward = -np.inf  # start below any achievable value so negative rewards are handled correctly
    nextPos = getActions(state, X_pos, Y_pos)
    for p in range(len(nextPos)):
        nextPosition = (nextPos[p][0], nextPos[p][1])
        if firstMove:
            reward = moveProb * state[nextPosition] + stayProb * posReward
        else:
            reward = moveProb * (rState[nextPosition] + gamma * state[nextPosition]) + stayProb * (rState[X_pos, Y_pos] + gamma * posReward)
        if reward > bestReward:
            bestReward = reward
    return bestReward
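# Sketch of the update computed above (my reading of the code, using its
# "stay put with probability 1 - moveProb" model), for the non-first iterations:
#   V_new(s) = max_{s'} [ moveProb * (R(s') + gamma * V(s')) + (1 - moveProb) * (R(s) + gamma * V(s)) ]
# where R is the base reward table (revState) and V is the current value estimate (state).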
#takes current V(s), base reward (revState), gamma and boolean for firstMove
#returns the nextState
def nextState(state, revState, moveProb, gamma, firstMove, getActions):
    nState = np.zeros(state.shape)
    for X_pos in range(state.shape[0]):
        for Y_pos in range(state.shape[1]):
            nState[X_pos][Y_pos] = getBestReward(state, revState, X_pos, Y_pos, gamma, moveProb, firstMove, getActions)
    return nState
def loop(revState, moveProb, gamma, epsilon, getActions):
    cont = True
    state = revState.copy()
    print('First state:')
    print(revState)
    #the first state needs different input, therefore it is handled outside the loop
    state = nextState(state, revState, moveProb, gamma, True, getActions)
    oldState = state
    #iteration loop: stop once no cell changes by more than epsilon
    while cont:
        state = nextState(state, revState, moveProb, gamma, False, getActions)
        if np.max(np.abs(state - oldState)) < epsilon:
            cont = False
        oldState = state
        #print(state)
    print('Converged state:')
    print(state)
#init statements
revState = np.array([[0,0,0],[0,10,0],[0,0,0]])
epsilon = 0.1
moveProb = 0.8
gamma = 0.9
loop( revState, moveProb, gamma, epsilon, getActionsMatrix )
#Note. You need to install gym! Sometimes difficult on Windows. Google for advice.
!pip install gym==0.7.4
!pip install gym-legacy-toytext
import gym
import numpy as np
import random
import math
import gym_toytext
#Training the Q-table
#Code is partly taken from q_learning_frozen_lake.ipynb
#Inspiration with average calculation is taken from
#https://towardsdatascience.com/q-learning-algorithm-from-explanation-to-implementation-cdbeda2ea187
def q_learn(env, num_episodes, max_iter_episode, gamma, learning_rate, epsilon):
    # initialize the Q table [state, action], in this case [5, 2]
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))
    #Alternative way of writing: Q = np.zeros([5, 2])
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        steps = 0
        while not done and steps < max_iter_episode:
            steps += 1
            # First we select an action (epsilon-greedy):
            if random.uniform(0, 1) < epsilon:  # Flip a skewed coin
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(Q[state, :])  # Exploit learned values
            # Then we perform the action and receive the feedback from the environment
            new_state, reward, done, info = env.step(action)
            # Finally we learn from the experience by updating the Q-value of the selected action
            update = reward + gamma * np.max(Q[new_state, :]) - Q[state, action]
            Q[state, action] += learning_rate * update
            state = new_state
    return Q
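# Optional helper (my addition, not part of the original code; the name run_greedy
# is made up): roll out one episode following the greedy policy of a learned
# Q-table and return the total reward. Assumes the old gym API used above
# (env.reset returns only the observation, env.step returns a 4-tuple).
def run_greedy(env, Q, max_steps=1000):
    state = env.reset()
    total_reward = 0
    for _ in range(max_steps):
        action = np.argmax(Q[state, :])  # always exploit, no exploration
        state, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward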
env = gym.make("NChain-v0")
num_episodes = 10000
max_iter_episode = 1000 #cap on steps per episode (assumed value, not given in the original)
gamma = 0.95
learning_rate = 0.1 #(in the description alpha = 0.1, so learning rate = 0.1)
epsilon = 0.1
#Used when num_episodes is very big
Q = q_learn(env, num_episodes, max_iter_episode, gamma, learning_rate, epsilon)
print(Q)
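# Added note (not in the original): the greedy policy can be read off the table
# directly; in NChain action 0 is usually "move forward" and action 1 "jump back".
print("Greedy policy per state:", np.argmax(Q, axis=1))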
#Used for smaller num_episodes
'''
Q_list = []
for i in range(5):
    Q_list.append(q_learn(env, num_episodes, max_iter_episode, gamma, learning_rate, epsilon))
avg_Q = np.zeros([5, 2])
for q in Q_list:
    avg_Q += q
avg_Q /= len(Q_list)
print(avg_Q)
'''
#Question 4a
def getActionsChain(State, X_pos, Y_pos):
    #list of possible next positions in the chain: jump back to the start
    #or move one column forward (the chain is a single row, so the row index is always 0)
    nextPos = []
    nextPos.append([0, 0])
    if Y_pos + 1 <= State.shape[1] - 1:
        nextPos.append([0, Y_pos + 1])
    return nextPos
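# Illustration (assumed 1x5 chain, not part of the original): from column 2
#   getActionsChain(np.zeros((1, 5)), 0, 2)  ->  [[0, 0], [0, 3]]
# i.e. jump back to the start or advance one step; from the last column only [[0, 0]] remains.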
revState = np.array([[2, 0, 0, 0, 10]])
epsilon = 0.1
gamma = 0.95
moveProb = 0.9 #will make the best move 90% of the time
loop(revState, moveProb, gamma, epsilon, getActionsChain)