#Task 2
import numpy as np
#returns a list of the possible next positions (neighbouring cells) from the current position
def getActionsMatrix(State, X_pos, Y_pos):
    #possible moves are WEST, NORTH, EAST and SOUTH, but only those that stay inside the grid
    nextPos = []
    if(X_pos-1 >= 0):
        nextPos.append([X_pos-1,Y_pos])
    
    if(X_pos+1 <= State.shape[0]-1):
        nextPos.append([X_pos+1,Y_pos])
    if(Y_pos-1 >= 0):
        nextPos.append([X_pos,Y_pos-1])
    if(Y_pos+1 <= State.shape[1]-1):
        nextPos.append([X_pos,Y_pos+1])
         
    return nextPos
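#quick sanity check (illustrative only): a corner cell of the 3x3 grid has exactly two neighbours
#print(getActionsMatrix(np.zeros((3,3)), 0, 0)) #expected output: [[1, 0], [0, 1]]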
#takes the environment and a position and returns the best achievable reward at that state for this iteration
def getBestReward(state, rState, X_pos, Y_pos, gamma, moveProb, firstMove, getActions):
    stayProb = 1-moveProb
    posReward = state[X_pos,Y_pos]
    bestReward = 0
    
    nextPos = getActions(state, X_pos, Y_pos)
    for p in range (0,len(nextPos)):
        nextPosition = (nextPos[p][0],nextPos[p][1])
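        #Bellman-style backup: with probability moveProb the move succeeds and the neighbour's
        #value is collected; with probability stayProb the agent stays where it is.
        #After the first sweep the immediate reward (rState) is added to the discounted old value.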
        if (firstMove):
            reward = moveProb*(state[nextPosition]) + stayProb*(posReward)
        else:
            reward = moveProb*(rState[nextPosition] + gamma*state[nextPosition]) + stayProb*(rState[X_pos,Y_pos] + gamma*posReward)
        if (reward > bestReward):
            bestReward = reward
    return bestReward
#takes current V(s), base reward (revState), gamma and boolean for firstMove
#returns the nextState
def nextState(state, revState, moveProb, gamma, firstMove, getActions):
    nState = np.zeros(state.shape)
    for X_pos in range (0, state.shape[0]):
        for Y_pos in range (0, state.shape[1]):
            nState[X_pos][Y_pos] = getBestReward(state,revState, X_pos, Y_pos, gamma, moveProb, firstMove, getActions)
    return nState
def loop(revState, moveProb, gamma, epsilon, getActions):
    cont = True
    oldState = revState.copy() #use a real copy instead of a slice view of the reward array
    state = revState.copy()
    print('First state:')
    print(revState)
    #the first sweep needs different input, therefore it is done outside the loop
    state = nextState(state, revState, moveProb, gamma, True, getActions)
    #iteration loop
    while cont:
        state = nextState(state, revState, moveProb, gamma, False, getActions)
        #only stop once every cell has changed by less than epsilon
        cont = False
        for x in range(0, state.shape[0]):
            for y in range(0, state.shape[1]):
                if abs(state[x][y] - oldState[x][y]) >= epsilon:
                    cont = True
        oldState = state
        #print(state)
    print('Converged state:')
    print(state)
#init statements for the 3x3 grid world
revState = np.array([[0,0,0],[0,10,0],[0,0,0]])
epsilon = 0.1
moveProb = 0.8
gamma = 0.9
loop(revState, moveProb, gamma, epsilon, getActionsMatrix)
#Note: you need to install gym! This is sometimes difficult on Windows; search online for advice.
!pip install gym==0.7.4
!pip install gym-legacy-toytext
import gym
import numpy as np
import random
import math
import gym_toytext 
#Training the Q-table
#Code is partly taken from q_learning_frozen_lake.ipynb
#Inspiration with average calculation is taken from
#https://towardsdatascience.com/q-learning-algorithm-from-explanation-to-implementation-cdbeda2ea187
def q_learn(env, num_episodes, max_iter_episode, gamma, learning_rate, epsilon):
    # initialize the Q table [state, action], this case [5,2]
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states,n_actions))
    #Alternative way of writing: Q = np.zeros([5, 2])
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        steps = 0
        # Run until the environment signals done or the per-episode step cap is reached
        while not done and steps < max_iter_episode:
            # First we select an action:
            if random.uniform(0, 1) < epsilon: # Flip a skewed coin
                action = env.action_space.sample() # Explore action space
            else:
                action = np.argmax(Q[state,:]) # Exploit learned values
            # Then we perform the action and receive the feedback from the environment
            new_state, reward, done, info = env.step(action)
            # Finally we learn from the experience by updating the Q-value of the selected action
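            # Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))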
            update = reward + (gamma*np.max(Q[new_state,:])) - Q[state, action]
            Q[state,action] += learning_rate*update 
            state = new_state
            steps += 1
    
    return Q
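#small illustrative helper (hypothetical, not part of the assignment code): the greedy policy
#implied by a learned Q-table is simply the argmax over the action dimension for each state
def greedy_policy(Q):
    return np.argmax(Q, axis=1) #one best action index per state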
env = gym.make("NChain-v0")
num_episodes = 10000
gamma = 0.95
learning_rate = 0.1 #(in the description alpha = 0.1, so learning rate = 0.1)
epsilon = 0.1
max_iter_episode = 1000 #maximum number of steps per episode (assumed value)
#Used when num_episodes is very big
Q = q_learn(env, num_episodes, max_iter_episode, gamma, learning_rate, epsilon)
print(Q)
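#optional: inspect the greedy policy implied by the learned Q-values (uses the helper sketched above)
#print(greedy_policy(Q))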
#Used for smaller num_episodes
'''
Q_list = []
for i in range(5):
    Q_list.append(q_learn(env, num_episodes,max_iter_episode, gamma, learning_rate, epsilon))
avg_Q = np.zeros([5, 2])
for q in Q_list:
    avg_Q+=q
avg_Q /= len(Q_list)
print(avg_Q)
'''
#Question 4a
#possible moves in the chain: go back to the start of the chain or move one step forward
def getActionsChain(State, X_pos, Y_pos):
    #list of possible next positions; the chain is stored as a 1 x 5 array, so the row index is always 0
    #(parameters follow the same (row, column) order that getBestReward uses when calling getActions)
    nextPos = []
    #'backward' action: return to the start of the chain
    nextPos.append([0, 0])
    #'forward' action: advance one position, unless already at the end of the chain
    if(Y_pos+1 <= State.shape[1]-1):
        nextPos.append([0, Y_pos+1])
    return nextPos
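#sanity check (illustrative only): from the last chain position the only remaining move is back to the start
#print(getActionsChain(np.zeros((1,5)), 0, 4)) #expected output: [[0, 0]]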
revState = np.array([[2,0,0,0,10]])
epsilon = 0.1
gamma = 0.95
moveProb = 0.9 # will make the best move 90% of the time
loop(revState, moveProb, gamma, epsilon, getActionsChain)