#Task 2
import numpy as np
#return array of possible moves and their rewards
def getActionsMatrix (State, X_pos, Y_pos):
    """Return the list of grid positions reachable in one move from (X_pos, Y_pos).

    Candidate moves are west/east (row -1/+1) then north/south (col -1/+1);
    positions falling outside the bounds of State are dropped.
    """
    rows = State.shape[0]
    cols = State.shape[1]
    reachable = []
    # Same candidate order as the original: x-1, x+1, y-1, y+1.
    for dx, dy in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        nx = X_pos + dx
        ny = Y_pos + dy
        if 0 <= nx < rows and 0 <= ny < cols:
            reachable.append([nx, ny])
    return reachable
#takes environment and position and returns bestReward at current state for this iteration
def getBestReward (state, rState, X_pos,Y_pos, gamma, moveProb, firstMove, getActions):
    """Return the best expected reward reachable from (X_pos, Y_pos).

    state      -- current value estimate V(s)
    rState     -- immediate (base) rewards; unused when firstMove is True
    gamma      -- discount factor
    moveProb   -- probability the chosen move succeeds (stays put otherwise)
    firstMove  -- True on the very first sweep, when there is no previous V(s)
    getActions -- callable (state, x, y) -> list of reachable [x, y] positions

    Bug fix: the running best was initialised to 0, so a grid containing only
    negative values wrongly returned 0.  We now take the true maximum over the
    candidate rewards (still returning 0 when there are no reachable moves).
    """
    stayProb = 1 - moveProb
    posReward = state[X_pos, Y_pos]
    rewards = []
    for nx, ny in getActions(state, X_pos, Y_pos):
        nextPosition = (nx, ny)
        if firstMove:
            # Expected immediate value: move succeeds with moveProb, else stay.
            rewards.append(moveProb * state[nextPosition] + stayProb * posReward)
        else:
            # Bellman backup: immediate reward plus discounted current estimate.
            rewards.append(
                moveProb * (rState[nextPosition] + gamma * state[nextPosition])
                + stayProb * (rState[X_pos, Y_pos] + gamma * posReward)
            )
    return max(rewards, default=0)
def nextState (state,revState, moveProb, gamma, firstMove, getActions):
    """One full sweep of value iteration.

    Takes the current V(s) (state), the base rewards (revState), the move
    probability, gamma and the firstMove flag, and returns the next V(s):
    every cell is replaced by its best expected reward.
    """
    rows, cols = state.shape
    nState = np.zeros(state.shape)
    for row in range(rows):
        for col in range(cols):
            nState[row][col] = getBestReward(
                state, revState, row, col, gamma, moveProb, firstMove, getActions
            )
    return nState
def loop (revState, moveProb, gamma, epsilon, getActions):
    """Run value iteration on the reward grid until convergence.

    revState   -- base reward grid (numpy array)
    moveProb   -- probability the chosen move succeeds
    gamma      -- discount factor
    epsilon    -- convergence threshold
    getActions -- callable generating the reachable positions per cell

    Prints the first and converged states and returns the converged state.

    Bug fixes: the original stopped as soon as ANY single cell changed by
    less than epsilon (it must be ALL cells), and it compared against
    abs(oldState[x][y]), which is wrong for negative values.  Also,
    revState[:] on an ndarray is a view, not a copy, so no copy was made.
    """
    print('First state:')
    print(revState)
    # first state needs different input, therefore outside the loop
    state = nextState(revState, revState, moveProb, gamma, True, getActions)
    oldState = state
    # iteration loop: stop only once the largest per-cell change < epsilon
    while True:
        state = nextState(state, revState, moveProb, gamma, False, getActions)
        if np.max(np.abs(state - oldState)) < epsilon:
            break
        oldState = state
    print('Converged state:')
    print(state)
    return state
#init statements,
# Grid world: zero reward everywhere except +10 in the centre cell.
revState = np.array([[0,0,0],[0,10,0],[0,0,0]])
# Convergence threshold for value iteration.
epsilon = 0.1
# Probability that the chosen move actually happens (agent stays put otherwise).
moveProb = 0.8
# Discount factor.
gamma = 0.9
loop( revState, moveProb, gamma, epsilon, getActionsMatrix )
First state:
[[ 0 0 0]
[ 0 10 0]
[ 0 0 0]]
Converged state:
[[44.7917635 51.12689178 44.7917635 ]
[51.12689178 47.23078789 51.12689178]
[44.7917635 51.12689178 44.7917635 ]]
#Note. You need to install gym! Sometimes difficult on Windows. Google for advice.
!pip install gym==0.7.4
!pip install gym-legacy-toytext
import gym
import numpy as np
import random
import math
import gym_toytext
Collecting gym==0.7.4
Downloading gym-0.7.4.tar.gz (152 kB)
|████████████████████████████████| 152 kB 3.5 MB/s
Requirement already satisfied: numpy>=1.10.4 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from gym==0.7.4) (1.19.5)
Requirement already satisfied: requests>=2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from gym==0.7.4) (2.27.1)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from gym==0.7.4) (1.16.0)
Collecting pyglet>=1.2.0
Downloading pyglet-1.5.21-py3-none-any.whl (1.1 MB)
|████████████████████████████████| 1.1 MB 9.8 MB/s
Requirement already satisfied: certifi>=2017.4.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from requests>=2.0->gym==0.7.4) (2021.10.8)
Requirement already satisfied: charset-normalizer~=2.0.0; python_version >= "3" in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from requests>=2.0->gym==0.7.4) (2.0.12)
Requirement already satisfied: idna<4,>=2.5; python_version >= "3" in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from requests>=2.0->gym==0.7.4) (3.3)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from requests>=2.0->gym==0.7.4) (1.26.8)
Building wheels for collected packages: gym
Building wheel for gym (setup.py) ... done
Created wheel for gym: filename=gym-0.7.4-py3-none-any.whl size=204693 sha256=2d48c5bdfc30e53b74608febcf59a4cef1e47d1ae5317b5437fbbb08980a5b2c
Stored in directory: /root/.cache/pip/wheels/28/9f/e8/5acdfe381cbae5bac0aafe5cf17603c6d1386952b2967331dd
Successfully built gym
Installing collected packages: pyglet, gym
Successfully installed gym-0.7.4 pyglet-1.5.21
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
Collecting gym-legacy-toytext
Downloading gym_legacy_toytext-0.0.1-py3-none-any.whl (9.9 kB)
Collecting gym>=0.19.0
Downloading gym-0.22.0.tar.gz (631 kB)
|████████████████████████████████| 631 kB 15.8 MB/s
Installing build dependencies ... done
Getting requirements to build wheel ... done
Preparing wheel metadata ... done
Requirement already satisfied: importlib-metadata>=4.10.0; python_version < "3.10" in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from gym>=0.19.0->gym-legacy-toytext) (4.11.0)
Collecting cloudpickle>=1.2.0
Downloading cloudpickle-2.0.0-py3-none-any.whl (25 kB)
Collecting gym-notices>=0.0.4
Downloading gym_notices-0.0.4-py3-none-any.whl (2.7 kB)
Requirement already satisfied: numpy>=1.18.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from gym>=0.19.0->gym-legacy-toytext) (1.19.5)
Requirement already satisfied: typing-extensions>=3.6.4; python_version < "3.8" in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=4.10.0; python_version < "3.10"->gym>=0.19.0->gym-legacy-toytext) (4.1.1)
Requirement already satisfied: zipp>=0.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from importlib-metadata>=4.10.0; python_version < "3.10"->gym>=0.19.0->gym-legacy-toytext) (3.7.0)
Building wheels for collected packages: gym
Building wheel for gym (PEP 517) ... done
Created wheel for gym: filename=gym-0.22.0-py3-none-any.whl size=708396 sha256=4d2399ded310e54ea554c7b698f79c513aa5391b2dd7dc9e50ac54a368f94787
Stored in directory: /root/.cache/pip/wheels/7d/5e/87/7d50e0179edda70feff5bba05c381041e1c1fd80c6b06a4cc3
Successfully built gym
Installing collected packages: cloudpickle, gym-notices, gym, gym-legacy-toytext
Attempting uninstall: gym
Found existing installation: gym 0.7.4
Uninstalling gym-0.7.4:
Successfully uninstalled gym-0.7.4
Successfully installed cloudpickle-2.0.0 gym-0.22.0 gym-legacy-toytext-0.0.1 gym-notices-0.0.4
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
#Training the Q-table
#Code is partly taken from q_learning_frozen_lake.ipynb
#Inspiration with average calculation is taken from
#https://towardsdatascience.com/q-learning-algorithm-from-explanation-to-implementation-cdbeda2ea187
def q_learn(env, num_episodes,max_iter_episode, gamma, learning_rate, epsilon):
    """Train a tabular Q-table with epsilon-greedy Q-learning.

    env              -- gym-style environment (observation_space.n states,
                        action_space.n actions, reset()/step() old API)
    num_episodes     -- number of training episodes
    max_iter_episode -- cap on steps per episode (bug fix: this parameter was
                        accepted but previously ignored, so an environment
                        that never sets done would loop forever)
    gamma            -- discount factor
    learning_rate    -- step size (alpha)
    epsilon          -- exploration probability

    Returns the learned Q table of shape (n_states, n_actions).
    """
    # initialize the Q table [state, action], this case [5,2]
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states, n_actions))
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        steps = 0
        while not done and steps < max_iter_episode:
            steps += 1
            # First we select an action:
            if random.uniform(0, 1) < epsilon:  # Flip a skewed coin
                action = env.action_space.sample()  # Explore action space
            else:
                action = np.argmax(Q[state,:])  # Exploit learned values
            # Then we perform the action and receive the feedback from the environment
            new_state, reward, done, info = env.step(action)
            # Finally we learn from the experience by updating the Q-value of the selected action
            update = reward + (gamma*np.max(Q[new_state,:])) - Q[state, action]
            Q[state,action] += learning_rate*update
            state = new_state
    return Q
# Hyper-parameters and training run for the NChain environment.
env = gym.make("NChain-v0")
num_episodes = 10000
# Safety cap on steps per episode.  Bug fix: max_iter_episode was never
# defined, so the q_learn call below raised NameError.
max_iter_episode = 1000
gamma = 0.95
learning_rate = 0.1 #(in the description alpha = 0.1, so learning rate = 0.1)
epsilon = 0.1
#Used when num_episodes is very big
Q = q_learn(env, num_episodes,max_iter_episode, gamma, learning_rate, epsilon)
print(Q)
#Used for smaller num_episodes
# NOTE(review): deliberately disabled variant (kept as a no-op string literal).
# It averages the Q-tables from five independent training runs to smooth out
# the stochastic training noise visible with small num_episodes.
'''
Q_list = []
for i in range(5):
Q_list.append(q_learn(env, num_episodes,max_iter_episode, gamma, learning_rate, epsilon))
avg_Q = np.zeros([5, 2])
for q in Q_list:
avg_Q+=q
avg_Q /= len(Q_list)
print(avg_Q)
'''
[[55.41016607 54.17389858]
[57.84299124 55.55706674]
[59.73528571 56.40702823]
[64.92159004 58.68673488]
[70.08993991 65.59943775]]
#Question 4a
def getActionsChain (State, Y_pos, X_pos):
    """Return the positions reachable from column X_pos of the 1-row chain.

    The agent can always jump back to the start [0, 0]; if not already at
    the last column it can also advance one cell to the right.
    NOTE(review): the caller passes (state, row, col), so here Y_pos receives
    the row index and X_pos the column index — confusing but consistent.
    """
    last_col = State.shape[1] - 1
    moves = [[0, 0]]
    if X_pos < last_col:
        moves.append([0, X_pos + 1])
    return moves
# Chain of 5 cells: small reward (2) at the start, big reward (10) at the end.
revState =np.array( [[2,0,0,0,10]])
# Convergence threshold for value iteration.
epsilon = 0.1
# Discount factor.
gamma = 0.95
moveProb = 0.9 # will make the best move 90% of the time
loop(revState, moveProb,gamma, epsilon, getActionsChain)
First state:
[[ 2 0 0 0 10]]
Converged state:
[[42.87286494 45.25607917 48.01302259 50.93153631 43.49417941]]