#Task 2
import numpy as np
#returns a list of the possible next positions (neighbouring cells) from the current position
def getActionsMatrix(State, X_pos, Y_pos):
    #possible moves are WEST, NORTH, EAST and SOUTH, but only those that stay inside the grid
    nextPos = []
    if(X_pos-1 >= 0):
        nextPos.append([X_pos-1,Y_pos])
    
    if(X_pos+1 <= State.shape[0]-1):
        nextPos.append([X_pos+1,Y_pos])
    if(Y_pos-1 >= 0):
        nextPos.append([X_pos,Y_pos-1])
    if(Y_pos+1 <= State.shape[1]-1):
        nextPos.append([X_pos,Y_pos+1])
         
    return nextPos
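#quick sanity check (illustrative only): a corner cell of the 3x3 grid has exactly two neighbours
#print(getActionsMatrix(np.zeros((3,3)), 0, 0)) #expected output: [[1, 0], [0, 1]]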
#takes the environment and a position and returns the best achievable reward at that state for this iteration
def getBestReward(state, rState, X_pos, Y_pos, gamma, moveProb, firstMove, getActions):
    stayProb = 1-moveProb
    posReward = state[X_pos,Y_pos]
    bestReward = 0
    
    nextPos = getActions(state, X_pos, Y_pos)
    for p in range (0,len(nextPos)):
        nextPosition = (nextPos[p][0],nextPos[p][1])
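        #Bellman-style backup: with probability moveProb the move succeeds and the neighbour's
        #value is collected; with probability stayProb the agent stays where it is.
        #After the first sweep the immediate reward (rState) is added to the discounted old value.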
        if (firstMove):
            reward = moveProb*(state[nextPosition]) + stayProb*(posReward)
        else:
            reward = moveProb*(rState[nextPosition] + gamma*state[nextPosition]) + stayProb*(rState[X_pos,Y_pos] + gamma*posReward)
        if (reward > bestReward):
            bestReward = reward
    return bestReward
#takes current V(s), base reward (revState), gamma and boolean for firstMove
#returns the nextState
def nextState(state, revState, moveProb, gamma, firstMove, getActions):
    nState = np.zeros(state.shape)
    for X_pos in range (0, state.shape[0]):
        for Y_pos in range (0, state.shape[1]):
            nState[X_pos][Y_pos] = getBestReward(state,revState, X_pos, Y_pos, gamma, moveProb, firstMove, getActions)
    return nState
def loop(revState, moveProb, gamma, epsilon, getActions):
    cont = True
    oldState = revState.copy() #use a real copy instead of a slice view of the reward array
    state = revState.copy()
    print('First state:')
    print(revState)
    #the first sweep needs different input, therefore it is done outside the loop
    state = nextState(state, revState, moveProb, gamma, True, getActions)
    #iteration loop
    while cont:
        state = nextState(state, revState, moveProb, gamma, False, getActions)
        #only stop once every cell has changed by less than epsilon
        cont = False
        for x in range(0, state.shape[0]):
            for y in range(0, state.shape[1]):
                if abs(state[x][y] - oldState[x][y]) >= epsilon:
                    cont = True
        oldState = state
        #print(state)
    print('Converged state:')
    print(state)
#init statements for the 3x3 grid world
revState = np.array([[0,0,0],[0,10,0],[0,0,0]])
epsilon = 0.1
moveProb = 0.8
gamma = 0.9
loop(revState, moveProb, gamma, epsilon, getActionsMatrix)
#Note: you need to install gym! This is sometimes difficult on Windows; search online for advice.
!pip install gym==0.7.4
!pip install gym-legacy-toytext
import gym
import numpy as np
import random
import math
import gym_toytext 
#Training the Q-table
#Code is partly taken from q_learning_frozen_lake.ipynb
#Inspiration with average calculation is taken from
#https://towardsdatascience.com/q-learning-algorithm-from-explanation-to-implementation-cdbeda2ea187
def q_learn(env, num_episodes, max_iter_episode, gamma, learning_rate, epsilon):
    # initialize the Q table [state, action], this case [5,2]
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    Q = np.zeros((n_states,n_actions))
    #Alternative way of writing: Q = np.zeros([5, 2])
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        steps = 0
        # Run until the environment signals done or the per-episode step cap is reached
        while not done and steps < max_iter_episode:
            # First we select an action:
            if random.uniform(0, 1) < epsilon: # Flip a skewed coin
                action = env.action_space.sample() # Explore action space
            else:
                action = np.argmax(Q[state,:]) # Exploit learned values
            # Then we perform the action and receive the feedback from the environment
            new_state, reward, done, info = env.step(action)
            # Finally we learn from the experience by updating the Q-value of the selected action
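            # Q(s,a) <- Q(s,a) + alpha * (reward + gamma * max_a' Q(s',a') - Q(s,a))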
            update = reward + (gamma*np.max(Q[new_state,:])) - Q[state, action]
            Q[state,action] += learning_rate*update 
            state = new_state
            steps += 1
    
    return Q
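#small illustrative helper (hypothetical, not part of the assignment code): the greedy policy
#implied by a learned Q-table is simply the argmax over the action dimension for each state
def greedy_policy(Q):
    return np.argmax(Q, axis=1) #one best action index per state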
env = gym.make("NChain-v0")
num_episodes = 10000
gamma = 0.95
learning_rate = 0.1 #(in the description alpha = 0.1, so learning rate = 0.1)
epsilon = 0.1
max_iter_episode = 1000 #maximum number of steps per episode (assumed value)
#Used when num_episodes is very big
Q = q_learn(env, num_episodes, max_iter_episode, gamma, learning_rate, epsilon)
print(Q)
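#optional: inspect the greedy policy implied by the learned Q-values (uses the helper sketched above)
#print(greedy_policy(Q))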
#Used for smaller num_episodes
'''
Q_list = []
for i in range(5):
    Q_list.append(q_learn(env, num_episodes,max_iter_episode, gamma, learning_rate, epsilon))
avg_Q = np.zeros([5, 2])
for q in Q_list:
    avg_Q+=q
avg_Q /= len(Q_list)
print(avg_Q)
'''
#Question 4a
#possible moves in the chain: go back to the start of the chain or move one step forward
def getActionsChain(State, X_pos, Y_pos):
    #list of possible next positions; the chain is stored as a 1 x 5 array, so the row index is always 0
    #(parameters follow the same (row, column) order that getBestReward uses when calling getActions)
    nextPos = []
    #'backward' action: return to the start of the chain
    nextPos.append([0, 0])
    #'forward' action: advance one position, unless already at the end of the chain
    if(Y_pos+1 <= State.shape[1]-1):
        nextPos.append([0, Y_pos+1])
    return nextPos
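#sanity check (illustrative only): from the last chain position the only remaining move is back to the start
#print(getActionsChain(np.zeros((1,5)), 0, 4)) #expected output: [[0, 0]]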
revState = np.array([[2,0,0,0,10]])
epsilon = 0.1
gamma = 0.95
moveProb = 0.9 # will make the best move 90% of the time
loop(revState, moveProb, gamma, epsilon, getActionsChain)