import random

def action_x(s):
    prob = random.randint(1, 100)
    if s == 'A':
        # 60% stay in A, 40% move to B
        if prob <= 60:
            s_prime = 'A'
        else:
            s_prime = 'B'
    else:  # B
        # 30% move to A, 70% stay in B
        if prob <= 30:
            s_prime = 'A'
        else:
            s_prime = 'B'
    return s_prime
def action_y(s):
    prob = random.randint(1, 100)
    if s == 'A':
        # 50% stay in A, 50% move to B
        if prob <= 50:
            s_prime = 'A'
        else:
            s_prime = 'B'
    else:  # B
        # 80% move to A, 20% stay in B
        if prob <= 80:
            s_prime = 'A'
        else:
            s_prime = 'B'
    return s_prime
def step(state, action):
    # Here the "action" is the next state already chosen by action_x/action_y,
    # so the transition simply adopts it; being in state 'A' earns reward 1.
    new_state = action
    if new_state == 'A':
        reward = 1
    else:
        reward = 0
    return new_state, reward
# Roll out 100 steps always choosing action_x, starting in state 'A'
# (reward starts at 1 to count the reward for the initial state 'A').
state = 'A'
reward = 1
for i in range(100):
    result = step(state, action_x(state))
    state = result[0]
    reward += result[1]
print(reward)
# Roll out 100 steps always choosing action_y
state = 'A'
reward = 1
for i in range(100):
    result = step(state, action_y(state))
    state = result[0]
    reward += result[1]
print(reward)
# Roll out 100 steps, picking action_x or action_y uniformly at random
state = 'A'
reward = 1
for i in range(100):
    r = random.randint(0, 1)
    if r == 0:
        result = step(state, action_x(state))
    else:
        result = step(state, action_y(state))
    state = result[0]
    reward += result[1]
print(reward)
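
# Optional analytical check (an addition, not part of the original listing):
# each action defines a two-state Markov chain, so the long-run fraction of
# time spent in 'A' -- which equals the expected reward per step -- can be
# read off the chain's stationary distribution. The helper below is a small
# illustrative sketch using numpy.
import numpy as np

def stationary_distribution(P):
    """Solve pi = pi @ P with the entries of pi summing to 1."""
    n = P.shape[0]
    A = np.vstack([P.T - np.eye(n), np.ones(n)])
    b = np.append(np.zeros(n), 1.0)
    return np.linalg.lstsq(A, b, rcond=None)[0]

P_x = np.array([[0.6, 0.4],   # from A: 60% stay, 40% to B
                [0.3, 0.7]])  # from B: 30% to A, 70% stay
P_y = np.array([[0.5, 0.5],   # from A: 50% stay, 50% to B
                [0.8, 0.2]])  # from B: 80% to A, 20% stay

print(stationary_distribution(P_x)[0])  # ~0.43 reward per step under action_x
print(stationary_distribution(P_y)[0])  # ~0.62 reward per step under action_y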
#!pip install gym
import gym
from gym import spaces
import numpy as np

class BedEnv(gym.Env):
    """
    Beds custom environment, built with the gym interface
    """
    metadata = {'render.modes': ['human']}

    def __init__(self):
        super(BedEnv, self).__init__()
        # Three states: 0 = asleep, 1 = awake in bed, 2 = out of bed
        self.observation_space = spaces.Discrete(3)
        # Each state offers exactly one available action
        self.action_space = spaces.Tuple((spaces.Discrete(1), spaces.Discrete(1), spaces.Discrete(1)))
        self.agent_pos = np.random.randint(3)

    def _next_observation(self):
        return {'state': self.agent_pos}
    def step(self, action):
        # Dictionary of functions that return [reward, next_state] transitions for each action
        state_0_transitions = {  # asleep
            0: lambda: [[30, 1], [50, 2]][np.random.choice(2, p=[0.75, 0.25])]  # wake up
        }
        state_1_transitions = {  # awake in bed
            0: lambda: [[5, 0], [20, 2]][np.random.choice(2, p=[0.45, 0.55])]  # turn over
        }
        state_2_transitions = {  # out of bed
            0: lambda: [[0, 0], [10, 1]][np.random.choice(2, p=[0.15, 0.85])]  # get in bed
        }
        reward = None
        next_state = None
        if self.agent_pos == 0:
            # Agent is asleep. Use state 0 transitions
            reward, next_state = state_0_transitions[action]()
        elif self.agent_pos == 1:
            # Agent is awake in bed. Use state 1 transitions
            reward, next_state = state_1_transitions[action]()
        else:
            # Agent is out of bed. Use state 2 transitions
            reward, next_state = state_2_transitions[action]()
        # Transition the agent to the next state
        self.agent_pos = next_state
        # Return the data as defined by the gym interface
        return self._next_observation(), reward, False, {}
    def reset(self):
        # Reset the agent's position
        self.agent_pos = np.random.randint(3)
        return self._next_observation()

    def render(self, mode='human'):
        if mode == 'human':
            pretty_print_state = {
                0: "asleep",
                1: "awake in bed",
                2: "out of bed"
            }
            print('Current State: {}'.format(pretty_print_state[self.agent_pos]))
        else:
            raise NotImplementedError()
env = BedEnv()
curr_state = env.agent_pos
steps = 100

# Human-readable names for the single action available in each state
pretty_print_a_0 = {
    0: "wake up",
}
pretty_print_a_1 = {
    0: "turn over",
}
pretty_print_a_2 = {
    0: "get in bed",
}

reward = 0
for _ in range(steps):
    env.render()
    action = env.action_space[curr_state].sample()  # Sample an action from the current state
    print_a = None
    if curr_state == 0:
        print_a = pretty_print_a_0[action]
    elif curr_state == 1:
        print_a = pretty_print_a_1[action]
    else:
        print_a = pretty_print_a_2[action]
    print("Action taken: {}".format(print_a))
    obs, new_reward, _, _ = env.step(action)  # Execute a step, get observation and reward
    curr_state = obs['state']
    reward += new_reward
    print("Cumulative reward obtained: {}".format(reward))
    print("===========================")