import numpy as np
from numpy.random import *
import matplotlib.pyplot as plt
import argparse
# Global experiment configuration, shared by the functions below.
# A plain Namespace is used as the attribute container: nothing is parsed from
# the command line, so an ArgumentParser instance is not needed.
args = argparse.Namespace()
args.num_of_timestep = 1000  # steps per run
args.num_of_runstep = 2000  # independent runs that get averaged
args.num_of_action = 10  # number of bandit arms
args.noise = 10  # scale applied to the randomly drawn arm means
# 1-based timestep indices 1..num_of_timestep (exact integers, no float rounding).
args.timestep = np.arange(1, args.num_of_timestep + 1, dtype=int)
# Mean reward of every arm.
args.reward = args.noise * randn(args.num_of_action)
# Per-timestep reward perturbation; it is indexed by timestep (t - 1), so it
# needs one entry per timestep rather than one per run.
args.noise_reward = randn(args.num_of_timestep)
args.epsilon = [1e-2, 1e-1, 0]  # 0 is greedy selection
def greedy(x):
    """Return the index of the largest entry of ``x``, breaking ties at random.

    When a single entry attains the maximum its index is returned directly;
    when several do, one of their indices is drawn with ``choice``.
    """
    is_best = x == np.max(x)
    candidates = np.argwhere(is_best).squeeze().tolist()
    # With exactly one maximiser, squeeze() collapses the result to 0-d and
    # tolist() yields a bare int instead of a list.
    if not isinstance(candidates, list):
        return candidates
    return choice(candidates)
def action_value_method(eps, alpha=None, nonstationary=False):
    """Average the per-timestep reward of an epsilon-greedy bandit agent.

    Performs ``args.num_of_runstep`` independent runs over the timesteps in
    ``args.timestep`` against the arm means in ``args.reward``. Every run
    starts from fresh value estimates and pull counts.

    Args:
        eps: Probability of picking a uniformly random arm instead of the
            greedy one. ``0`` gives pure greedy selection.
        alpha: Constant step size for the value update. ``None`` (the
            default) selects the sample-average step ``1 / n``, where ``n``
            is the number of pulls of the chosen arm including the current
            one. Note that ``0`` is a valid constant step, not "unset".
        nonstationary: When true, ``args.noise_reward[t - 1]`` is added to
            the arm's mean reward at timestep ``t``.

    Returns:
        Array of length ``args.num_of_timestep`` holding, for each timestep,
        the observed reward averaged over all runs.
    """
    reward_total = np.zeros(args.num_of_timestep)
    for _ in range(args.num_of_runstep):
        reward_list = np.zeros(args.num_of_action)  # value estimate per arm
        reward_count = np.zeros(args.num_of_action)  # pulls per arm
        for step_idx, t in enumerate(args.timestep):
            # Explore with probability eps, otherwise exploit.
            if eps > random():
                action = randint(args.num_of_action)
            else:
                action = greedy(reward_list)

            if nonstationary:
                reward = args.reward[action] + args.noise_reward[t - 1]
            else:
                reward = args.reward[action]
            reward_obs = reward + randn()  # unit-variance observation noise
            reward_total[step_idx] += reward_obs

            reward_count[action] += 1
            if reward_count[action] == 1:
                # The first observation of an arm initialises its estimate.
                reward_list[action] = reward_obs
            else:
                # Recompute the step size on every update. Storing it back
                # into ``alpha`` would freeze it at its first value for the
                # rest of the simulation.
                if alpha is None:
                    step_size = 1.0 / reward_count[action]
                else:
                    step_size = alpha
                reward_list[action] += step_size * (reward_obs - reward_list[action])
    return reward_total / args.num_of_runstep
# Experiment 1: compare greedy and epsilon-greedy selection with the default
# (count-based) step size and no per-timestep reward perturbation.
g = action_value_method(args.epsilon[-1])  # eps = 0
e2 = action_value_method(args.epsilon[0])  # eps = 0.01
e1 = action_value_method(args.epsilon[1])  # eps = 0.1

# "True" is the mean reward of the best arm; the other lines are the reward
# averaged over all timesteps of the corresponding curve.
print(f"True : {np.max(args.reward):.2f}")
print(f"Greedy : {np.mean(g):.2f}")
print(f"Epsilon=1e-2 : {np.mean(e2):.2f}")
print(f"Epsilon=1e-1 : {np.mean(e1):.2f}")

plt.plot(args.timestep, g, label='Greedy')
plt.plot(args.timestep, e2, label='e=0.01')
plt.plot(args.timestep, e1, label='e=0.1')
plt.legend()
plt.show()

# Experiment 2: with a small per-timestep perturbation added to the rewards,
# compare a fixed step size against the count-based one (both with eps = 0.1).
args.num_of_runstep = 10000
args.noise_reward = 0.01 * randn(args.num_of_timestep)
fixed = action_value_method(args.epsilon[1], alpha=0.1, nonstationary=True)  # eps = 0.1
dependency = action_value_method(args.epsilon[1], nonstationary=True)  # eps = 0.1

plt.plot(args.timestep, fixed, label='Fixed')
plt.plot(args.timestep, dependency, label='Time dep')
plt.legend()
plt.show()