# Ch.2 Bandit

import numpy as np
# The wildcard import supplies the bare names `randn`, `random`, `randint`
# and `choice` that the rest of this script calls without a module prefix.
from numpy.random import *
import matplotlib.pyplot as plt
import argparse

# The parser object is used only as a mutable attribute container for the
# experiment settings; no command-line arguments are parsed in the visible code.
args = argparse.ArgumentParser()

# Experiment configuration, stored as attributes on the shared `args` object.
args.num_of_timestep = 1000  # steps per run
args.num_of_runstep = 2000  # independent runs that are averaged together
args.num_of_action = 10  # number of selectable actions (arms)
args.noise = 10  # scale applied to the randomly drawn true action values

# 1-based step indices 1..num_of_timestep; arange yields exact integers
# without going through a float linspace and an integer cast.
args.timestep = np.arange(1, args.num_of_timestep + 1)

# True (mean) reward of each action.
args.reward = args.noise * randn(args.num_of_action)

# Per-timestep reward offset used in non-stationary mode. It is indexed by
# timestep (t - 1), so it needs one entry per timestep, not one per run.
args.noise_reward = randn(args.num_of_timestep)

args.epsilon = [1e-2, 1e-1, 0]  # 0 is greedy selection

def greedy(x):
    """Return the index of the largest entry of ``x``, breaking ties at random.

    Args:
        x: One-dimensional array-like of values to compare.

    Returns:
        The position of the maximum as a plain ``int``. When several entries
        share the maximum, one of their positions is drawn uniformly with
        ``numpy.random.choice``.

    Raises:
        ValueError: If ``x`` is empty, or if no entry compares equal to the
            maximum (for example when ``x`` contains NaN).
    """
    values = np.asarray(x)
    best = np.flatnonzero(values == np.max(values))
    if best.size == 1:
        # Unique maximum: no random draw is needed (and none is consumed).
        return int(best[0])
    # Ties (or no match, which makes `choice` raise): sample one candidate and
    # normalise the NumPy integer to a built-in int for a consistent return type.
    return int(choice(best))

def action_value_method(eps, alpha=None, nonstationary=False):
    """Simulate epsilon-greedy action-value learning and average the rewards.

    Reads the experiment settings from the module-level ``args`` object
    (``num_of_timestep``, ``num_of_runstep``, ``num_of_action``, ``timestep``,
    ``reward`` and, in non-stationary mode, ``noise_reward``).

    Args:
        eps: Probability of picking a uniformly random action instead of the
            greedy one at each step.
        alpha: Constant step size for the estimate update. When ``None`` (or
            any falsy value, as before) the step size is ``1 / n`` with ``n``
            the number of times the action has been taken, i.e. the estimate
            is the running sample average.
        nonstationary: When true, ``args.noise_reward[t - 1]`` is added to the
            true reward of the chosen action at timestep ``t``.

    Returns:
        Array with one entry per timestep holding the observed reward at that
        step averaged over ``args.num_of_runstep`` runs.
    """
    num_steps = len(args.timestep)
    reward_total = np.zeros(args.num_of_timestep)

    for _ in range(args.num_of_runstep):
        estimates = np.zeros(args.num_of_action)  # current value estimates
        counts = np.zeros(args.num_of_action)  # times each action was taken
        run_rewards = np.empty(num_steps)

        for i, t in enumerate(args.timestep):
            # Explore with probability eps, otherwise exploit the estimates.
            if eps > random():
                action = randint(args.num_of_action)
            else:
                action = greedy(estimates)

            reward = args.reward[action]
            if nonstationary:
                reward = reward + args.noise_reward[t - 1]
            # Observation = true reward plus unit-variance Gaussian noise.
            reward_obs = reward + randn()
            run_rewards[i] = reward_obs

            counts[action] += 1
            if counts[action] == 1:
                # The first observation of an action initialises its estimate.
                estimates[action] = reward_obs
            else:
                # Keep the step size in a local: assigning to `alpha` here
                # would freeze the sample-average step at its first value for
                # the rest of the call. The count already includes this
                # observation, so 1 / count gives the exact running mean.
                step = alpha if alpha else 1.0 / counts[action]
                estimates[action] += step * (reward_obs - estimates[action])

        reward_total += run_rewards

    return reward_total / args.num_of_runstep

# Stationary experiment: run greedy and two epsilon-greedy settings with the
# default step size (alpha=None).
g = action_value_method(args.epsilon[-1])  # eps = 0
e2 = action_value_method(args.epsilon[0])  # eps = 0.01
e1 = action_value_method(args.epsilon[1])  # eps = 0.1

# Largest true action value versus the reward each strategy obtained,
# averaged over all timesteps. The labels state the actual epsilon values
# (1e-2 and 1e-1).
print(f"True : {np.max(args.reward):.2f}")
print(f"Greedy : {np.mean(g):.2f}")
print(f"Epsilon=1e-2 : {np.mean(e2):.2f}")
print(f"Epsilon=1e-1 : {np.mean(e1):.2f}")

# Average reward per timestep for the three strategies.
plt.plot(args.timestep, g, label='Greedy')
plt.plot(args.timestep, e2, label='e=0.01')
plt.plot(args.timestep, e1, label='e=0.1')
plt.legend()
plt.show()

# Non-stationary experiment at eps = 0.1: a constant step size (alpha=0.1)
# against the default step size (alpha=None), averaged over more runs.
args.num_of_runstep = 10000
# Replace the per-timestep reward offsets with small-amplitude ones.
args.noise_reward = randn(args.num_of_timestep) * 0.01

fixed = action_value_method(
    args.epsilon[1], nonstationary=True, alpha=0.1
)  # eps = 0.1
dependency = action_value_method(
    args.epsilon[1], nonstationary=True
)  # eps = 0.1

# Average reward per timestep for both step-size rules.
plt.plot(args.timestep, fixed, label='Fixed')
plt.plot(args.timestep, dependency, label='Time dep')
plt.legend()
plt.show()