import random
import numpy as np
from collections import defaultdict, deque
from gym import Env, spaces
from keras.models import Sequential
from keras.layers import Dense
from keras.losses import Huber
from keras.initializers import Zeros
from keras.optimizers import Adam
# Words
with open("words.txt", "r") as f:
    WORDS = f.read().splitlines()
WORDS_N = len(WORDS)
# Hyperparameters
ALPHA = 0.002            # learning rate for the Adam optimizer
EPSILON = 0.9            # initial exploration rate for the epsilon-greedy policy
EPSILON_DECAY = 0.99995  # multiplicative decay applied to epsilon after every action
EPSILON_MIN = 0.1        # lower bound on the exploration rate
GAMMA = 0.95             # discount factor for future rewards
# DQN Configuration
BATCH_SIZE = 64          # number of experiences sampled per replay update
MAX_MEMORY = 100000      # maximum number of transitions kept in replay memory
# Agent
class Agent:
def __init__(self, env):
# Wordle environment
self.env = env
# Wordle observation space dimensions
self.dimensions = env.observation_space.shape
# Hyperparameters
self.alpha = ALPHA
self.epsilon = EPSILON
self.epsilon_decay = EPSILON_DECAY
self.epsilon_min = EPSILON_MIN
self.gamma = GAMMA
# Batch replay size
self.batch_size = BATCH_SIZE
# Batch replay memory
self.memory = deque(maxlen=MAX_MEMORY)
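        # A bounded deque: once MAX_MEMORY transitions are stored, the oldest are discarded.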
# Training model
self.q = self.get_model()
# Prediction model
self.target_q = self.get_model()
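        # The target network is a periodically synced copy of the training network; predicting
        # from this slower-moving copy keeps the bootstrapped Q-targets from chasing an estimate
        # that changes on every gradient step.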
    # This builds the Q-network: a two-hidden-layer MLP mapping the 26-dimensional letter state to one Q-value per candidate word.
def get_model(self):
model = Sequential()
model.add(Dense(512, input_dim=self.dimensions[1], activation="relu", bias_initializer=Zeros()))
model.add(Dense(512, activation="relu", bias_initializer=Zeros()))
model.add(Dense(self.env.action_space.n, activation="linear"))
        model.compile(loss=Huber(), optimizer=Adam(learning_rate=self.alpha))
return model
# This returns a random action with epsilon probability.
def get_epsilon_action(self, state):
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        return np.argmax(self.target_q.predict(state.reshape(self.dimensions))[0])
# This records an experience.
def remember(self, state, action, reward, state_, done):
self.memory.append((state, action, reward, state_, done))
# This copies the weights from the trained model to the prediction model.
def target_train(self):
        self.target_q.set_weights(self.q.get_weights())
# This fits the training model according to a random subset of experience.
def replay(self):
if len(self.memory) >= self.batch_size:
samples = random.sample(self.memory, self.batch_size)
x = []
y = []
for sample in samples:
state, action, reward, state_, done = sample
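                # Bellman target: r for terminal transitions, otherwise r + gamma * max_a' Q_target(s', a').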
target = self.target_q.predict(state.reshape(self.dimensions))
if done:
target[0][action] = reward
else:
target[0][action] = reward + self.gamma * max(self.target_q.predict(state_.reshape(self.dimensions))[0])
x.append(state)
                y.append(target[0])
self.q.fit(np.array(x), np.array(y), batch_size=self.batch_size, verbose=0)
# This is the main training loop in which the agent plays multiple games of Wordle.
    def train(self, episodes):
        self.env.set_explore()
        step = 0
        for i in range(episodes):
            rewards = 0
            state = self.env.reset()
            done = False
            while not done:
                action = self.get_epsilon_action(state)
                state_, reward, done, info = self.env.step(action)
                self.remember(state, action, reward, state_, done)
                step += 1
                # The target (prediction) network is synced every 100 environment steps.
                if step % 100 == 0:
                    self.target_train()
                # The agent replays memory every 4 environment steps.
                if step % 4 == 0:
                    self.replay()
                rewards += reward
                state = state_
            print("Training episode {} | total reward: {}".format(i, rewards))
# This returns the optimal action.
def get_optimal_action(self, state):
return np.argmax(self.target_q.predict(state.reshape(self.dimensions))[0])
# This saves the weights of the neural networks.
def save(self, f, target_f):
self.q.save_weights(f)
self.target_q.save_weights(target_f)
# This loads the weights of the neural networks from a file.
def load(self, f, target_f):
self.q.load_weights(f)
self.target_q.load_weights(target_f)
# Environment
class Wordle(Env):
def __init__(self):
self.words = WORDS
self.action_space = spaces.Discrete(len(WORDS))
self.observation_space = spaces.Box(low=0, high=7, shape=(1, 26), dtype=np.int8)
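        # One discrete action per candidate word; the observation is a 26-slot vector recording
        # what is known about each letter (see the encoding legend in _step, values 0-7).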
self.test = False
self.prev_answer = self.words[np.random.randint(0, len(self.words))]
# This resets the environment, chooses a new Wordle solution, and returns the initial game state.
def reset(self):
answer = self.words[np.random.randint(0, len(self.words))]
        # In test mode, always use a newly drawn solution; in training mode, switch to a new
        # solution with probability 0.1 and otherwise keep the previous answer.
        self.answer = answer if self.test else answer if np.random.random() < 0.1 else self.prev_answer
self.prev_answer = self.answer
self.state = np.zeros(26, dtype=np.int8)
self.counter = 0
self.success = set()
self.used = set()
return np.copy(self.state)
# This takes one step in the Wordle game with guess `action`, and returns the new state, reward, done status, and information.
def step(self, action):
reward = self._get_reward(action)
self._step(action)
done = self._get_done()
info = self._get_info()
return np.copy(self.state), reward, done, info
    # Not implemented; closing the environment would tear down the game display.
    def close(self):
        pass
    # Not implemented; rendering would draw the current game board.
    def render(self):
        pass
# This is a helper function for taking a step in the game.
def _step(self, action):
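        # Letter-state encoding: 0 = letter not yet guessed, 1 = guessed but absent from the
        # answer, 2 = present in the answer at an unconfirmed position, i + 3 = confirmed at
        # position i (values 3 through 7).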
for i, c in enumerate(self.words[action]):
a = ord(c) - ord("a")
if self.state[a] == 0:
self.state[a] = 2 if c in self.answer else 1
if self.state[a] == 2 and self.answer[i] == c:
self.state[a] = i + 3
self.success.add(i)
self.counter += 1
self.used.add(action)
# This calculates the reward given guess `action`.
def _get_reward(self, action):
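        # Reward shaping: +1000 for guessing the answer, -500 for repeating a previous guess,
        # a -1000 base penalty on a failed final (6th) guess, plus per-letter bonuses and
        # penalties that depend on what the accumulated letter state already reveals.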
word = self.words[action]
if word == self.answer:
return 1000
        if action in self.used:
return -500
reward = -1000 if self.counter == 5 else 0
for i, c in enumerate(word):
a = ord(c) - ord("a")
if self.state[a] >= 3:
reward += 100 if self.answer[i] == c else -100
elif self.state[a] == 1:
reward += -100
else:
reward += 100 if self.answer[i] == c else 50 if c in self.answer else 0
return reward
# This determines whether the game is over.
def _get_done(self):
return self.counter == 6 or len(self.success) == 5
# This returns metadata about the current game.
def _get_info(self):
return {
"answer": self.answer,
"state": self.state,
"counter": self.counter,
"success": self.success,
"used": self.used
}
# This sets the environment to testing.
def set_test(self):
self.test = True
# This sets the environment to training.
def set_explore(self):
self.test = False
# Initialization
env = Wordle()
agent = Agent(env)
# Training
agent.train(10000)
# This simulates n games of Wordle in testing mode.
def simulate(agent, n):
agent.env.set_test()
for _ in range(n):
data = []
state = agent.env.reset()
done = False
while not done:
action = agent.get_optimal_action(state)
state_, reward, done, info = agent.env.step(action)
data.append((state, agent.env.words[action], reward, state_, done, info))
state = state_
print(data)
# Testing
simulate(agent, 1)
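# Optionally persist the trained weights for later reuse; the file names here are arbitrary
# placeholders.
# agent.save("wordle_q.h5", "wordle_target_q.h5")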