Let's solve Wordle with Python!
First, let's download the English words dataset from NLTK:
# Fetch the NLTK "words" corpus into a local directory and load it.
import nltk
# Add the download directory to NLTK's data search path.
nltk.data.path.append('/work/words')
nltk.download('words', download_dir='/work/words')
# Imported after the download step above has run.
from nltk.corpus import words
word_list = words.words()
# Number of entries in the corpus; the original run printed 236736.
len(word_list)
We're only interested in 5-letter words:
# Keep the 5-letter entries, lower-cased. Lower-casing can make distinct
# corpus entries collide (e.g. a capitalised and a lower-case spelling), so
# de-duplicate while keeping first-seen order; duplicates would otherwise
# inflate the counts and show up repeatedly in the rankings.
words_five = list(dict.fromkeys(
    word.lower() for word in word_list if len(word) == 5
))
len(words_five)
Next, we can rank our options by how common their letters are in English, using the letter frequencies from https://en.wikipedia.org/wiki/Letter_frequency :
# Relative frequency of each letter in English text, most common first.
letter_frequencies = {
    "e": 0.13, "t": 0.091, "a": 0.082, "o": 0.075, "i": 0.07,
    "n": 0.067, "s": 0.063, "h": 0.061, "r": 0.06, "d": 0.043,
    "l": 0.04, "c": 0.028, "u": 0.028, "m": 0.025, "w": 0.024,
    "f": 0.022, "g": 0.02, "y": 0.02, "p": 0.019, "b": 0.015,
    "v": 0.0098, "k": 0.0077, "j": 0.0015, "x": 0.0015,
    "q": 0.00095, "z": 0.00074,
}
Now, let's calculate which word gives us the best chance of hitting letters:
# Worked example: score a single word by hand.
example_word = words_five[320]
example_word
# Use the set of letters so that a repeated letter is only counted once.
letters_in_word = list(set(example_word))
letters_in_word
word_letter_frequencies = [letter_frequencies[letter] for letter in letters_in_word]
# Named example_score rather than word_score so that re-running this cell
# cannot overwrite the word_score() function defined further down.
example_score = sum(word_letter_frequencies)
example_score
# now let's put the above into a function
def word_score(word, frequencies=None):
    """Score a word by the summed frequency of its distinct letters.

    Each distinct letter is counted once, so words with repeated letters
    score lower than words that cover more of the alphabet.

    Args:
        word: The word to score. Matching is case-insensitive.
        frequencies: Optional mapping of lower-case letter -> frequency.
            Defaults to the module-level ``letter_frequencies`` table.

    Returns:
        The sum of the frequencies of the distinct letters in ``word``.
        Characters missing from the table (hyphens, digits, ...) add 0
        instead of raising ``KeyError``.
    """
    if frequencies is None:
        # Looked up at call time so the table can be defined or replaced
        # independently of this function.
        frequencies = letter_frequencies
    return sum(frequencies.get(letter, 0) for letter in set(word.lower()))
# Try the function on one sample word from the list.
word_score(words_five[4])
Now let's see which word has the highest score – this one should be our first Wordle guess:
# Highest-scoring 5-letter word according to word_score.
max(words_five, key=word_score)
# Score of 'atone' for reference.
word_score('atone')
Okay, so let's start with ATONE:
Now let's keep only those words that have O in the third place and don't contain any of the letters A, T, N or E:
def second_word_filter(word):
    """Return True if ``word`` is still possible after the first guess.

    A word passes when its third letter is 'o' and it contains none of the
    letters 'a', 't', 'n' or 'e'.

    Args:
        word: Lower-case candidate word.

    Returns:
        True if the word satisfies both constraints, False otherwise.
        Words too short to have a third letter are rejected rather than
        raising ``IndexError``.
    """
    if len(word) < 3 or word[2] != 'o':
        return False
    # None of the excluded letters may appear anywhere in the word.
    return set('atne').isdisjoint(word)
# Candidates that pass second_word_filter.
second_words = list(filter(second_word_filter, words_five))
len(second_words)
# Highest-scoring remaining candidate.
max(second_words, key=word_score)
Now let's also add a filter for words that DO contain the letter R, but don't have it in the fifth spot (and that contain none of C, H or I):
def third_word_filter(word):
    """Return True if ``word`` satisfies the third-round constraints.

    A word passes when all of the following hold:
      * its fifth letter is not 'r',
      * its third letter is 'o',
      * it contains none of 'c', 'h' or 'i',
      * it contains an 'r' somewhere.

    Args:
        word: Lower-case candidate word.

    Returns:
        True if every constraint holds, False otherwise. Words with fewer
        than five letters are rejected rather than raising ``IndexError``.
    """
    if len(word) < 5:
        return False
    if word[4] == 'r' or word[2] != 'o':
        return False
    letters = set(word)
    if not letters.isdisjoint('chi'):
        return False
    return 'r' in letters
# Narrow the previous candidates with third_word_filter.
third_words = list(filter(third_word_filter, second_words))
len(third_words)
# Ten best-scoring candidates, highest score first.
sorted(third_words, key=word_score)[::-1][:10]
And now we just rinse and repeat:
def fourth_word_filter(word):
    """Return True if ``word`` satisfies the fourth-round constraints.

    A word passes when all of the following hold:
      * its fourth letter is not 'r',
      * it contains none of 's', 'w' or 'd',
      * it contains an 'r' somewhere.

    Args:
        word: Lower-case candidate word.

    Returns:
        True if every constraint holds, False otherwise. Words too short
        to have a fourth letter are rejected rather than raising
        ``IndexError``.
    """
    if len(word) < 4 or word[3] == 'r':
        return False
    letters = set(word)
    if not letters.isdisjoint('swd'):
        return False
    return 'r' in letters
# Narrow the previous candidates with fourth_word_filter.
fourth_words = list(filter(fourth_word_filter, third_words))
len(fourth_words)
# Ten best-scoring candidates, highest score first.
sorted(fourth_words, key=word_score)[::-1][:10]
GROUF doesn't sound like a word so let's go with GROUP :D
def fifth_word_filter(word):
    """Return True if ``word`` satisfies the fifth-round constraints.

    A word passes when all of the following hold:
      * its fifth letter is not 'p',
      * its second letter is 'r',
      * its third letter is 'o',
      * it contains neither 'g' nor 'u',
      * it contains a 'p' somewhere.

    Args:
        word: Lower-case candidate word.

    Returns:
        True if every constraint holds, False otherwise. Words with fewer
        than five letters are rejected rather than raising ``IndexError``.
    """
    if len(word) < 5:
        return False
    if word[4] == 'p' or word[1] != 'r' or word[2] != 'o':
        return False
    letters = set(word)
    if not letters.isdisjoint('gu'):
        return False
    return 'p' in letters
# Narrow the previous candidates with fifth_word_filter.
fifth_words = list(filter(fifth_word_filter, fourth_words))
len(fifth_words)
# Show every remaining candidate.
fifth_words