import numpy as np
import numpy.random as random
from numpy.fft import fft
from scipy.io import wavfile
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
sns.set()
sns.set(font_scale=1.5)
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [1,2] # change here to load more digits but Aneel said to wait until Q4.
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
    # iterate over digits
    for d in digits:
        if filename.startswith(str(d)+'_'):
            wav = wavfile.read(data_dir+filename)[1]
            # keep mono recordings only
            if len(wav.shape)<2:
                signals[d].append(wav)
                file_names[d].append(filename)
# find maximum of vector length
N = max([len(v) for d in digits for v in signals[d]])
# next we split our dataset into train and test sets
# using an 80/20 random split.
# create train/test split
ix = np.arange(100)
random.shuffle(ix)
# select train entries
ix_train = ix[:80]
# select test entries
ix_test = ix[80:]
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8kHz
Ts = 1.0/8000
ix_cut = int(np.ceil(1500*Ts*N))
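# Optional sanity check (a sketch, not part of the assignment): bin k of an N-point FFT of a
# signal sampled every Ts seconds sits at k/(N*Ts) Hz, so bin ix_cut should land just above 1.5 kHz.
check_freqs = np.fft.fftfreq(N, d=Ts)
print(check_freqs[ix_cut])  # expected to be roughly 1500 Hz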
# initialize dictionary for storing transforms
transforms = {d:[] for d in digits}
# initialize dictionary for storing mean transforms
mean_transforms = {d:[] for d in digits}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
transform_norms = {d:[] for d in digits}
x_bar = {d:[] for d in digits}
for d in digits:
    for index in ix_train:
        transforms[d].append(fft(signals[d][index], n=N)[:ix_cut+1])
    transform_norms[d] = np.abs(transforms[d])
    for k in range(ix_cut+1):
        running_sum = 0
        for sample in transform_norms[d]:
            running_sum += sample[k]
        x_bar[d].append(running_sum/len(ix_train))
    for k in range(ix_cut+1):
        mean_transforms[d].append(x_bar[d][k]/np.linalg.norm(x_bar[d]))
    assert(len(mean_transforms[d]) == ix_cut+1)
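# For reference only, a vectorized sketch of the same computation (not used for the results
# below; mean_spectrum is an illustrative name and it assumes signals, ix_train, N and ix_cut
# as defined above). It should agree with mean_transforms[d] up to floating-point error.
def mean_spectrum(d):
    # stack the training spectra for digit d, average the magnitudes, normalize to unit norm
    mags = np.abs(np.array([fft(signals[d][i], n=N)[:ix_cut+1] for i in ix_train]))
    avg = mags.mean(axis=0)
    return avg / np.linalg.norm(avg)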
# In this next part, plot the average spectral magnitude of each digit.
# Code Solution to Q2 here
xf = np.linspace(0, 1500, ix_cut+1)
for d in digits:
    yf = mean_transforms[d]
    plt.plot(xf, yf, label="digit " + str(d))
plt.xlabel("Hz")
plt.ylabel("normalized spectral magnitude")
plt.legend()
plt.show()
# classifier function
# receives a sample vector, computes the inner product of its spectrum with each digit's
# average spectrum, and returns the digit with the largest inner product
# Input: sample x (vector)
def mean_classifier(x):
    # Code Q3a Here
    X = fft(x, n=N)[:ix_cut+1]
    max_digit = 0
    max_sum = 0
    for d in digits:
        running_sum = 0
        for k in range(ix_cut+1):
            running_sum += abs(X[k]) * abs(mean_transforms[d][k])
        # keep the digit with the largest inner product (max_sum == 0 handles the first digit)
        if(max_sum == 0 or running_sum > max_sum):
            max_sum = running_sum
            max_digit = d
    return max_digit
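# For reference only, the same decision rule written with a dot product instead of explicit
# loops (a sketch; mean_classifier_dot is an illustrative name and it assumes mean_transforms,
# digits, N and ix_cut as defined above). It should pick the same digit as mean_classifier.
def mean_classifier_dot(x):
    # magnitude spectrum of the sample, restricted to the same 1.5 kHz window
    mags = np.abs(fft(x, n=N)[:ix_cut+1])
    # return the digit whose mean spectrum has the largest inner product with the sample
    return max(digits, key=lambda d: np.dot(mags, mean_transforms[d]))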
# Write answer for Q3b here
# The test did swimmingly: it was about 90% accurate with just the two choices (1 or 2).
# Code 3b Here
def test():
    correct = 0
    total = 0
    for d in digits:
        d_correct = 0
        d_total = 0
        for index in ix_test:
            guess = mean_classifier(signals[d][index])
            if (guess == d):
                correct += 1
                d_correct += 1
            d_total += 1
            total += 1
        print("Digit " + str(d) + " got " + str(d_correct) + "/" + str(d_total) + " for a score of " + str(np.round(d_correct/d_total * 100)) + "%.")
    print("Overall: " + str(correct) + "/" + str(total) + " for a score of " + str(np.round(correct/total * 100)) + "%.")
    return
test()
# Write answer for Q4 here
# The accuracy got quite a bit worse (down to roughly 60-70%), but that is still pretty good
# considering that random guessing over ten digits would only be right about 10% of the time.
# Code Q4 here
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [0,1,2,3,4,5,6,7,8,9] # now load all ten digits for Q4
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
    # iterate over digits
    for d in digits:
        if filename.startswith(str(d)+'_'):
            wav = wavfile.read(data_dir+filename)[1]
            if len(wav.shape)<2:
                signals[d].append(wav)
                file_names[d].append(filename)
# find maximum of vector length
N = max([len(v) for d in digits for v in signals[d]])
# next we split our dataset into train and test sets
# using an 80/20 random split.
# create train/test split
ix = np.arange(100)
random.shuffle(ix)
# select train entries
ix_train = ix[:80]
# select test entries
ix_test = ix[80:]
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8kHz
Ts = 1.0/8000
ix_cut = int(np.ceil(1500*Ts*N))
# initialize dictionary for storing transforms
transforms = {d:[] for d in digits}
# initialize dictionary for storing mean transforms
mean_transforms = {d:[] for d in digits}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
transform_norms = {d:[] for d in digits}
x_bar = {d:[] for d in digits}
for d in digits:
    for index in ix_train:
        transforms[d].append(fft(signals[d][index], n=N)[:ix_cut+1])
    transform_norms[d] = np.abs(transforms[d])
    for k in range(ix_cut+1):
        running_sum = 0
        for sample in transform_norms[d]:
            running_sum += sample[k]
        x_bar[d].append(running_sum/len(ix_train))
    for k in range(ix_cut+1):
        mean_transforms[d].append(x_bar[d][k]/np.linalg.norm(x_bar[d]))
    assert(len(mean_transforms[d]) == ix_cut+1)
test()
# Assuming I implemented it correctly (which is admittedly a pretty big if), this nearest-neighbor
# classifier with 5 neighbors doesn't seem to do better than the original mean classifier, takes
# considerably longer to run, and is heavily biased towards certain digits (0, 5, and 6), though
# that could easily be because I implemented it incorrectly. In theory, one could loop over the
# number of neighbors to find the optimal value fairly easily given how I wrote the function
# (see the sketch after the test below), but since a single run already takes upwards of
# 10 minutes, I won't do that here.
# Code Q5 here
def mean_classifier(x, num_neighbors):
    # max_sums holds the num_neighbors largest inner products seen so far, in ascending order;
    # max_digits holds the corresponding digit labels.
    max_sums = []
    max_digits = []
    X = fft(x, n=N)[:ix_cut+1]
    for d in digits:
        for transform in transforms[d]:
            running_sum = 0
            for k in range(ix_cut+1):
                running_sum += abs(X[k]) * abs(transform[k])
            # insert running_sum into the sorted list
            for i in range(len(max_sums)):
                if(max_sums[i] > running_sum):
                    max_sums.insert(i, running_sum)
                    max_digits.insert(i, d)
                    break
                if(i == len(max_sums)-1):
                    max_sums.insert(i+1, running_sum)
                    max_digits.insert(i+1, d)
                    break
            # need to be sure we can get started
            if(len(max_sums) == 0):
                max_sums.insert(0, running_sum)
                max_digits.insert(0, d)
            # keep only the num_neighbors largest similarities
            if(len(max_sums) > num_neighbors):
                max_sums.pop(0)
                max_digits.pop(0)
    # majority vote among the nearest neighbors
    # (note: newer SciPy versions return a scalar mode, in which case the trailing [0] is not needed)
    return stats.mode(max_digits)[0][0]
def test():
    correct = 0
    total = 0
    for d in digits:
        d_correct = 0
        d_total = 0
        for index in ix_test:
            guess = mean_classifier(signals[d][index], 5)
            if (guess == d):
                correct += 1
                d_correct += 1
            d_total += 1
            total += 1
        print("Digit " + str(d) + " got " + str(d_correct) + "/" + str(d_total) + " for a score of " + str(np.round(d_correct/d_total * 100)) + "%.")
    print("Overall: " + str(correct) + "/" + str(total) + " for a score of " + str(np.round(correct/total * 100)) + "%.")
    return
test()
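# Sketch of the neighbor-count sweep mentioned in the Q5 answer above (left uncalled here
# because a single pass over the test set already takes several minutes; the candidate values
# are illustrative and sweep_num_neighbors is just an illustrative name).
def sweep_num_neighbors(candidates=(1, 3, 5, 7)):
    for k in candidates:
        correct = 0
        total = 0
        for d in digits:
            for index in ix_test:
                if mean_classifier(signals[d][index], k) == d:
                    correct += 1
                total += 1
        print(str(k) + " neighbors: " + str(np.round(correct/total * 100)) + "% accuracy")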
# Scratch block to sanity-check the sorted-insert logic above on dummy data
num_neighbors = 3
running_sums = [6,5,4,3,2,1]
max_sums = []
max_digits = []
d = 0
for running_sum in running_sums:
    for i in range(len(max_sums)):
        if(max_sums[i] > running_sum):
            max_sums.insert(i, running_sum)
            max_digits.insert(i, d)
            break
        if(i == len(max_sums)-1):
            max_sums.insert(i+1, running_sum)
            max_digits.insert(i+1, d)
            break
    # need to be sure we can get started
    if(len(max_sums) == 0):
        max_sums.insert(0, running_sum)
        max_digits.insert(0, d)
    if(len(max_sums) > num_neighbors):
        max_sums.pop(0)
        max_digits.pop(0)
    d += 1
print(max_sums)
print(max_digits)