VoiceRecognitionNumbers

import numpy as np import numpy.random as random from numpy.fft import fft from scipy.io import wavfile import matplotlib.pyplot as plt import seaborn as sns import os %matplotlib inline sns.set() sns.set(font_scale=1.5)

data_dir = './recordings/' # determine digits of interest (0 to 9) digits = [1,2] # change here to load more digits # dictionary that will store our values signals = {d:[] for d in digits} file_names = {d:[] for d in digits} # import files for filename in os.listdir(data_dir): # iterate over digits for d in digits: if filename.startswith(str(d)+'_'): wav = wavfile.read(data_dir+filename)[1] if len(wav.shape)<2: signals[d].append(wav) file_names[d].append(filename) # find maximum of vector length N = max([len(v) for d in digits for v in signals[d]])

# next we split our dataset in train and test # we will use a 80/20 random split. # create train/test split ix = np.arange(100) random.shuffle(ix) # select train entries ix_train = ix[:80] #select test entries ix_test = ix[80:]

# next we compute the average spectrum of each spoken digit in the training set. # we will consider a window up to 1.5 KHz # sampling rate is 8kHz Ts = 1.0/8000 ix_cut = int(np.ceil(1500*Ts*N)) # initialize dictionary for storing transforms transforms = {d:[] for d in digits} # initialize dictionary for storing mean transforms mean_transforms = {d:[] for d in digits} # compute mean transform of each digit and in the training set. # Make sure to only keep the spectrum up to 1.5kHz

# Code Solution to Q1 Here norm_transforms = {d:[] for d in digits} X_bar = {d:[] for d in digits} # Loop round for each digit e.g. 0, 1, 2, ..., 9 for d in digits: # Loop round all wav files for each digit and store transform in dict for i in ix_train: transforms[d].append(fft(signals[d][i], n=N)[:ix_cut+1]) # Find norm of transforms of digit d norm_transforms[d] = np.abs(transforms[d]) for freq in range(ix_cut+1): X_tilde_sum = 0 for sample in norm_transforms[d]: X_tilde_sum = X_tilde_sum + sample[freq] X_bar[d].append(X_tilde_sum/80) for freq in range(ix_cut+1): mean_transforms[d].append(X_bar[d][freq]/np.linalg.norm(X_bar[d]))

# In this next part, plot the average spectral magnitude of each digit.

# Code Solution to Q2 here xf = np.linspace(0, 1500, (ix_cut+1)) for d in digits: yf = mean_transforms[d] plt.plot(xf, yf) plt.xlabel("Hz") plt.show()

# classifier function # receives a vector, computes the product with average digits, and returns the max inner product # Input: sample x (vector) def mean_classifier(x): X = fft(x, n=N)[:ix_cut+1] max_value = 0 max_sum = 0 for d in digits: running_total = 0 for k in range(ix_cut+1): running_total += abs(X[k]) * abs(mean_transforms[d][k]) if (max_sum == 0): max_sum = running_total max_value = d elif (running_total > max_sum): max_sum = running_total max_value = d return max_value

# Write answer for Q3b here

# Code 3b Here def classifier_test(): correct = 0 total = 0 for d in digits: d_correct = 0 d_total = 0 for i in ix_test: estimate = mean_classifier(signals[d][i]) if (estimate == d): correct += 1 d_correct += 1 d_total += 1 total += 1 print(str(d) + ": " + str(d_correct) + "/" + str(d_total) + ", score: " + str(np.round(d_correct/d_total * 100)) + "%") print("Overall: " + str(correct) + "/" + str(total) + ", score: " + str(np.round(correct/total * 100)) + "%") return classifier_test()

# Write answer for Q4 here # The correctness is still bad (overall 57% here) however, this is still better at guessing (which would be around 10%) # So despite the percentage getting worse with more digits, the accuracy is still pretty good.

# Code Q4 here data_dir = './recordings/' # determine digits of interest (0 to 9) digits = [1,2,3,4,5,6,7,8,9,0] # change here to load more digits # dictionary that will store our values signals = {d:[] for d in digits} file_names = {d:[] for d in digits} # import files for filename in os.listdir(data_dir): # iterate over digits for d in digits: if filename.startswith(str(d)+'_'): wav = wavfile.read(data_dir+filename)[1] if len(wav.shape)<2: signals[d].append(wav) file_names[d].append(filename) # find maximum of vector length N = max([len(v) for d in digits for v in signals[d]]) # next we split our dataset in train and test # we will use a 80/20 random split. # create train/test split ix = np.arange(100) random.shuffle(ix) # select train entries ix_train = ix[:80] #select test entries ix_test = ix[80:] # next we compute the average spectrum of each spoken digit in the training set. # we will consider a window up to 1.5 KHz # sampling rate is 8kHz Ts = 1.0/8000 ix_cut = int(np.ceil(1500*Ts*N)) # initialize dictionary for storing transforms transforms = {d:[] for d in digits} # initialize dictionary for storing mean transforms mean_transforms = {d:[] for d in digits} # compute mean transform of each digit and in the training set. # Make sure to only keep the spectrum up to 1.5kHz norm_transforms = {d:[] for d in digits} X_bar = {d:[] for d in digits} # Loop round for each digit e.g. 0, 1, 2, ..., 9 for d in digits: # Loop round all wav files for each digit and store transform in dict for i in ix_train: transforms[d].append(fft(signals[d][i], n=N)[:ix_cut+1]) # Find norm of transforms of digit d norm_transforms[d] = np.abs(transforms[d]) for freq in range(ix_cut+1): X_tilde_sum = 0 for sample in norm_transforms[d]: X_tilde_sum = X_tilde_sum + sample[freq] X_bar[d].append(X_tilde_sum/80) for freq in range(ix_cut+1): mean_transforms[d].append(X_bar[d][freq]/np.linalg.norm(X_bar[d])) classifier_test()

# Code Q5 here