import numpy as np
import numpy.random as random
from numpy.fft import fft, fftfreq #added freq
from scipy.io import wavfile
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
sns.set()
sns.set(font_scale=1.5)
data_dir = './recordings/'
# determine digits of interest (0 to 9)
#digits = [d for d in range(10)] # uncomment to load all ten digits (done below for Q4)
digits = [1,2]
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
# iterate over digits
for d in digits:
if filename.startswith(str(d)+'_'):
wav = wavfile.read(data_dir+filename)[1]
            # wavfile.read returns (sample_rate, data); wav.shape has length 1 for a mono
            # recording and 2 for a stereo one (samples x channels), and its values are the
            # amplitude samples. Only mono recordings are kept below.
if len(wav.shape)<2:
signals[d].append(wav)
file_names[d].append(filename)
# find the maximum signal length (used as the FFT size)
N = max([len(v) for d in digits for v in signals[d]])
fftfreq(N) # normalized bin frequencies; bin k corresponds to k/(N*Ts) Hz physically
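# Optional sanity-check sketch (not part of the assignment): numpy's fftfreq also accepts
# the sample spacing, so passing d = 1/8000 gives each bin's frequency in Hz directly.
# The names fs_hz and freqs_hz below are illustrative only.
fs_hz = 8000                        # sampling rate, stated again in the cell below
freqs_hz = fftfreq(N, d=1.0/fs_hz)  # bin k maps to k*fs_hz/N Hz for k < N/2
#print(freqs_hz[:4])                # starts at 0 Hz and steps by fs_hz/N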
# next we split our dataset into train and test
# we will use an 80/20 random split.
# create train/test split
ix = np.arange(100)
random.shuffle(ix)
# select train entries
ix_train = ix[:80]
#select test entries
ix_test = ix[80:]
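# Hedged aside (a sketch, not the assignment's method): because the shuffle above is
# unseeded, the split and the accuracies reported for Q3b change on every rerun. A seeded
# Generator would make the 80/20 split reproducible; the variable names below are
# illustrative only.
#rng = np.random.default_rng(0)
#ix_fixed = rng.permutation(100)
#ix_train, ix_test = ix_fixed[:80], ix_fixed[80:]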
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8 kHz
Ts = 1.0/8000
ix_cut = int(np.ceil(1500*Ts*N)) # = 1146
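# Worked check of the cutoff: bin k sits at k/(N*Ts) Hz, so the bin at 1.5 kHz is
# k = 1500*N*Ts = 1500 * 6112 / 8000 = 1146 exactly for N = 6112 (the value noted below),
# which matches ix_cut.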
# initialize dictionary for storing transforms
transforms = {}
# initialize dictionary for storing mean transforms
mean_transforms = {}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
#print(ix_cut)
# Code Solution to Q1 Here
avg_not_normalized = {d: [] for d in digits}
avg_normalized = {d: [] for d in digits}
#norms = {d: [] for d in digits}
for d in digits: #for every digit
transforms[d] = [] # store all instance FFTs in a list
for index in ix_train:
#if (d==1 and index == 1):
# temp1 = fft(signals[d][index], n = N)
# print(temp1)
temp = fft(signals[d][index], n = N) # compute FFT for each training instance of that digit
transforms[d].append(temp[0: ix_cut + 1]) # FFTs are stored as list of lists
#for transform in transforms[d]:
# norms[d].append(np.abs(transform))
    # a real DT signal with N samples has N FFT coefficients whose magnitudes are symmetric in k.
    # here N = 6112 (even), so the bins sit at frequencies k/(N*Ts) for
    # k = -N/2, ..., -1, 0, 1, ..., N/2 - 1 (N/2 = 3056).
    # we only keep k = 0, ..., ix_cut (= 1146), i.e. the bins up to 1.5 kHz; the negative-k
    # bins carry the same magnitudes, so nothing is lost for this classifier.
    # numpy orders the output as f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n) for even n,
    # i.e. fft_return = [coeff(0), coeff(1), ..., coeff(3055), coeff(-3056), ..., coeff(-1)]
#once we have the transforms of instances for that digit, take avg
for k in range(ix_cut+1): # 0,..,ix_cut
        # avg_not_normalized[d][k] = (1/len(ix_train)) * sum over i of |transforms[d][i][k]|
summer = 0
for i in range(len(ix_train)):
summer += np.abs(transforms[d][i][k])
avg_not_normalized[d].append(summer/len(ix_train))
#now normalize the avgs
x_norm = np.linalg.norm(avg_not_normalized[d])
for k in range(ix_cut+1):
avg_normalized[d].append(avg_not_normalized[d][k]/x_norm)
mean_transforms[d] = avg_normalized[d]
#print(mean_transforms[d])
#print(mean_transforms[1])
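# Hedged vectorized equivalent of the Q1 loops above (a sketch, not the required solution):
# stacking the windowed FFTs, averaging their magnitudes per bin, and normalizing by the
# Euclidean norm reproduces mean_transforms.
for d in digits:
    stacked_mags = np.abs(np.array(transforms[d]))  # shape (len(ix_train), ix_cut+1)
    mean_mag = stacked_mags.mean(axis=0)            # average magnitude per frequency bin
    assert np.allclose(mean_mag / np.linalg.norm(mean_mag), mean_transforms[d])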
# In this next part, plot the average spectral magnitude of each digit.
# Code Solution to Q2 here
w = np.linspace(0, 1500, ix_cut+1)
for d in digits:
    plt.plot(w, mean_transforms[d])
    plt.title(f"Mean spectral magnitude of digit {d}")
    plt.show()
# classifier function
# receives a signal vector, computes the inner product of its spectrum with each digit's
# mean spectrum, and returns the digit with the largest inner product
# Input: sample x (vector)
def mean_classifier(x):
# Code Q3a Here
x_fft = fft(x, n = N)[0:ix_cut+1] #get transform of x and look at window till ix_cut
p = {} # p[d] is p(X, Xd)
for d in digits:
summer = 0
for k in range(ix_cut+1):
x_norm = abs(x_fft[k])
xd_norm = abs(mean_transforms[d][k])
summer += x_norm * xd_norm
p[d] = summer
# find match based on max similarity
match = max(p, key=p.get)
return match
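# Quick usage sketch: classify a single held-out recording of digit 1; given the Q3b
# accuracies noted below, the prediction should usually also be 1. Which file ends up at
# ix_test[0] varies from run to run.
print("sample prediction for a held-out digit-1 recording:", mean_classifier(signals[1][ix_test[0]]))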
# Write answer for Q3b here
# The accuracy for digit 1 is 100.0 % and the accuracy for digit 2 is 85.0 %.
# However, these accuracies change each time I rerun the cell that splits the signals into ix_train and ix_test.
# The best accuracy percentages I got were 100% and 90%.
# Code 3b Here
for d in digits:
correct = 0
for i in ix_test:
if (mean_classifier(signals[d][i]) == d):
correct += 1
#print(f'{mean_classifier(signals[d][i])}, {d}')
accuracy = correct * 100/ (len(ix_test))
print(f"{d}, accuracy is {accuracy} %")
# Write answer for Q4 here
''' The accuracy gets worse when we add more digits (digits from 0 to 9):
0, accuracy is 60.0 %
1, accuracy is 35.0 %
2, accuracy is 75.0 %
3, accuracy is 50.0 %
4, accuracy is 80.0 %
5, accuracy is 95.0 %
6, accuracy is 75.0 %
7, accuracy is 85.0 %
8, accuracy is 50.0 %
9, accuracy is 80.0 %
Because we got better accuracy with just digits 1 and 2, this tells us that digits 1 and 2
have very different FFTs, which made distinguishing between them straightforward. Adding digits
means that every test signal now has more similar-looking training spectra to compete with than
when we had only two digits, which worsened the accuracy. '''
# Code Q4 here
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [d for d in range(10)] # load all ten digits for Q4
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
# iterate over digits
for d in digits:
if filename.startswith(str(d)+'_'):
wav = wavfile.read(data_dir+filename)[1]
            # keep only the mono recordings (wav.shape has length 1), as in the cell above
if len(wav.shape)<2:
signals[d].append(wav)
file_names[d].append(filename)
# find the maximum signal length (used as the FFT size)
N = max([len(v) for d in digits for v in signals[d]])
fftfreq(N)
# next we split our dataset into train and test
# we will use an 80/20 random split.
# create train/test split
ix = np.arange(100)
random.shuffle(ix)
# select train entries
ix_train = ix[:80]
#select test entries
ix_test = ix[80:]
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8 kHz
Ts = 1.0/8000
ix_cut = int(np.ceil(1500*Ts*N)) # = 1146
# initialize dictionary for storing transforms
transforms = {}
# initialize dictionary for storing mean transforms
mean_transforms = {}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
#print(ix_cut)
# Code Solution to Q1 Here
avg_not_normalized = {d: [] for d in digits}
avg_normalized = {d: [] for d in digits}
#norms = {d: [] for d in digits}
for d in digits: #for every digit
transforms[d] = [] # store all instance FFTs in a list
for index in ix_train:
#if (d==1 and index == 1):
# temp1 = fft(signals[d][index], n = N)
# print(temp1)
temp = fft(signals[d][index], n = N)
transforms[d].append(temp[0: ix_cut + 1]) # FFTs are stored as list of lists
#for transform in transforms[d]:
# norms[d].append(np.abs(transform))
    # frequency layout of the FFT bins: see the notes in the Q1 cell above;
    # again we keep only bins k = 0, ..., ix_cut (up to 1.5 kHz).
#once we have the transforms of instances for that digit, take avg
for k in range(ix_cut+1): # 0,..,ix_cut
        # avg_not_normalized[d][k] = (1/len(ix_train)) * sum over i of |transforms[d][i][k]|
summer = 0
for i in range(len(ix_train)):
summer += np.abs(transforms[d][i][k])
avg_not_normalized[d].append(summer/len(ix_train))
#now normalize the avgs
x_norm = np.linalg.norm(avg_not_normalized[d])
for k in range(ix_cut+1):
avg_normalized[d].append(avg_not_normalized[d][k]/x_norm)
mean_transforms[d] = avg_normalized[d]
# Evaluate the mean classifier on all ten digits (same procedure as Q3b)
for d in digits:
correct = 0
for i in ix_test:
if (mean_classifier(signals[d][i]) == d):
correct += 1
#print(f'{mean_classifier(signals[d][i])}, {d}')
accuracy = correct * 100/ (len(ix_test))
print(f"{d}, accuracy is {accuracy} %")
# Code Q5 here
# Nearest-neighbor classifiers: a k-NN version (majority vote among the k best-matching
# training instances per digit) and a plain 1-NN version below.
# k-NN classifier: computes the spectral inner product of x with every training instance
# and takes a majority vote among the `degree` best-matching instances.
def knn_classifier(x, degree):
    x_fft = fft(x, n = N)[0:ix_cut+1] # transform of x, windowed up to ix_cut
    p = {d: [] for d in digits} # p[d] lists p(X, Xd) for every training instance of d
    top_products = [] # the `degree` largest products of each digit, appended in digit order
    for d in digits:
        for instance in transforms[d]:
            summer = 0
            for k in range(ix_cut+1):
                summer += abs(x_fft[k]) * abs(instance[k])
            p[d].append(summer)
        digit_products_sorted = sorted(p[d], reverse = True)
        for i in range(degree):
            top_products.append(digit_products_sorted[i]) # add the top `degree` products from that digit
    # find the digits owning the overall `degree` largest products
    top_products_sorted = sorted(top_products, reverse = True)
    best_digits = []
    for i in range(degree):
        best_digits.append(digits[top_products.index(top_products_sorted[i]) // degree])
    # majority vote: return the most frequent digit among the best matches
    frequencies = {}
    for element in best_digits:
        frequencies[element] = best_digits.count(element)
    match = max(frequencies, key=frequencies.get)
    return match
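# Quick usage sketch for the k-NN variant: vote among the 3 best-matching training
# instances for one held-out recording (the true digit here is 3, chosen arbitrarily).
print("3-NN prediction for a held-out digit-3 recording:", knn_classifier(signals[3][ix_test[0]], 3))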
# 1-nn classifier
def nn_classifier(x):
maxes = {}
    # same spectral inner product as in mean_classifier (Q3a), but against every training instance
x_fft = fft(x, n = N)[0:ix_cut+1] #get transform of x and look at window till ix_cut
p = {d: [] for d in digits} # p[d] is a list p(X, Xd) for all instances of d in training set
for d in digits:
for instance in transforms[d]:
summer = 0
for k in range(ix_cut+1):
x_norm = abs(x_fft[k])
xd_norm = abs(instance[k])
summer += x_norm * xd_norm
p[d].append(summer)
maxes[d] = max(p[d])
# find match based on max similarity
match = max(maxes, key=maxes.get)
return match
# Evaluate the 1-NN classifier on the test set (same procedure as Q3b)
for d in digits:
correct = 0
for i in ix_test:
if (nn_classifier(signals[d][i]) == d):
correct += 1
#print(f'{mean_classifier(signals[d][i])}, {d}')
accuracy = correct * 100/ (len(ix_test))
print(f"{d}, accuracy is {accuracy} %")
'''0, accuracy is 90.0 %
1, accuracy is 0.0 %
2, accuracy is 0.0 %
3, accuracy is 5.0 %
4, accuracy is 45.0 %
5, accuracy is 10.0 %
6, accuracy is 55.0 %
7, accuracy is 0.0 %
8, accuracy is 5.0 %
9, accuracy is 25.0 % '''
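# Possible explanation (hedged, not from the assignment): unlike mean_transforms, the
# individual training spectra in transforms[] are not normalized, so the raw inner product
# favors high-energy training instances and most test signals get matched to whichever
# digit happens to have the loudest/longest recordings, which would explain the very low
# 1-NN accuracies above. A cosine-similarity variant that divides each product by the norms
# of both spectra is sketched below (illustrative name, same loop structure as nn_classifier).
def nn_classifier_cosine(x):
    x_mag = np.abs(fft(x, n = N)[0:ix_cut+1])
    x_mag = x_mag / np.linalg.norm(x_mag)        # unit-normalize the test spectrum
    best_sim, best_digit = -1.0, None
    for d in digits:
        for instance in transforms[d]:
            mag = np.abs(instance)
            mag = mag / np.linalg.norm(mag)      # unit-normalize each training spectrum
            sim = float(np.dot(x_mag, mag))      # cosine similarity of magnitude spectra
            if sim > best_sim:
                best_sim, best_digit = sim, d
    return best_digit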