import numpy as np
import numpy.random as random
from numpy.fft import fft
from scipy.io import wavfile
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
sns.set()
sns.set(font_scale=1.5)
data_dir = './recordings/'
# digits of interest (0 to 9)
digits = [1, 2]  # change here to load more digits
# per-digit storage for raw signals and their source file names
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# import files: a recording belongs to digit d when its name starts with "<d>_"
for fname in os.listdir(data_dir):
    for d in digits:
        if fname.startswith(str(d) + '_'):
            _, wav = wavfile.read(data_dir + fname)
            # keep mono recordings only
            if wav.ndim < 2:
                signals[d].append(wav)
                file_names[d].append(fname)
# longest recording; every FFT below is zero-padded to this length
N = max(len(v) for d in digits for v in signals[d])
# 80/20 random train/test split over the indices 0..99
# (assumes 100 recordings per digit — TODO confirm against the data set)
ix = np.arange(100)
random.shuffle(ix)
ix_train = ix[:80]
ix_test = ix[80:]
# sampling rate is 8 kHz; we keep the spectrum up to 1.5 kHz
Ts = 1.0 / 8000
# index of the 1.5 kHz bin in an N-point FFT
ix_cut = int(np.ceil(1500 * Ts * N))
# per-digit storage for truncated transforms of the training signals
transforms = {d: [] for d in digits}
# per-digit storage for the normalized mean transforms
mean_transforms = {d: [] for d in digits}
# compute mean transform of each digit and in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
# per-digit average spectral magnitude, raw and unit-normalized
not_normalized = {d: [] for d in digits}
normalized = {d: [] for d in digits}
for d in digits:
    # FFT of each training signal, zero-padded to length N, truncated at 1.5 kHz
    for i in ix_train:
        signalFFT = fft(signals[d][i], n=N)
        transforms[d].append(signalFFT[0: ix_cut + 1])
    # average |FFT| across the training set in one vectorized pass
    # (replaces the original O(freq * train) Python double loop with identical math)
    avg_mag = np.mean(np.abs(np.asarray(transforms[d])), axis=0)
    not_normalized[d] = list(avg_mag)
    # unit-normalize the mean spectrum (Euclidean norm)
    signal_norms = np.linalg.norm(avg_mag)
    normalized[d] = list(avg_mag / signal_norms)
    # store the normalized mean spectrum for this digit
    mean_transforms[d] = normalized[d]
# In this next part, plot the average spectral magnitude of each digit.
# Code Solution to Q2 here
# frequency axis for the truncated spectrum (0 .. 1.5 kHz)
frequencies = np.linspace(0, 1500, ix_cut + 1)
# one subplot per digit (note: axs is only indexable when len(digits) > 1)
fig, axs = plt.subplots(len(digits))
# BUG FIX: the original indexed mean_transforms[plotNum+1] (only valid for
# digits=[1,2]) and titled the plot with plotNum instead of the actual digit;
# iterate over the digit list itself so any digit selection plots correctly.
for plotNum, d in enumerate(digits):
    axs[plotNum].plot(frequencies, mean_transforms[d])
    axs[plotNum].set_title(f'FFT for {d}')
    axs[plotNum].set(xlabel='frequency', ylabel='magnitude')
fig.tight_layout()
# classifier function
# receives a vector, computes the product with average digits, and returns the max inner prod
# Input: sample x (vector)
def mean_classifier(x):
    """Return the digit whose normalized mean spectrum has the largest
    magnitude inner product with the truncated spectrum of sample x.

    Relies on module-level globals: N, ix_cut, digits, mean_transforms.
    """
    # truncated magnitude spectrum of the input, zero-padded to length N
    mags = np.abs(fft(x, n=N)[0: ix_cut + 1])
    # inner-product score of x against each digit's mean spectrum
    counterResults = {}
    for d in digits:
        # vectorized dot product (replaces the original per-frequency Python loop)
        counterResults[d] = float(np.dot(mags, np.abs(mean_transforms[d])))
    # predicted class = digit with the greatest inner product
    digitClass = max(counterResults, key=counterResults.get)
    return digitClass
# Write answer for Q3b here
# For a sample training set, we obtain an accuracy of 85% and 95% for
# digits 1 and 2, respectively. Running the code for different randomized
# sets reveals different accuracies, reflective of the variability caused
# by using points both towards the interior and periphery of each digit
# cluster. This variance highlights the importance of randomizing the
# test and training data over several iterations to show efficacy in
# our model.
# Code 3b Here
# evaluate classification accuracy on the held-out test split, per digit
for d in digits:
    # number of test recordings of digit d that the classifier labels correctly
    n_correct = sum(1 for i in ix_test if mean_classifier(signals[d][i]) == d)
    # fraction of correctly classified test recordings
    accuracy = n_correct / len(ix_test)
    print(f'The test data for digit {d} yielded an accuracy of {accuracy*100} %')
# Write answer for Q4 here
# The test data for digit 0 yielded an accuracy of 60.0 %
# The test data for digit 1 yielded an accuracy of 30.0 %
# The test data for digit 2 yielded an accuracy of 70.0 %
# The test data for digit 3 yielded an accuracy of 15.0 %
# The test data for digit 4 yielded an accuracy of 85.0 %
# The test data for digit 5 yielded an accuracy of 90.0 %
# The test data for digit 6 yielded an accuracy of 60.0 %
# The test data for digit 7 yielded an accuracy of 70.0 %
# The test data for digit 8 yielded an accuracy of 40.0 %
# The test data for digit 9 yielded an accuracy of 70.0 %
# Repeated renditions of the program reveal that having more digits available leads to overall depressed accuracy. The likely
# cause of this result is that more clusters result in greater crowding of the signal space. We also note that different
# numbers tend to have higher accuracies across different simulations. The likely cause of this variation is the differences
# in phonetic sounds underlying each number. For instance, the similar s sound at the beginning of six and seven may be a
# contributor to overlap in their normalized FFTs and thus a cause of misinterpretation of the testing data.
data_dir = './recordings/'
# digits of interest (0 to 9) — this run uses all ten classes
digits = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # change here to load more digits
# per-digit storage for raw signals and their source file names
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# import files: a recording belongs to digit d when its name starts with "<d>_"
for fname in os.listdir(data_dir):
    for d in digits:
        if fname.startswith(str(d) + '_'):
            _, wav = wavfile.read(data_dir + fname)
            # keep mono recordings only
            if wav.ndim < 2:
                signals[d].append(wav)
                file_names[d].append(fname)
# longest recording; every FFT below is zero-padded to this length
N = max(len(v) for d in digits for v in signals[d])
# 80/20 random train/test split over the indices 0..99
ix = np.arange(100)
random.shuffle(ix)
ix_train = ix[:80]
ix_test = ix[80:]
# sampling rate is 8 kHz; we keep the spectrum up to 1.5 kHz
Ts = 1.0 / 8000
# index of the 1.5 kHz bin in an N-point FFT
ix_cut = int(np.ceil(1500 * Ts * N))
# per-digit storage for truncated transforms of the training signals
transforms = {d: [] for d in digits}
# per-digit storage for the normalized mean transforms
mean_transforms = {d: [] for d in digits}
# compute mean transform of each digit and in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
# per-digit average spectral magnitude, raw and unit-normalized
not_normalized = {d: [] for d in digits}
normalized = {d: [] for d in digits}
for d in digits:
    # FFT of each training signal, zero-padded to length N, truncated at 1.5 kHz
    for i in ix_train:
        signalFFT = fft(signals[d][i], n=N)
        transforms[d].append(signalFFT[0: ix_cut + 1])
    # average |FFT| across the training set in one vectorized pass
    # (replaces the original O(freq * train) Python double loop with identical math)
    avg_mag = np.mean(np.abs(np.asarray(transforms[d])), axis=0)
    not_normalized[d] = list(avg_mag)
    # unit-normalize the mean spectrum (Euclidean norm)
    signal_norms = np.linalg.norm(avg_mag)
    normalized[d] = list(avg_mag / signal_norms)
    # store the normalized mean spectrum for this digit
    mean_transforms[d] = normalized[d]
# Code 3b Here
# re-run the per-digit test accuracy, now over all ten digits
# (uses whichever mean_classifier is currently defined at this point)
for d in digits:
    # count correct predictions on the held-out test indices
    n_correct = sum(1 for i in ix_test if mean_classifier(signals[d][i]) == d)
    accuracy = n_correct / len(ix_test)
    print(f'The test data for digit {d} yielded an accuracy of {accuracy*100} %')
# classifier function
# receives a vector, computes the product with every training exemplar, and
# returns the digit whose best exemplar gives the max inner prod
# Input: sample x (vector)
def mean_classifier(x):
    """Best-exemplar classifier: for each digit, score x by the largest
    magnitude inner product against any individual training transform, and
    return the digit with the highest such score.

    Relies on module-level globals: N, ix_cut, digits, transforms.
    """
    # truncated magnitude spectrum of the input, zero-padded to length N
    mags = np.abs(fft(x, n=N)[0: ix_cut + 1])
    counterResults = {}
    for d in digits:
        # vectorized inner product per exemplar (replaces the per-frequency loop);
        # default=0 mirrors the original maxNorm = 0 initialization
        counterResults[d] = max(
            (float(np.dot(mags, np.abs(t))) for t in transforms[d]),
            default=0,
        )
    # predicted class = digit whose best exemplar scored highest
    digitClass = max(counterResults, key=counterResults.get)
    return digitClass
data_dir = './recordings/'
# digits of interest (0 to 9)
digits = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # change here to load more digits
# per-digit storage for raw signals and their source file names
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# import files: a recording belongs to digit d when its name starts with "<d>_"
for fname in os.listdir(data_dir):
    for d in digits:
        if fname.startswith(str(d) + '_'):
            _, wav = wavfile.read(data_dir + fname)
            # keep mono recordings only
            if wav.ndim < 2:
                signals[d].append(wav)
                file_names[d].append(fname)
# longest recording; every FFT below is zero-padded to this length
N = max(len(v) for d in digits for v in signals[d])
# 80/20 random train/test split over the indices 0..99
ix = np.arange(100)
random.shuffle(ix)
ix_train = ix[:80]
ix_test = ix[80:]
# sampling rate is 8 kHz; we keep the spectrum up to 1.5 kHz
Ts = 1.0 / 8000
# index of the 1.5 kHz bin in an N-point FFT
ix_cut = int(np.ceil(1500 * Ts * N))
# per-digit storage for the per-signal normalized transforms
transforms = {d: [] for d in digits}
mean_transforms = {d: [] for d in digits}
not_normalized = {d: [] for d in digits}
normalized = {d: [] for d in digits}
# Q1 variant: normalize each training FFT individually, then truncate
for d in digits:
    for i in ix_train:
        spectrum = fft(signals[d][i], n=N)
        # NOTE(review): the norm is taken over the full N-point spectrum, so
        # the truncated slice stored below is NOT unit-norm — confirm intended
        unit_spectrum = spectrum / np.linalg.norm(spectrum)
        transforms[d].append(unit_spectrum[0: ix_cut + 1])
# Code 3b Here
# final evaluation: per-digit accuracy with the best-exemplar classifier
for d in digits:
    # count test recordings of digit d classified correctly
    n_correct = sum(1 for i in ix_test if mean_classifier(signals[d][i]) == d)
    accuracy = n_correct / len(ix_test)
    print(f'The test data for digit {d} yielded an accuracy of {accuracy*100} %')