import numpy as np
import numpy.random as random
from numpy.fft import fft
from scipy.io import wavfile
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
sns.set()
sns.set(font_scale=1.5)
data_dir = './recordings/'
# digits of interest (0 to 9)
digits = [1, 2]  # change here to load more digits
# per-digit storage for raw signals and their source file names
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# import files: a recording belongs to digit d when its name starts with "<d>_"
for fname in os.listdir(data_dir):
    for d in digits:
        if fname.startswith(str(d) + '_'):
            _, wav = wavfile.read(data_dir + fname)
            # keep mono recordings only
            if wav.ndim < 2:
                signals[d].append(wav)
                file_names[d].append(fname)
# longest recording; every FFT below is zero-padded to this length
N = max(len(v) for d in digits for v in signals[d])
# 80/20 random train/test split over the indices 0..99
# (assumes 100 recordings per digit — TODO confirm against the data set)
ix = np.arange(100)
random.shuffle(ix)
ix_train = ix[:80]
ix_test = ix[80:]
# sampling rate is 8 kHz; we keep the spectrum up to 1.5 kHz
Ts = 1.0 / 8000
# index of the 1.5 kHz bin in an N-point FFT
ix_cut = int(np.ceil(1500 * Ts * N))
# per-digit storage for truncated transforms of the training signals
transforms = {d: [] for d in digits}
# per-digit storage for the normalized mean transforms
mean_transforms = {d: [] for d in digits}
# compute mean transform of each digit and in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
# per-digit average spectral magnitude, raw and unit-normalized
not_normalized = {d: [] for d in digits}
normalized = {d: [] for d in digits}
for d in digits:
    # FFT of each training signal, zero-padded to length N, truncated at 1.5 kHz
    for i in ix_train:
        signalFFT = fft(signals[d][i], n=N)
        transforms[d].append(signalFFT[0: ix_cut + 1])
    # average |FFT| across the training set in one vectorized pass
    # (replaces the original O(freq * train) Python double loop with identical math)
    avg_mag = np.mean(np.abs(np.asarray(transforms[d])), axis=0)
    not_normalized[d] = list(avg_mag)
    # unit-normalize the mean spectrum (Euclidean norm)
    signal_norms = np.linalg.norm(avg_mag)
    normalized[d] = list(avg_mag / signal_norms)
    # store the normalized mean spectrum for this digit
    mean_transforms[d] = normalized[d]
# In this next part, plot the average spectral magnitude of each digit.
# Code Solution to Q2 here
# frequency axis for the truncated spectrum (0 .. 1.5 kHz)
frequencies = np.linspace(0, 1500, ix_cut + 1)
# one subplot per digit (note: axs is only indexable when len(digits) > 1)
fig, axs = plt.subplots(len(digits))
# BUG FIX: the original indexed mean_transforms[plotNum+1] (only valid for
# digits=[1,2]) and titled the plot with plotNum instead of the actual digit;
# iterate over the digit list itself so any digit selection plots correctly.
for plotNum, d in enumerate(digits):
    axs[plotNum].plot(frequencies, mean_transforms[d])
    axs[plotNum].set_title(f'FFT for {d}')
    axs[plotNum].set(xlabel='frequency', ylabel='magnitude')
fig.tight_layout()
# classifier function
# receives a vector, computes the product with average digits, and returns the max inner prod
# Input: sample x (vector)
def mean_classifier(x):
    """Return the digit whose normalized mean spectrum has the largest
    magnitude inner product with the truncated spectrum of sample x.

    Relies on module-level globals: N, ix_cut, digits, mean_transforms.
    """
    # truncated magnitude spectrum of the input, zero-padded to length N
    mags = np.abs(fft(x, n=N)[0: ix_cut + 1])
    # inner-product score of x against each digit's mean spectrum
    counterResults = {}
    for d in digits:
        # vectorized dot product (replaces the original per-frequency Python loop)
        counterResults[d] = float(np.dot(mags, np.abs(mean_transforms[d])))
    # predicted class = digit with the greatest inner product
    digitClass = max(counterResults, key=counterResults.get)
    return digitClass
# Write answer for Q3b here
# For a sample training set, we obtain an accuracy of 85% and 95% for
# digits 1 and 2, respectively. Running the code for different randomized
# sets reveals different accuracies, reflective of the variability caused
# by using points both towards the interior and periphery of each digit
# cluster. This variance highlights the importance of randomizing the
# test and training data over several iterations to show efficacy in
# our model.
# Code 3b Here
# evaluate classification accuracy on the held-out test split, per digit
for d in digits:
    # number of test recordings of digit d that the classifier labels correctly
    n_correct = sum(1 for i in ix_test if mean_classifier(signals[d][i]) == d)
    # fraction of correctly classified test recordings
    accuracy = n_correct / len(ix_test)
    print(f'The test data for digit {d} yielded an accuracy of {accuracy*100} %')
# Write answer for Q4 here
# The test data for digit 0 yielded an accuracy of 60.0 %
# The test data for digit 1 yielded an accuracy of 30.0 %
# The test data for digit 2 yielded an accuracy of 70.0 %
# The test data for digit 3 yielded an accuracy of 15.0 %
# The test data for digit 4 yielded an accuracy of 85.0 %
# The test data for digit 5 yielded an accuracy of 90.0 %
# The test data for digit 6 yielded an accuracy of 60.0 %
# The test data for digit 7 yielded an accuracy of 70.0 %
# The test data for digit 8 yielded an accuracy of 40.0 %
# The test data for digit 9 yielded an accuracy of 70.0 %
# Repeated renditions of the program reveal that having more digits available leads to overall depressed accuracy. The likely
# cause of this result is that more clusters result in greater crowding of the signal space. We also note that different
# numbers tend to have higher accuracies across different simulations. The likely cause of this variation is the differences
# in phonetic sounds underlying each number. For instance, the similar s sound at the beginning of six and seven may be a
# contributor to overlap in their normalized FFTs and thus a cause of misinterpretation of the testing data.
data_dir = './recordings/'
# digits of interest (0 to 9) — this run uses all ten classes
digits = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # change here to load more digits
# per-digit storage for raw signals and their source file names
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# import files: a recording belongs to digit d when its name starts with "<d>_"
for fname in os.listdir(data_dir):
    for d in digits:
        if fname.startswith(str(d) + '_'):
            _, wav = wavfile.read(data_dir + fname)
            # keep mono recordings only
            if wav.ndim < 2:
                signals[d].append(wav)
                file_names[d].append(fname)
# longest recording; every FFT below is zero-padded to this length
N = max(len(v) for d in digits for v in signals[d])
# 80/20 random train/test split over the indices 0..99
ix = np.arange(100)
random.shuffle(ix)
ix_train = ix[:80]
ix_test = ix[80:]
# sampling rate is 8 kHz; we keep the spectrum up to 1.5 kHz
Ts = 1.0 / 8000
# index of the 1.5 kHz bin in an N-point FFT
ix_cut = int(np.ceil(1500 * Ts * N))
# per-digit storage for truncated transforms of the training signals
transforms = {d: [] for d in digits}
# per-digit storage for the normalized mean transforms
mean_transforms = {d: [] for d in digits}
# compute mean transform of each digit and in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
# per-digit average spectral magnitude, raw and unit-normalized
not_normalized = {d: [] for d in digits}
normalized = {d: [] for d in digits}
for d in digits:
    # FFT of each training signal, zero-padded to length N, truncated at 1.5 kHz
    for i in ix_train:
        signalFFT = fft(signals[d][i], n=N)
        transforms[d].append(signalFFT[0: ix_cut + 1])
    # average |FFT| across the training set in one vectorized pass
    # (replaces the original O(freq * train) Python double loop with identical math)
    avg_mag = np.mean(np.abs(np.asarray(transforms[d])), axis=0)
    not_normalized[d] = list(avg_mag)
    # unit-normalize the mean spectrum (Euclidean norm)
    signal_norms = np.linalg.norm(avg_mag)
    normalized[d] = list(avg_mag / signal_norms)
    # store the normalized mean spectrum for this digit
    mean_transforms[d] = normalized[d]
# Code 3b Here
# re-run the per-digit test accuracy, now over all ten digits
# (uses whichever mean_classifier is currently defined at this point)
for d in digits:
    # count correct predictions on the held-out test indices
    n_correct = sum(1 for i in ix_test if mean_classifier(signals[d][i]) == d)
    accuracy = n_correct / len(ix_test)
    print(f'The test data for digit {d} yielded an accuracy of {accuracy*100} %')
# classifier function
# receives a vector, computes the product with every training exemplar, and
# returns the digit whose best exemplar gives the max inner prod
# Input: sample x (vector)
def mean_classifier(x):
    """Best-exemplar classifier: for each digit, score x by the largest
    magnitude inner product against any individual training transform, and
    return the digit with the highest such score.

    Relies on module-level globals: N, ix_cut, digits, transforms.
    """
    # truncated magnitude spectrum of the input, zero-padded to length N
    mags = np.abs(fft(x, n=N)[0: ix_cut + 1])
    counterResults = {}
    for d in digits:
        # vectorized inner product per exemplar (replaces the per-frequency loop);
        # default=0 mirrors the original maxNorm = 0 initialization
        counterResults[d] = max(
            (float(np.dot(mags, np.abs(t))) for t in transforms[d]),
            default=0,
        )
    # predicted class = digit whose best exemplar scored highest
    digitClass = max(counterResults, key=counterResults.get)
    return digitClass
data_dir = './recordings/'
# digits of interest (0 to 9)
digits = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # change here to load more digits
# per-digit storage for raw signals and their source file names
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# import files: a recording belongs to digit d when its name starts with "<d>_"
for fname in os.listdir(data_dir):
    for d in digits:
        if fname.startswith(str(d) + '_'):
            _, wav = wavfile.read(data_dir + fname)
            # keep mono recordings only
            if wav.ndim < 2:
                signals[d].append(wav)
                file_names[d].append(fname)
# longest recording; every FFT below is zero-padded to this length
N = max(len(v) for d in digits for v in signals[d])
# 80/20 random train/test split over the indices 0..99
ix = np.arange(100)
random.shuffle(ix)
ix_train = ix[:80]
ix_test = ix[80:]
# sampling rate is 8 kHz; we keep the spectrum up to 1.5 kHz
Ts = 1.0 / 8000
# index of the 1.5 kHz bin in an N-point FFT
ix_cut = int(np.ceil(1500 * Ts * N))
# per-digit storage for the per-signal normalized transforms
transforms = {d: [] for d in digits}
mean_transforms = {d: [] for d in digits}
not_normalized = {d: [] for d in digits}
normalized = {d: [] for d in digits}
# Q1 variant: normalize each training FFT individually, then truncate
for d in digits:
    for i in ix_train:
        spectrum = fft(signals[d][i], n=N)
        # NOTE(review): the norm is taken over the full N-point spectrum, so
        # the truncated slice stored below is NOT unit-norm — confirm intended
        unit_spectrum = spectrum / np.linalg.norm(spectrum)
        transforms[d].append(unit_spectrum[0: ix_cut + 1])
# Code 3b Here
# final evaluation: per-digit accuracy with the best-exemplar classifier
for d in digits:
    # count test recordings of digit d classified correctly
    n_correct = sum(1 for i in ix_test if mean_classifier(signals[d][i]) == d)
    accuracy = n_correct / len(ix_test)
    print(f'The test data for digit {d} yielded an accuracy of {accuracy*100} %')