import numpy as np
import numpy.random as random
from numpy.fft import fft
from scipy.io import wavfile
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
sns.set()
sns.set(font_scale=1.5)
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [1,2] # change here to load more digits but Aneel said to wait until Q4.
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
    # iterate over digits
    for d in digits:
        if filename.startswith(str(d)+'_'):
            wav = wavfile.read(data_dir+filename)[1]
            # keep mono recordings only
            if len(wav.shape)<2:
                signals[d].append(wav)
                file_names[d].append(filename)
# find maximum of vector length
N = max([len(v) for d in digits for v in signals[d]])
# next we split our dataset into train and test sets
# using an 80/20 random split.
# create train/test split
ix = np.arange(100)
random.shuffle(ix)
# select train entries
ix_train = ix[:80]
# select test entries
ix_test = ix[80:]
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8kHz
Ts = 1.0/8000
ix_cut = int(np.ceil(1500*Ts*N))
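# Optional sanity check (a sketch, not part of the assignment): bin k of an N-point FFT of a
# signal sampled every Ts seconds sits at k/(N*Ts) Hz, so bin ix_cut should land just above 1.5 kHz.
check_freqs = np.fft.fftfreq(N, d=Ts)
print(check_freqs[ix_cut])  # expected to be roughly 1500 Hz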
# initialize dictionary for storing transforms
transforms = {d:[] for d in digits}
# initialize dictionary for storing mean transforms
mean_transforms = {d:[] for d in digits}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
transform_norms = {d:[] for d in digits}
x_bar = {d:[] for d in digits}
for d in digits:
    for index in ix_train:
        transforms[d].append(fft(signals[d][index], n=N)[:ix_cut+1])
    transform_norms[d] = np.abs(transforms[d])
    for k in range(ix_cut+1):
        running_sum = 0
        for sample in transform_norms[d]:
            running_sum += sample[k]
        x_bar[d].append(running_sum/len(ix_train))
    for k in range(ix_cut+1):
        mean_transforms[d].append(x_bar[d][k]/np.linalg.norm(x_bar[d]))
    assert(len(mean_transforms[d]) == ix_cut+1)
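# For reference only, a vectorized sketch of the same computation (not used for the results
# below; mean_spectrum is an illustrative name and it assumes signals, ix_train, N and ix_cut
# as defined above). It should agree with mean_transforms[d] up to floating-point error.
def mean_spectrum(d):
    # stack the training spectra for digit d, average the magnitudes, normalize to unit norm
    mags = np.abs(np.array([fft(signals[d][i], n=N)[:ix_cut+1] for i in ix_train]))
    avg = mags.mean(axis=0)
    return avg / np.linalg.norm(avg)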
# In this next part, plot the average spectral magnitude of each digit.
# Code Solution to Q2 here
xf = np.linspace(0, 1500, ix_cut+1)
for d in digits:
    yf = mean_transforms[d]
    plt.plot(xf, yf, label="digit " + str(d))
plt.xlabel("Hz")
plt.ylabel("normalized spectral magnitude")
plt.legend()
plt.show()
# classifier function
# receives a sample vector, computes the inner product of its spectrum with each digit's
# average spectrum, and returns the digit with the largest inner product
# Input: sample x (vector)
def mean_classifier(x):
    # Code Q3a Here
    X = fft(x, n=N)[:ix_cut+1]
    max_digit = 0
    max_sum = 0
    for d in digits:
        running_sum = 0
        for k in range(ix_cut+1):
            running_sum += abs(X[k]) * abs(mean_transforms[d][k])
        # keep the digit with the largest inner product (max_sum == 0 handles the first digit)
        if(max_sum == 0 or running_sum > max_sum):
            max_sum = running_sum
            max_digit = d
    return max_digit
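# For reference only, the same decision rule written with a dot product instead of explicit
# loops (a sketch; mean_classifier_dot is an illustrative name and it assumes mean_transforms,
# digits, N and ix_cut as defined above). It should pick the same digit as mean_classifier.
def mean_classifier_dot(x):
    # magnitude spectrum of the sample, restricted to the same 1.5 kHz window
    mags = np.abs(fft(x, n=N)[:ix_cut+1])
    # return the digit whose mean spectrum has the largest inner product with the sample
    return max(digits, key=lambda d: np.dot(mags, mean_transforms[d]))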
# Write answer for Q3b here
# The test did swimmingly: it was about 90% accurate with just the two choices (1 or 2).
# Code 3b Here
def test():
    correct = 0
    total = 0
    for d in digits:
        d_correct = 0
        d_total = 0
        for index in ix_test:
            guess = mean_classifier(signals[d][index])
            if (guess == d):
                correct += 1
                d_correct += 1
            d_total += 1
            total += 1
        print("Digit " + str(d) + " got " + str(d_correct) + "/" + str(d_total) + " for a score of " + str(np.round(d_correct/d_total * 100)) + "%.")
    print("Overall: " + str(correct) + "/" + str(total) + " for a score of " + str(np.round(correct/total * 100)) + "%.")
    return
test()
# Write answer for Q4 here
# The accuracy got quite a bit worse (down to roughly 60-70%), but that is still pretty good
# considering that random guessing over ten digits would only be right about 10% of the time.
# Code Q4 here
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [0,1,2,3,4,5,6,7,8,9] # now load all ten digits for Q4
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
    # iterate over digits
    for d in digits:
        if filename.startswith(str(d)+'_'):
            wav = wavfile.read(data_dir+filename)[1]
            if len(wav.shape)<2:
                signals[d].append(wav)
                file_names[d].append(filename)
# find maximum of vector length
N = max([len(v) for d in digits for v in signals[d]])
# next we split our dataset into train and test sets
# using an 80/20 random split.
# create train/test split
ix = np.arange(100)
random.shuffle(ix)
# select train entries
ix_train = ix[:80]
# select test entries
ix_test = ix[80:]
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8kHz
Ts = 1.0/8000
ix_cut = int(np.ceil(1500*Ts*N))
# initialize dictionary for storing transforms
transforms = {d:[] for d in digits}
# initialize dictionary for storing mean transforms
mean_transforms = {d:[] for d in digits}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
transform_norms = {d:[] for d in digits}
x_bar = {d:[] for d in digits}
for d in digits:
    for index in ix_train:
        transforms[d].append(fft(signals[d][index], n=N)[:ix_cut+1])
    transform_norms[d] = np.abs(transforms[d])
    for k in range(ix_cut+1):
        running_sum = 0
        for sample in transform_norms[d]:
            running_sum += sample[k]
        x_bar[d].append(running_sum/len(ix_train))
    for k in range(ix_cut+1):
        mean_transforms[d].append(x_bar[d][k]/np.linalg.norm(x_bar[d]))
    assert(len(mean_transforms[d]) == ix_cut+1)
test()
# Assuming I implemented it correctly (which is admittedly a pretty big if), this nearest-neighbor
# classifier with 5 neighbors doesn't seem to do better than the original mean classifier, takes
# considerably longer to run, and is heavily biased towards certain digits (0, 5, and 6), though
# that could easily be because I implemented it incorrectly. In theory, one could loop over the
# number of neighbors to find the optimal value fairly easily given how I wrote the function
# (see the sketch after the test below), but since a single run already takes upwards of
# 10 minutes, I won't do that here.
# Code Q5 here
def mean_classifier(x, num_neighbors):
    # max_sums holds the num_neighbors largest inner products seen so far, in ascending order;
    # max_digits holds the corresponding digit labels.
    max_sums = []
    max_digits = []
    X = fft(x, n=N)[:ix_cut+1]
    for d in digits:
        for transform in transforms[d]:
            running_sum = 0
            for k in range(ix_cut+1):
                running_sum += abs(X[k]) * abs(transform[k])
            # insert running_sum into the sorted list
            for i in range(len(max_sums)):
                if(max_sums[i] > running_sum):
                    max_sums.insert(i, running_sum)
                    max_digits.insert(i, d)
                    break
                if(i == len(max_sums)-1):
                    max_sums.insert(i+1, running_sum)
                    max_digits.insert(i+1, d)
                    break
            # need to be sure we can get started
            if(len(max_sums) == 0):
                max_sums.insert(0, running_sum)
                max_digits.insert(0, d)
            # keep only the num_neighbors largest similarities
            if(len(max_sums) > num_neighbors):
                max_sums.pop(0)
                max_digits.pop(0)
    # majority vote among the nearest neighbors
    # (note: newer SciPy versions return a scalar mode, in which case the trailing [0] is not needed)
    return stats.mode(max_digits)[0][0]
def test():
    correct = 0
    total = 0
    for d in digits:
        d_correct = 0
        d_total = 0
        for index in ix_test:
            guess = mean_classifier(signals[d][index], 5)
            if (guess == d):
                correct += 1
                d_correct += 1
            d_total += 1
            total += 1
        print("Digit " + str(d) + " got " + str(d_correct) + "/" + str(d_total) + " for a score of " + str(np.round(d_correct/d_total * 100)) + "%.")
    print("Overall: " + str(correct) + "/" + str(total) + " for a score of " + str(np.round(correct/total * 100)) + "%.")
    return
test()
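# Sketch of the neighbor-count sweep mentioned in the Q5 answer above (left uncalled here
# because a single pass over the test set already takes several minutes; the candidate values
# are illustrative and sweep_num_neighbors is just an illustrative name).
def sweep_num_neighbors(candidates=(1, 3, 5, 7)):
    for k in candidates:
        correct = 0
        total = 0
        for d in digits:
            for index in ix_test:
                if mean_classifier(signals[d][index], k) == d:
                    correct += 1
                total += 1
        print(str(k) + " neighbors: " + str(np.round(correct/total * 100)) + "% accuracy")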
# Scratch block to sanity-check the sorted-insert logic above on dummy data
num_neighbors = 3
running_sums = [6,5,4,3,2,1]
max_sums = []
max_digits = []
d = 0
for running_sum in running_sums:
    for i in range(len(max_sums)):
        if(max_sums[i] > running_sum):
            max_sums.insert(i, running_sum)
            max_digits.insert(i, d)
            break
        if(i == len(max_sums)-1):
            max_sums.insert(i+1, running_sum)
            max_digits.insert(i+1, d)
            break
    # need to be sure we can get started
    if(len(max_sums) == 0):
        max_sums.insert(0, running_sum)
        max_digits.insert(0, d)
    if(len(max_sums) > num_neighbors):
        max_sums.pop(0)
        max_digits.pop(0)
    d += 1
print(max_sums)
print(max_digits)