import numpy as np
import numpy.random as random
from numpy.fft import fft, fftfreq #added freq
from scipy.io import wavfile
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
sns.set()
sns.set(font_scale=1.5)
data_dir = './recordings/'
# determine digits of interest (0 to 9)
#digits = [d for d in range(10)] # uncomment to load all ten digits (done below for Q4)
digits = [1,2]
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
# iterate over digits
for d in digits:
if filename.startswith(str(d)+'_'):
wav = wavfile.read(data_dir+filename)[1]
            # wavfile.read returns (sample_rate, data); wav.shape has length 1 for a mono
            # recording and 2 for a stereo one (samples x channels), and its values are the
            # amplitude samples. Only mono recordings are kept below.
if len(wav.shape)<2:
signals[d].append(wav)
file_names[d].append(filename)
# find the maximum signal length (used as the FFT size)
N = max([len(v) for d in digits for v in signals[d]])
fftfreq(N) # normalized bin frequencies; bin k corresponds to k/(N*Ts) Hz physically
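# Optional sanity-check sketch (not part of the assignment): numpy's fftfreq also accepts
# the sample spacing, so passing d = 1/8000 gives each bin's frequency in Hz directly.
# The names fs_hz and freqs_hz below are illustrative only.
fs_hz = 8000                        # sampling rate, stated again in the cell below
freqs_hz = fftfreq(N, d=1.0/fs_hz)  # bin k maps to k*fs_hz/N Hz for k < N/2
#print(freqs_hz[:4])                # starts at 0 Hz and steps by fs_hz/N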
# next we split our dataset into train and test
# we will use an 80/20 random split.
# create train/test split
ix = np.arange(100)
random.shuffle(ix)
# select train entries
ix_train = ix[:80]
#select test entries
ix_test = ix[80:]
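# Hedged aside (a sketch, not the assignment's method): because the shuffle above is
# unseeded, the split and the accuracies reported for Q3b change on every rerun. A seeded
# Generator would make the 80/20 split reproducible; the variable names below are
# illustrative only.
#rng = np.random.default_rng(0)
#ix_fixed = rng.permutation(100)
#ix_train, ix_test = ix_fixed[:80], ix_fixed[80:]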
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8 kHz
Ts = 1.0/8000
ix_cut = int(np.ceil(1500*Ts*N)) # = 1146
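# Worked check of the cutoff: bin k sits at k/(N*Ts) Hz, so the bin at 1.5 kHz is
# k = 1500*N*Ts = 1500 * 6112 / 8000 = 1146 exactly for N = 6112 (the value noted below),
# which matches ix_cut.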
# initialize dictionary for storing transforms
transforms = {}
# initialize dictionary for storing mean transforms
mean_transforms = {}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
#print(ix_cut)
# Code Solution to Q1 Here
avg_not_normalized = {d: [] for d in digits}
avg_normalized = {d: [] for d in digits}
#norms = {d: [] for d in digits}
for d in digits: #for every digit
transforms[d] = [] # store all instance FFTs in a list
for index in ix_train:
#if (d==1 and index == 1):
# temp1 = fft(signals[d][index], n = N)
# print(temp1)
temp = fft(signals[d][index], n = N) # compute FFT for each training instance of that digit
transforms[d].append(temp[0: ix_cut + 1]) # FFTs are stored as list of lists
#for transform in transforms[d]:
# norms[d].append(np.abs(transform))
    # a real DT signal with N samples has N FFT coefficients whose magnitudes are symmetric in k.
    # here N = 6112 (even), so the bins sit at frequencies k/(N*Ts) for
    # k = -N/2, ..., -1, 0, 1, ..., N/2 - 1 (N/2 = 3056).
    # we only keep k = 0, ..., ix_cut (= 1146), i.e. the bins up to 1.5 kHz; the negative-k
    # bins carry the same magnitudes, so nothing is lost for this classifier.
    # numpy orders the output as f = [0, 1, ..., n/2-1, -n/2, ..., -1] / (d*n) for even n,
    # i.e. fft_return = [coeff(0), coeff(1), ..., coeff(3055), coeff(-3056), ..., coeff(-1)]
#once we have the transforms of instances for that digit, take avg
for k in range(ix_cut+1): # 0,..,ix_cut
        # avg_not_normalized[d][k] = (1/len(ix_train)) * sum over i of |transforms[d][i][k]|
summer = 0
for i in range(len(ix_train)):
summer += np.abs(transforms[d][i][k])
avg_not_normalized[d].append(summer/len(ix_train))
#now normalize the avgs
x_norm = np.linalg.norm(avg_not_normalized[d])
for k in range(ix_cut+1):
avg_normalized[d].append(avg_not_normalized[d][k]/x_norm)
mean_transforms[d] = avg_normalized[d]
#print(mean_transforms[d])
#print(mean_transforms[1])
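# Hedged vectorized equivalent of the Q1 loops above (a sketch, not the required solution):
# stacking the windowed FFTs, averaging their magnitudes per bin, and normalizing by the
# Euclidean norm reproduces mean_transforms.
for d in digits:
    stacked_mags = np.abs(np.array(transforms[d]))  # shape (len(ix_train), ix_cut+1)
    mean_mag = stacked_mags.mean(axis=0)            # average magnitude per frequency bin
    assert np.allclose(mean_mag / np.linalg.norm(mean_mag), mean_transforms[d])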
# In this next part, plot the average spectral magnitude of each digit.
# Code Solution to Q2 here
w = np.linspace(0, 1500, ix_cut+1)
for d in digits:
    plt.plot(w, mean_transforms[d])
    plt.title(f"Mean spectral magnitude of digit {d}")
    plt.show()
# classifier function
# receives a signal vector, computes the inner product of its spectrum with each digit's
# mean spectrum, and returns the digit with the largest inner product
# Input: sample x (vector)
def mean_classifier(x):
# Code Q3a Here
x_fft = fft(x, n = N)[0:ix_cut+1] #get transform of x and look at window till ix_cut
p = {} # p[d] is p(X, Xd)
for d in digits:
summer = 0
for k in range(ix_cut+1):
x_norm = abs(x_fft[k])
xd_norm = abs(mean_transforms[d][k])
summer += x_norm * xd_norm
p[d] = summer
# find match based on max similarity
match = max(p, key=p.get)
return match
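# Quick usage sketch: classify a single held-out recording of digit 1; given the Q3b
# accuracies noted below, the prediction should usually also be 1. Which file ends up at
# ix_test[0] varies from run to run.
print("sample prediction for a held-out digit-1 recording:", mean_classifier(signals[1][ix_test[0]]))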
# Write answer for Q3b here
# The accuracy for digit 1 is 100.0 % and the accuracy for digit 2 is 85.0 %.
# However, these accuracies change each time I rerun the cell that splits the signals into ix_train and ix_test.
# The best accuracy percentages I got were 100% and 90%.
# Code 3b Here
for d in digits:
correct = 0
for i in ix_test:
if (mean_classifier(signals[d][i]) == d):
correct += 1
#print(f'{mean_classifier(signals[d][i])}, {d}')
accuracy = correct * 100/ (len(ix_test))
print(f"{d}, accuracy is {accuracy} %")
# Write answer for Q4 here
''' The accuracy gets worse when we add more digits (digits from 0 to 9):
0, accuracy is 60.0 %
1, accuracy is 35.0 %
2, accuracy is 75.0 %
3, accuracy is 50.0 %
4, accuracy is 80.0 %
5, accuracy is 95.0 %
6, accuracy is 75.0 %
7, accuracy is 85.0 %
8, accuracy is 50.0 %
9, accuracy is 80.0 %
Because we got better accuracy with just digits 1 and 2, this tells us that digits 1 and 2
have very different FFTs, which made distinguishing between them straightforward. Adding digits
means that every test signal now has more similar-looking training spectra to compete with than
when we had only two digits, which worsened the accuracy. '''
# Code Q4 here
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [d for d in range(10)] # load all ten digits for Q4
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
# iterate over digits
for d in digits:
if filename.startswith(str(d)+'_'):
wav = wavfile.read(data_dir+filename)[1]
            # keep only the mono recordings (wav.shape has length 1), as in the cell above
if len(wav.shape)<2:
signals[d].append(wav)
file_names[d].append(filename)
# find the maximum signal length (used as the FFT size)
N = max([len(v) for d in digits for v in signals[d]])
fftfreq(N)
# next we split our dataset into train and test
# we will use an 80/20 random split.
# create train/test split
ix = np.arange(100)
random.shuffle(ix)
# select train entries
ix_train = ix[:80]
#select test entries
ix_test = ix[80:]
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8 kHz
Ts = 1.0/8000
ix_cut = int(np.ceil(1500*Ts*N)) # = 1146
# initialize dictionary for storing transforms
transforms = {}
# initialize dictionary for storing mean transforms
mean_transforms = {}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
#print(ix_cut)
# Code Solution to Q1 Here
avg_not_normalized = {d: [] for d in digits}
avg_normalized = {d: [] for d in digits}
#norms = {d: [] for d in digits}
for d in digits: #for every digit
transforms[d] = [] # store all instance FFTs in a list
for index in ix_train:
#if (d==1 and index == 1):
# temp1 = fft(signals[d][index], n = N)
# print(temp1)
temp = fft(signals[d][index], n = N)
transforms[d].append(temp[0: ix_cut + 1]) # FFTs are stored as list of lists
#for transform in transforms[d]:
# norms[d].append(np.abs(transform))
    # frequency layout of the FFT bins: see the notes in the Q1 cell above;
    # again we keep only bins k = 0, ..., ix_cut (up to 1.5 kHz).
#once we have the transforms of instances for that digit, take avg
for k in range(ix_cut+1): # 0,..,ix_cut
        # avg_not_normalized[d][k] = (1/len(ix_train)) * sum over i of |transforms[d][i][k]|
summer = 0
for i in range(len(ix_train)):
summer += np.abs(transforms[d][i][k])
avg_not_normalized[d].append(summer/len(ix_train))
#now normalize the avgs
x_norm = np.linalg.norm(avg_not_normalized[d])
for k in range(ix_cut+1):
avg_normalized[d].append(avg_not_normalized[d][k]/x_norm)
mean_transforms[d] = avg_normalized[d]
# Evaluate the mean classifier on all ten digits (same procedure as Q3b)
for d in digits:
correct = 0
for i in ix_test:
if (mean_classifier(signals[d][i]) == d):
correct += 1
#print(f'{mean_classifier(signals[d][i])}, {d}')
accuracy = correct * 100/ (len(ix_test))
print(f"{d}, accuracy is {accuracy} %")
# Code Q5 here
# Nearest-neighbor classifiers: a k-NN version (majority vote among the k best-matching
# training instances per digit) and a plain 1-NN version below.
# k-NN classifier: computes the spectral inner product of x with every training instance
# and takes a majority vote among the `degree` best-matching instances.
def knn_classifier(x, degree):
    x_fft = fft(x, n = N)[0:ix_cut+1] # transform of x, windowed up to ix_cut
    p = {d: [] for d in digits} # p[d] lists p(X, Xd) for every training instance of d
    top_products = [] # the `degree` largest products of each digit, appended in digit order
    for d in digits:
        for instance in transforms[d]:
            summer = 0
            for k in range(ix_cut+1):
                summer += abs(x_fft[k]) * abs(instance[k])
            p[d].append(summer)
        digit_products_sorted = sorted(p[d], reverse = True)
        for i in range(degree):
            top_products.append(digit_products_sorted[i]) # add the top `degree` products from that digit
    # find the digits owning the overall `degree` largest products
    top_products_sorted = sorted(top_products, reverse = True)
    best_digits = []
    for i in range(degree):
        best_digits.append(digits[top_products.index(top_products_sorted[i]) // degree])
    # majority vote: return the most frequent digit among the best matches
    frequencies = {}
    for element in best_digits:
        frequencies[element] = best_digits.count(element)
    match = max(frequencies, key=frequencies.get)
    return match
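# Quick usage sketch for the k-NN variant: vote among the 3 best-matching training
# instances for one held-out recording (the true digit here is 3, chosen arbitrarily).
print("3-NN prediction for a held-out digit-3 recording:", knn_classifier(signals[3][ix_test[0]], 3))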
# 1-nn classifier
def nn_classifier(x):
maxes = {}
    # same spectral inner product as in mean_classifier (Q3a), but against every training instance
x_fft = fft(x, n = N)[0:ix_cut+1] #get transform of x and look at window till ix_cut
p = {d: [] for d in digits} # p[d] is a list p(X, Xd) for all instances of d in training set
for d in digits:
for instance in transforms[d]:
summer = 0
for k in range(ix_cut+1):
x_norm = abs(x_fft[k])
xd_norm = abs(instance[k])
summer += x_norm * xd_norm
p[d].append(summer)
maxes[d] = max(p[d])
# find match based on max similarity
match = max(maxes, key=maxes.get)
return match
# Evaluate the 1-NN classifier on the test set (same procedure as Q3b)
for d in digits:
correct = 0
for i in ix_test:
if (nn_classifier(signals[d][i]) == d):
correct += 1
#print(f'{mean_classifier(signals[d][i])}, {d}')
accuracy = correct * 100/ (len(ix_test))
print(f"{d}, accuracy is {accuracy} %")
'''0, accuracy is 90.0 %
1, accuracy is 0.0 %
2, accuracy is 0.0 %
3, accuracy is 5.0 %
4, accuracy is 45.0 %
5, accuracy is 10.0 %
6, accuracy is 55.0 %
7, accuracy is 0.0 %
8, accuracy is 5.0 %
9, accuracy is 25.0 % '''
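# Possible explanation (hedged, not from the assignment): unlike mean_transforms, the
# individual training spectra in transforms[] are not normalized, so the raw inner product
# favors high-energy training instances and most test signals get matched to whichever
# digit happens to have the loudest/longest recordings, which would explain the very low
# 1-NN accuracies above. A cosine-similarity variant that divides each product by the norms
# of both spectra is sketched below (illustrative name, same loop structure as nn_classifier).
def nn_classifier_cosine(x):
    x_mag = np.abs(fft(x, n = N)[0:ix_cut+1])
    x_mag = x_mag / np.linalg.norm(x_mag)        # unit-normalize the test spectrum
    best_sim, best_digit = -1.0, None
    for d in digits:
        for instance in transforms[d]:
            mag = np.abs(instance)
            mag = mag / np.linalg.norm(mag)      # unit-normalize each training spectrum
            sim = float(np.dot(x_mag, mag))      # cosine similarity of magnitude spectra
            if sim > best_sim:
                best_sim, best_digit = sim, d
    return best_digit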