import numpy as np
import numpy.random as random
from numpy.fft import fft
from scipy.io import wavfile
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
# A single call is sufficient: sns.set() re-applies the complete default theme
# each time it runs, so a preceding bare sns.set() has no lasting effect.
sns.set(font_scale=1.5)

# Directory holding the recordings; a file belongs to digit d when its name
# starts with "<d>_".
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [1, 2]  # change here to load more digits
# dictionaries that will store the waveforms and their file names, per digit
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# import files
# os.listdir returns entries in arbitrary order; sorting makes sample index i
# refer to the same recording on every machine and every run.
for filename in sorted(os.listdir(data_dir)):
    # iterate over digits
    for d in digits:
        if filename.startswith(str(d) + '_'):
            # wavfile.read returns (sample_rate, data); only the samples are kept
            wav = wavfile.read(os.path.join(data_dir, filename))[1]
            # keep mono recordings only (1-D sample arrays)
            if wav.ndim < 2:
                signals[d].append(wav)
                file_names[d].append(filename)
# find maximum of vector length (longest recording, in samples)
N = max(len(v) for d in digits for v in signals[d])

# next we split our dataset in train and test
# we will use a 80/20 random split.
# create train/test split
# create train/test split
# Size the split from the data instead of assuming exactly 100 recordings per
# digit: use the smallest per-digit count so every index is valid for all digits.
n_recordings = min(len(signals[d]) for d in digits)
# 80% of the samples go to training; integer arithmetic avoids float rounding
n_train = (4 * n_recordings) // 5
ix = np.arange(n_recordings)
random.shuffle(ix)
# select train entries
ix_train = ix[:n_train]
# select test entries
ix_test = ix[n_train:]

# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 KHz
# sampling rate is 8kHz
# sampling period (the recordings are treated as 8 kHz audio)
Ts = 1.0 / 8000
# number of coefficients kept per recording; 1500*Ts*N is where 1.5 kHz falls
# in an N-point DFT of a signal sampled every Ts seconds
ix_cut = int(np.ceil(1500 * Ts * N))
# initialize dictionary for storing transforms
transforms = {d: [] for d in digits}
# initialize dictionary for storing mean transforms
mean_transforms = {d: [] for d in digits}
# compute the mean transform of each digit over the training set.

# Code Solution to Q1 Here
for d in digits:
    # NOTE(review): fft(x, n) crops (or zero-pads) x to n samples *in time*
    # before transforming, so this is the ix_cut-point DFT of the first ix_cut
    # samples, not the 0-1.5 kHz part of the full spectrum. Keeping the low
    # band would be fft(x, N)[:ix_cut]; that change has to be made together
    # with mean_classifier, which transforms test samples the same way.
    transforms[d] = [fft(signals[d][i], ix_cut) for i in ix_train]
    # Average coefficient k over all training samples, for every k at once.
    # Bin k of the mean now lines up with bin k of each transform (the former
    # [k-1] indexing rotated the whole spectrum by one bin).
    mean_spectrum = np.mean(transforms[d], axis=0)
    # Normalize to unit energy so inner products are comparable across digits.
    mean_transforms[d] = mean_spectrum / np.linalg.norm(mean_spectrum)

# In this next part, plot the average spectral magnitude of each digit.

# Code Solution to Q2 here
# I will do it with the normalized spectral magnitude, since it is the signature of each digit
# Frequency of each plotted coefficient. The transforms are ix_cut-point FFTs
# of signals sampled every Ts seconds, so bin k sits at k/(ix_cut*Ts) Hz and
# bin 0 is 0 Hz.
# NOTE(review): if the transforms are changed to N-point FFTs truncated to
# ix_cut bins, this must become np.arange(ix_cut)/(N*Ts).
xf = np.arange(ix_cut) / (ix_cut * Ts)
# plotting results for each digit, one figure per digit
for fig_index, digit in enumerate(digits):
    plt.figure(fig_index)
    # The mean spectrum is complex: plot its magnitude explicitly instead of
    # letting matplotlib silently drop the imaginary part. ravel() accepts
    # both a 1-D array and a list of 1-element arrays.
    plt.plot(xf, np.abs(np.asarray(mean_transforms[digit])).ravel())
    plt.title('Digit='+str(digit))
    plt.xlabel('Hz')

# classifier function
# receives a sample vector, computes its inner product with each digit's average spectrum, and returns the digit with the largest inner product
# Input: sample x (vector)

# Code Q3a Here
def mean_classifier(x, templates=None, n_fft=None):
    """Return the digit whose mean spectrum is most similar to sample *x*.

    The sample is transformed with an ``n_fft``-point FFT and compared with
    every template through the inner product ``sum(y[k] * conj(m[k]))``.
    The digit with the largest real part of that inner product wins; on a
    tie the first digit in iteration order is returned.

    Parameters
    ----------
    x : sequence of numbers
        Time-domain samples of one recording.
    templates : mapping, optional
        Maps each digit to its mean spectrum, given either as a 1-D sequence
        or as a sequence of 1-element arrays. Defaults to the module-level
        ``mean_transforms``.
    n_fft : int, optional
        FFT length; must equal the template length. Defaults to the
        module-level ``ix_cut``.

    Returns
    -------
    The key of ``templates`` with the highest similarity.
    """
    if templates is None:
        templates = mean_transforms
    if n_fft is None:
        n_fft = ix_cut
    # NOTE(review): fft(x, n) crops/zero-pads x to n samples before
    # transforming; this must stay identical to how the templates were built.
    y = fft(x, n_fft)
    # np.vdot conjugates its first argument and flattens both inputs, so this
    # is sum(y[k] * conj(m[k])). Ranking by the real part makes explicit what
    # the earlier comparison of complex values amounted to (NumPy orders
    # complex numbers by real part first).
    scores = {
        digit: np.vdot(np.asarray(template, dtype=complex), y).real
        for digit, template in templates.items()
    }
    # get the key with the maximum similarity and return it
    return max(scores, key=scores.get)

# Write answer for Q3b here
# It performs fairly well. Anything above 1/#digits is better than a guess and thus satisfactory.
# For digits 1 and 2, the overall accuracy seems to gravitate towards around 0.6, after many runs.
# That means that the result is good enough.

# Code 3b Here
# apply classifier to the test set for each digit, and check how many it gets right
# Apply the classifier to every test recording of every digit and count hits.
rights = 0
acc_d = []
# use the actual test-set size rather than a hard-coded 20
n_test = len(ix_test)
for digit in digits:
    # number of test recordings of this digit that are labelled correctly
    corrects = sum(mean_classifier(signals[digit][i]) == digit for i in ix_test)
    rights += corrects
    # storing the accuracy for each digit
    acc_d.append(corrects / n_test)
# and overall accuracy
acc_tot = rights / (len(digits) * n_test)
# plotting the results on a fresh figure so they do not land on an earlier plot:
# per-digit accuracy as markers, overall accuracy as a horizontal line
plt.figure()
plt.plot(digits, acc_d, 'o', digits, acc_tot*np.ones(len(digits)))
plt.xlabel('Digit')
plt.ylabel('Accuracy')

# Write answer for Q4 here
# By increasing the number of digits, it seems that the classifier does a progressively poorer job.
# However, on average, it seems to always have a better accuracy than 1/#digits, meaning
# it is still better than a guess.

# Code Q4 here
# just copy and paste code with the digits changed (just the parts that require it)
# determine digits of interest (0 to 9)
digits = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # change here to load more digits
# dictionaries that will store the waveforms and their file names, per digit
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# import files
# os.listdir returns entries in arbitrary order; sorting makes sample index i
# refer to the same recording on every machine and every run.
for filename in sorted(os.listdir(data_dir)):
    # iterate over digits
    for d in digits:
        if filename.startswith(str(d) + '_'):
            # wavfile.read returns (sample_rate, data); only the samples are kept
            wav = wavfile.read(os.path.join(data_dir, filename))[1]
            # keep mono recordings only (1-D sample arrays)
            if wav.ndim < 2:
                signals[d].append(wav)
                file_names[d].append(filename)
# find maximum of vector length (longest recording, in samples)
N = max(len(v) for d in digits for v in signals[d])
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 KHz
# sampling rate is 8kHz
# sampling period (the recordings are treated as 8 kHz audio)
Ts = 1.0 / 8000
# number of coefficients kept per recording; 1500*Ts*N is where 1.5 kHz falls
# in an N-point DFT of a signal sampled every Ts seconds
ix_cut = int(np.ceil(1500 * Ts * N))
# initialize dictionary for storing transforms
transforms = {d: [] for d in digits}
# initialize dictionary for storing mean transforms
mean_transforms = {d: [] for d in digits}
# compute the mean transform of each digit over the training set.
for d in digits:
    # NOTE(review): fft(x, n) crops (or zero-pads) x to n samples *in time*
    # before transforming, so this is the ix_cut-point DFT of the first ix_cut
    # samples, not the 0-1.5 kHz part of the full spectrum. Keeping the low
    # band would be fft(x, N)[:ix_cut]; that change has to be made together
    # with mean_classifier, which transforms test samples the same way.
    transforms[d] = [fft(signals[d][i], ix_cut) for i in ix_train]
    # Average coefficient k over all training samples, for every k at once.
    # Bin k of the mean now lines up with bin k of each transform (the former
    # [k-1] indexing rotated the whole spectrum by one bin).
    mean_spectrum = np.mean(transforms[d], axis=0)
    # Normalize to unit energy so inner products are comparable across digits.
    mean_transforms[d] = mean_spectrum / np.linalg.norm(mean_spectrum)
# Apply the classifier to every test recording of every digit and count hits.
rights = 0
acc_d = []
# use the actual test-set size rather than a hard-coded 20
n_test = len(ix_test)
for digit in digits:
    # number of test recordings of this digit that are labelled correctly
    corrects = sum(mean_classifier(signals[digit][i]) == digit for i in ix_test)
    rights += corrects
    # storing the accuracy for each digit
    acc_d.append(corrects / n_test)
# and overall accuracy
acc_tot = rights / (len(digits) * n_test)
# plotting the results on a fresh figure so they do not land on an earlier plot:
# per-digit accuracy as markers, overall accuracy as a horizontal line
plt.figure()
plt.plot(digits, acc_d, 'o', digits, acc_tot*np.ones(len(digits)))
plt.xlabel('Digit')
plt.ylabel('Accuracy')

# Write your answer here