# Spoken-digit classification: load WAV recordings, average their FFT
# magnitude spectra per digit, and classify by maximum inner product.
import numpy as np
import numpy.random as random
from numpy.fft import fft
from scipy.io import wavfile
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Apply seaborn's default theme; second call enlarges fonts for readability.
sns.set()
sns.set(font_scale=1.5)
# Directory containing the spoken-digit WAV recordings.
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [1, 2]  # change here to load more digits
# Per-digit storage for the raw signals and their source file names.
signals = {d: [] for d in digits}
file_names = {d: [] for d in digits}
# Import files: recordings are named "<digit>_..."; keep mono (1-D) only.
for fname in os.listdir(data_dir):
    for digit in digits:
        if not fname.startswith(str(digit) + '_'):
            continue
        samples = wavfile.read(data_dir + fname)[1]
        # stereo files have a second axis; skip them
        if samples.ndim < 2:
            signals[digit].append(samples)
            file_names[digit].append(fname)
# Length of the longest loaded recording (used as the FFT size later).
N = max(len(v) for d in digits for v in signals[d])
# Split the 100 recordings per digit into train/test with a random 80/20 split.
ix = np.arange(100)
random.shuffle(ix)
# first 80 shuffled indices train, remaining 20 test
ix_train = ix[:80]
ix_test = ix[80:]
# Next we compute the average spectrum of each spoken digit in the training
# set, keeping only the window up to 1.5 kHz.
# sampling rate is 8 kHz
Ts = 1.0 / 8000
# Index of the FFT bin closest to 1.5 kHz for a length-N transform.
ix_cut = int(np.ceil(1500 * Ts * N))
# initialize dictionary for storing transforms
transforms = {}
# initialize dictionary for storing mean (unit-norm) transforms
mean_transforms = {}
# Code Solution to Q1 Here
# Dictionary for non-normalized average spectral magnitudes
transforms_not_normalized = {}
for i in digits:
    # |FFT| of every training signal for digit i, zero-padded to length N and
    # truncated at the 1.5 kHz bin (ix_cut inclusive).
    mags = np.array([np.abs(fft(signals[i][j], n=N)[:ix_cut + 1])
                     for j in ix_train])
    # FIX: original divided by a hard-coded 80; use len(ix_train) so the
    # average stays correct if the split size changes.
    av_mag = mags.mean(axis=0)
    transforms_not_normalized[i] = av_mag
    # Normalize to unit Euclidean norm for the inner-product classifier.
    mean_transforms[i] = av_mag / np.linalg.norm(av_mag)
# In this next part, plot the average spectral magnitude of each digit.
# Code Solution to Q2 here
%matplotlib inline
xf = np.linspace(0, 1500, ix_cut+1)
for d in [1,2]:
y_axis = transforms_not_normalized[d]
plt.plot(xf, y_axis)
plt.show()
# classifier function
# Receives a signal vector, computes the inner product of its magnitude
# spectrum (up to 1.5 kHz) with each digit's mean spectrum, and returns the
# digit whose mean spectrum gives the largest inner product.
# Input: sample x (vector)
# Returns: predicted digit label (an element of `digits`)
def mean_classifier(x):
    # Magnitude spectrum of x, zero-padded to N samples, truncated at 1.5 kHz.
    mag_x = np.abs(fft(x, n=N)[:ix_cut + 1])
    similarity = {}
    for d in digits:
        # FIX: the original looped over range(ix_cut) and silently dropped the
        # last frequency bin; mean_transforms holds ix_cut+1 bins, so use the
        # full vectors via a dot product.
        similarity[d] = float(np.dot(mag_x, np.abs(mean_transforms[d])))
    return max(similarity, key=similarity.get)
# Write answer for Q3b here
# Evaluate the mean classifier on the held-out test indices.
# counter[k] counts correct predictions for digits[k]. Indexing by position
# (instead of the original counter[i-1]) keeps this correct for any digit set,
# including ones containing 0; dividing by len(ix_test) avoids the hard-coded 20.
counter = [0] * len(digits)
for pos, d in enumerate(digits):
    for j in ix_test:
        if mean_classifier(signals[d][j]) == d:
            counter[pos] += 1
for pos, d in enumerate(digits):
    perc = counter[pos] / len(ix_test) * 100
    print("digit ", d, " accuracy: ", perc, "%")
# Write answer for Q4 here
%matplotlib inline
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [0,1,2,3,4,5,6,7,8,9] # change here to load more digits
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
for filename in os.listdir(data_dir):
# iterate over digits
for d in digits:
if filename.startswith(str(d)+'_'):
wav = wavfile.read(data_dir+filename)[1]
if len(wav.shape)<2:
signals[d].append(wav)
file_names[d].append(filename)
# find maximum of vector length
N = max([len(v) for d in digits for v in signals[d]])
# Split the 100 recordings per digit into train/test with a random 80/20 split.
ix = np.arange(100)
random.shuffle(ix)
# first 80 shuffled indices train, remaining 20 test
ix_train = ix[:80]
ix_test = ix[80:]
# Next we compute the average spectrum of each spoken digit in the training
# set, keeping only the window up to 1.5 kHz.
# sampling rate is 8 kHz
Ts = 1.0 / 8000
# Index of the FFT bin closest to 1.5 kHz for a length-N transform.
ix_cut = int(np.ceil(1500 * Ts * N))
# initialize dictionary for storing transforms
transforms = {}
# initialize dictionary for storing mean (unit-norm) transforms
mean_transforms = {}
# Dictionary for non-normalized average spectral magnitudes
transforms_not_normalized = {}
for i in digits:
    # |FFT| of every training signal for digit i, zero-padded to length N and
    # truncated at the 1.5 kHz bin (ix_cut inclusive).
    mags = np.array([np.abs(fft(signals[i][j], n=N)[:ix_cut + 1])
                     for j in ix_train])
    # FIX: original divided by a hard-coded 80; use len(ix_train) so the
    # average stays correct if the split size changes.
    av_mag = mags.mean(axis=0)
    transforms_not_normalized[i] = av_mag
    # Normalize to unit Euclidean norm for the inner-product classifier.
    mean_transforms[i] = av_mag / np.linalg.norm(av_mag)
# Plot the average spectral magnitude of every digit on shared axes.
xf = np.linspace(0, 1500, ix_cut + 1)
for digit in digits:
    plt.plot(xf, transforms_not_normalized[digit])
plt.show()
# Classifier: inner product of the sample's magnitude spectrum (up to 1.5 kHz)
# with each digit's unit-norm mean spectrum; returns the best-matching digit.
# Input: sample x (vector)
# Returns: predicted digit label (an element of `digits`)
def mean_classifier(x):
    # Magnitude spectrum of x, zero-padded to N samples, truncated at 1.5 kHz.
    mag_x = np.abs(fft(x, n=N)[:ix_cut + 1])
    similarity = {}
    for d in digits:
        # FIX: the original looped over range(ix_cut) and silently dropped the
        # last frequency bin; mean_transforms holds ix_cut+1 bins, so use the
        # full vectors via a dot product.
        similarity[d] = float(np.dot(mag_x, np.abs(mean_transforms[d])))
    return max(similarity, key=similarity.get)
# Evaluate the classifier on all ten digits.
# BUG FIX: the original used counter[i-1], so for digit 0 the count wrapped to
# counter[-1] and clobbered digit 9's tally, and the report printed labels i+1
# (1..10) instead of the actual digits (0..9). Index by position instead, and
# divide by len(ix_test) rather than a hard-coded 20.
counter = [0] * len(digits)
for pos, d in enumerate(digits):
    for j in ix_test:
        if mean_classifier(signals[d][j]) == d:
            counter[pos] += 1
for pos, d in enumerate(digits):
    perc = counter[pos] / len(ix_test) * 100
    print("digit ", d, " accuracy: ", perc, "%")
# Code Q5 here
!pip install -U notebook-as-pdf
!pyppeteer-install