import numpy as np
import numpy.random as random
from numpy.fft import fft
from scipy.io import wavfile
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline
sns.set()
sns.set(font_scale=1.5)
data_dir = './recordings/'
# determine digits of interest (0 to 9)
digits = [0,1,2,3,4,5,6,7,8,9] # change here to load more digits
# dictionary that will store our values
signals = {d:[] for d in digits}
file_names = {d:[] for d in digits}
# import files
# each digit 0-9 has 100 voice recordings
# signals[0][0] gives the first recording for digit 0
# file_names[0][0] gives name of file corresponding to signals[0][0]
for filename in os.listdir(data_dir):
    # iterate over digits
    for d in digits:
        if filename.startswith(str(d)+'_'):
            wav = wavfile.read(data_dir+filename)[1]
            if len(wav.shape)<2:
                signals[d].append(wav)
                file_names[d].append(filename)
# find the maximum length of any single audio recording, across all digits and all samples
N = max([len(v) for d in digits for v in signals[d]])
# pad all signals to maximum length N
for i in range(0, len(signals)):
    for j in range(0, len(signals[i])):
        # pad each array with zeros so recording has len = N
        signals[i][j] = np.pad(signals[i][j], (0,(N-len(signals[i][j]))), 'constant', constant_values=(0))
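# Quick check (illustrative sketch, not part of the original assignment):
# after padding, every recording should now have length N.
assert all(len(v) == N for d in digits for v in signals[d])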
# next we split our dataset in train and test
# we will use a 80/20 random split.
train_digits = {}
test_digits = {}
for i in range(0, len(signals)):
    random.shuffle(signals[i])
    train_digits[i] = signals[i][:80]
    test_digits[i] = signals[i][80:]
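# Optional sanity check (a small sketch, assuming 100 recordings per digit as noted above):
# each digit should end up with 80 training and 20 test recordings.
for d in digits:
    assert len(train_digits[d]) == 80 and len(test_digits[d]) == 20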
# next we compute the average spectrum of each spoken digit in the training set.
# we will consider a window up to 1.5 kHz
# sampling rate is 8kHz
Ts = 1.0/8000
# for DFT, frequency = k/(N*Ts) Hz ---> k = frequency*N*Ts
# this is the index k corresponding to f = 1500 Hz
ix_cut = int(np.ceil(1500*(Ts*N)))
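# Quick sanity check (illustrative only): bin k maps back to frequency k/(N*Ts) Hz,
# so ix_cut should correspond to a frequency at or just above 1500 Hz.
print(ix_cut, ix_cut/(N*Ts))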
# initialize dictionary for storing transforms
transforms = {}
# initialize dictionary for storing mean transforms
mean_transforms = {}
# compute the mean transform of each digit in the training set.
# Make sure to only keep the spectrum up to 1.5kHz
# Code Solution to Q1 Here
# TRANSFORMS
# for every digit:
#   for every recording of that digit, compute the magnitude of its FFT (up to ix_cut)
#   sum these magnitudes over all recordings
#   divide by the number of recordings for that digit
#   store the result in transforms[digit]
for i in range(0, len(signals)):
    dig_sum = np.zeros(ix_cut) # array of 0s, of size ix_cut, holding the spectrum up to 1.5 kHz
    for j in range(0, len(signals[i])):
        dig_sum += np.abs(fft(signals[i][j])[0:ix_cut]) # magnitude of the FFT of the jth recording of digit i
    transforms[i] = dig_sum/len(signals[i]) # divide by the number of recordings and store the average spectrum
# MEAN_TRANSFORMS
# for every digit's average spectrum:
#   spec_sum = sum of the squared magnitudes of every element in the spectrum
#   set the digit's mean_transform to transform / spec_sum**0.5 (i.e. normalize to unit energy)
for i in range(0, len(transforms)):
    spec_sum = 0
    for j in range(0, len(transforms[i])):
        spec_sum += abs(transforms[i][j])**2
    mean_transforms[i] = transforms[i]/(spec_sum**0.5)
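# Verification sketch (illustrative): each normalized mean spectrum should have unit Euclidean norm,
# since we divided by the square root of the summed squared magnitudes.
for d in digits:
    assert np.isclose(np.linalg.norm(mean_transforms[d]), 1.0)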
# In this next part, plot the average spectral magnitude of each digit.
# adjust subplot spacing
plt.subplots_adjust(top=2.2, bottom=0.1, hspace=1.0, wspace=0.5, left=0.1, right=1.4)
# compute rows needed (dynamically based on digits) for 2 column plotting
tot = len(digits)
cols = 2
rows = tot // cols
rows += tot % cols
# position index for each subplot
posit = range(1,tot + 1)
# gets frequency axis for plots... goes from 0 to 1500 Hz
x = np.linspace(0.0, ix_cut/(Ts*N), ix_cut)
fig = plt.figure(1)
for i in range(tot):
    ax = fig.add_subplot(rows,cols,posit[i])
    y = mean_transforms[i]
    ax.plot(x,y)
    ax.set_title("Digit " + str(i))
    plt.xlabel("Hz")
plt.tight_layout()
plt.show()
# DIFFERENT PLOTTING LAYOUT
# x = np.linspace(0.0, ix_cut/(Ts*N), ix_cut) # gets frequency axis for plots
# num_subplots = len(digits)
# plt.subplots_adjust(top=(1.0+num_subplots/2), bottom=0.1, hspace=(0.1+num_subplots/10)) # change plot spacing based on number of plots
# v = 0
# for i in range(num_subplots):
#     v += 1
#     ax1 = plt.subplot(num_subplots,1,v)
#     y = mean_transforms[i]
#     ax1.plot(x,y)
#     ax1.set_title("Digit " + str(i))
#     plt.xlabel("Hz")
# plt.show()
ix_cut = int(np.ceil(1500*(Ts*N)))
# TRAIN_DATA
# use training data to find mean_transform for each digit, using globally calculated ix_cut
# takes dictionary input train_digits
# outputs dictionary of mean_transform for each digit
def train_data(train_digits):
    global ix_cut
    trans = {} # store the avg spec
    mean_trans = {} # store the norm avg spec
    # same procedure as question 1, find trans
    for i in range(0, len(train_digits)):
        dig_sum = np.zeros(ix_cut) # array of 0s, of size ix_cut, holding the spectrum up to 1.5 kHz
        for j in range(0, len(train_digits[i])):
            dig_sum += np.abs(fft(train_digits[i][j])[0:ix_cut]) # magnitude of the FFT of the jth recording of digit i
        trans[i] = dig_sum/len(train_digits[i]) # divide by the number of recordings to get the average spectrum
    # same procedure as question 1, find mean_trans
    for i in range(0, len(trans)):
        spec_sum = 0
        for j in range(0, len(trans[i])):
            spec_sum += abs(trans[i][j])**2
        mean_trans[i] = trans[i]/(spec_sum**0.5)
    return mean_trans
# MEAN_CLASSIFIER
# for an input recording x, compute its spectrum X[k]
# take the inner product of |X[k]| against each digit's normalized mean spectrum X'[k]
# the digit whose similarity score is highest is our guess
def mean_classifier(x, train_data):
    global ix_cut
    global N
    # pad signal with 0s to match global N
    x = np.pad(x, (0,(N-len(x))), 'constant', constant_values=(0))
    # calculate the spectrum and trim it to 1.5 kHz
    x_k = fft(x)[0:ix_cut]
    # compute similarity
    sim = {}
    for i in range(0, len(train_data)):
        sim[i] = np.dot(np.abs(train_data[i]), np.abs(x_k)) # dot product of the digit's normalized avg spectrum and |X[k]|
    # return the key (i.e. digit) with the highest similarity score
    return max(sim, key=sim.get)
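# Minimal usage sketch (illustrative; 'demo_model' is a hypothetical name, not part of the assignment):
# build the mean spectra from the training set and classify one held-out recording of digit 3.
demo_model = train_data(train_digits)
print("Predicted digit:", mean_classifier(test_digits[3][0], demo_model))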
# Write answer for Q3b here
# store correct/incorrect count
accur = {}
mean_digits = train_data(train_digits) # compute normalized mean spectra from the training set
# iterate over all test digits, and count correct/incorrect guesses
for i in range(0, len(test_digits)):
    cor = 0
    incor = 0
    for j in range(0, len(test_digits[i])):
        guess = mean_classifier(test_digits[i][j], mean_digits)
        if guess == i:
            cor += 1
        else:
            incor += 1
    accur[i] = round(cor/(cor+incor), 2) # accuracy per digit, rounded to 2 decimal places
print("Accuracy per digit:")
print(accur)
Accuracy per digit:
{0: 0.75, 1: 0.4, 2: 0.65, 3: 0.1, 4: 0.75, 5: 0.95, 6: 0.7, 7: 0.7, 8: 0.4, 9: 0.45}
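# Overall accuracy sketch (illustrative; a simple mean of the per-digit values is valid here
# because every digit has the same number of test recordings).
print("Overall accuracy:", round(sum(accur.values())/len(accur), 2))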
def train_data_2(train_digits):
    global ix_cut
    global N
    trans = {} # store the avg spec
    mean_trans = {} # store the norm avg spec
    # same procedure as question 1, but over the full spectrum of length N instead of cutting at ix_cut
    for i in range(0, len(train_digits)):
        dig_sum = np.zeros(N) # array of 0s, of size N
        for j in range(0, len(train_digits[i])):
            dig_sum += np.abs(fft(train_digits[i][j])[0:N]) # magnitude of the FFT of the jth recording of digit i
        trans[i] = dig_sum/len(train_digits[i]) # divide by the number of recordings to get the average spectrum
    # same procedure as question 1, find mean_trans
    for i in range(0, len(trans)):
        spec_sum = 0
        for j in range(0, len(trans[i])):
            spec_sum += abs(trans[i][j])**2
        mean_trans[i] = trans[i]/(spec_sum**0.5)
    return mean_trans
def mean_classifier_2(x, train_data):
    global ix_cut
    global N
    # pad signal with 0s to match global N
    x = np.pad(x, (0,(N-len(x))), 'constant', constant_values=(0))
    # x = x + x*x
    # calculate the full spectrum (no cutoff) and normalize it by its sum
    x_k = fft(x)[0:N]
    X_K_N = x_k/np.sum(x_k)
    # compute similarity
    sim = {}
    for i in range(0, len(train_data)):
        hold = np.dot(np.abs(train_data[i]), np.abs(X_K_N)) # dot product of the digit's normalized avg spectrum and normalized |X[k]|
        sim[i] = (hold + 0.5*hold/(np.sum(train_data[i])))/2 # add the inner product to a scaled version of itself
    # return the key (i.e. digit) with the highest similarity score
    return max(sim, key=sim.get)
accur = {}
mean_digits = train_data_2(train_digits) # compute normalized mean spectra (full length) from the training set
# iterate over all test digits, and count correct/incorrect guesses
for i in range(0, len(test_digits)):
    cor = 0
    incor = 0
    for j in range(0, len(test_digits[i])):
        guess = mean_classifier_2(test_digits[i][j], mean_digits)
        if guess == i:
            cor += 1
        else:
            incor += 1
    accur[i] = round(cor/(cor+incor), 2) # accuracy per digit, rounded to 2 decimal places
print("Accuracy per digit:")
print(accur)
print("For most digits, we see slight improvement or no change in accuracy.")
print("We considered the entire spectrum of each signal (instead of ix_cut), computed inner product ")
print("with the normalized mean average spectrum of the input signal (instead of unormalized), and scaled each")
print("similarity score by the sum of all values in the training data's normalized mean average spectrum.")
Accuracy per digit:
{0: 0.8, 1: 0.45, 2: 0.65, 3: 0.15, 4: 0.8, 5: 0.95, 6: 0.7, 7: 0.75, 8: 0.55, 9: 0.45}