import matplotlib.pyplot as plt
import numpy as np
import librosa, librosa.display, librosa.feature
import IPython.display as ipd
import pandas as pd
from google.colab import drive
# Mount Google Drive; the Drive-based path below is only usable after this
# succeeds (i.e. when the notebook runs inside Google Colab).
drive.mount('/content/gdrive')

bird_sound_file = "./data/audio_files/759808e5-f824-401e-9058.wav"
# bird_sound_file = "/content/gdrive/MyDrive/audio_files/759808e5-f824-401e-9058.wav"
bird, bird_sr = librosa.load(bird_sound_file)  # samples and sample rate
# may not work in vscode
ipd.Audio(bird_sound_file)
bird

# --- synthesize a pure 440 Hz tone for comparison ---
T = 1.5  # duration in seconds
sr = 22050  # sample rate
num_samples = int(T * sr)  # time * sample rate gives us number of samples
freq = 440
# endpoint=False keeps the spacing between samples at exactly 1/sr; including
# the endpoint would spread num_samples points over num_samples-1 intervals
# and slightly detune the generated tone.
t = np.linspace(0, T, num_samples, endpoint=False)
A_note = 0.3 * np.sin(2 * np.pi * freq * t)
ipd.Audio(A_note, rate=sr)
plt.plot(t, A_note)
plt.xlim((0, 4 / freq))  # plot 4 cycles of the sinewave
plt.show()

# visualize the waveform of the bird recording
plt.figure(figsize=(15, 5))
# waveshow replaces waveplot, which is no longer available in current librosa
librosa.display.waveshow(bird, sr=bird_sr, alpha=0.5)

# framing parameters shared by all frame-based features below
FRAME_SIZE = 1024
HOP_LENGTH = 512
def amplitude_envelope(signal, frame_size, hop_length=0):
    """Return the per-frame maximum sample value of ``signal``.

    Frames start every ``hop_length`` samples and span ``frame_size``
    samples; frames near the end of the signal are simply shorter.

    Args:
        signal: 1-D numpy array of samples.
        frame_size: number of samples per frame.
        hop_length: distance between frame starts. ``0`` (the default)
            means "use ``frame_size``", i.e. non-overlapping frames.

    Returns:
        numpy array holding one maximum per frame.
    """
    # a hop of 0 is the sentinel for non-overlapping frames
    step = frame_size if hop_length == 0 else hop_length
    peaks = []
    for start in range(0, len(signal), step):
        frame = signal[start:start + frame_size]
        peaks.append(frame.max())
    return np.array(peaks)
# --- amplitude envelope ---
bird_ae = amplitude_envelope(bird, FRAME_SIZE, HOP_LENGTH)
np.argmax(bird_ae)
# make the time vector (one entry per frame, in seconds)
frames = range(0, bird_ae.size)
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)
# overlay the envelope on the waveform
plt.figure(figsize=(15, 10))
# waveshow replaces waveplot, which is no longer available in current librosa
librosa.display.waveshow(bird, sr=bird_sr, alpha=0.5)
plt.plot(t, bird_ae, color="r")

# --- root-mean-square energy ---
# [0] selects the first row of the array returned by librosa
bird_rms = librosa.feature.rms(y=bird, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH)[0]
bird_rms
# make the time vector
frames = range(len(bird_rms))
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)
# overlay the RMS curve on the waveform
plt.figure(figsize=(15, 10))
librosa.display.waveshow(bird, sr=bird_sr, alpha=0.5)
plt.plot(t, bird_rms, color="r")

# --- zero-crossing rate ---
bird_zcr = librosa.feature.zero_crossing_rate(y=bird, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH)[0]
# make the time vector
frames = range(len(bird_zcr))
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)
# overlay the zero-crossing rate on the waveform
plt.figure(figsize=(15, 10))
librosa.display.waveshow(bird, sr=bird_sr, alpha=0.5)
plt.plot(t, bird_zcr, color="r")
plt.ylim([0, 1])
# a freq_ratio of 0.5 keeps only the bins from 0 Hz up to the Nyquist frequency (sr / 2)
def freq_spectrum(signal, sr, freq_ratio=0.5):
    """Return the leading part of a signal's FFT magnitude spectrum.

    Args:
        signal: 1-D sequence of samples.
        sr: sample rate of ``signal`` in Hz.
        freq_ratio: fraction of the FFT bins to keep, counted from 0 Hz.
            The default of 0.5 keeps the bins below the Nyquist
            frequency (``sr / 2``).

    Returns:
        Tuple ``(freq_bins, mag)``: the frequency in Hz of each kept bin
        and the corresponding magnitudes, both of the same length.
    """
    mag = np.abs(np.fft.fft(signal))
    n_fft = len(mag)
    # DFT bin k lies at k * sr / n_fft, so the axis has to stop one bin
    # short of sr; an endpoint-inclusive linspace would stretch every bin
    # frequency by a factor of n_fft / (n_fft - 1).
    freq_bins = np.linspace(0, sr, n_fft, endpoint=False)
    num_bins = int(n_fft * freq_ratio)
    return freq_bins[:num_bins], mag[:num_bins]
# --- magnitude spectrum of the whole clip ---
freq_bins, bird_mag_spectrum = freq_spectrum(bird, bird_sr)
bird_mag_spectrum
plt.plot(freq_bins, bird_mag_spectrum)

# --- spectral centroid ---
# sr is passed explicitly so the centroid uses the clip's own sample rate,
# in line with the bandwidth call below
bird_sc = librosa.feature.spectral_centroid(y=bird, sr=bird_sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH)[0]
# make the time vector (one entry per frame, in seconds)
frames = range(len(bird_sc))
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)
# plot the centroid over time
plt.figure(figsize=(15, 10))
plt.plot(t, bird_sc, color="r")

# --- spectral bandwidth ---
bird_bw = librosa.feature.spectral_bandwidth(y=bird, sr=bird_sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH)[0]
# make the time vector
frames = range(len(bird_bw))
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)
# plot the bandwidth over time
plt.figure(figsize=(15, 10))
plt.plot(t, bird_bw, color="r")
# read the labels csv, return a Series with itemid as index, hasbird as value
def read_labels():
    """Load the labels CSV and return its ``hasbird`` column.

    Returns:
        pandas Series indexed by ``itemid`` whose values are the
        ``hasbird`` flags parsed as booleans.
    """
    labels_path = "./data/labels.csv"
    # labels_path = "/content/gdrive/MyDrive/labels.csv"
    table = pd.read_csv(
        labels_path,
        index_col="itemid",
        dtype={"hasbird": bool},
    )
    # only the label column is returned; the datasetid is not needed
    return table.hasbird
# load the labels once: a Series mapping each itemid to its hasbird flag
data_labels = read_labels()
# peek at the label stored for the first itemid
data_labels.loc[data_labels.index[0]]
# turn an audio sample into features
def featurize(signal, sr, num_windows=10):
    """Turn one audio clip into a fixed-length feature vector.

    The vector is laid out as
    ``[max spectral centroid, mean zero-crossing rate, mean RMS,
    bandwidth sample 0, ..., bandwidth sample num_windows-1]``,
    so it has ``3 + num_windows`` entries.

    All frame-based features use the module-level ``FRAME_SIZE`` and
    ``HOP_LENGTH``.

    Args:
        signal: audio samples as accepted by librosa's ``y`` argument.
        sr: sample rate of ``signal`` in Hz.
        num_windows: how many evenly spaced frames of the spectral
            bandwidth curve to include in the vector.

    Returns:
        1-D numpy array of features.
    """
    # spectral centroid -- sr is forwarded so the result is in Hz of the
    # actual recording rather than librosa's default rate
    sc = librosa.feature.spectral_centroid(
        y=signal, sr=sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH
    )[0]
    max_sc = np.max(sc)

    # bandwidth, sampled at num_windows evenly spaced frame indices so that
    # clips of different length yield vectors of the same size
    bw = librosa.feature.spectral_bandwidth(
        y=signal, sr=sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH
    )[0]
    window_i = np.linspace(0, len(bw) - 1, num_windows, dtype=int)
    window_bw = bw[window_i]

    # zero crossing rate
    zcr = librosa.feature.zero_crossing_rate(
        y=signal, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH
    )[0]
    mean_zcr = np.mean(zcr)

    # root mean square
    rms = librosa.feature.rms(
        y=signal, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH
    )[0]
    mean_rms = np.mean(rms)

    return np.hstack((max_sc, mean_zcr, mean_rms, window_bw))
# Build the feature matrix X and label vector Y with a plain for-loop.
# Timing notes: ~9s for 20 samples, ~46s for all 100 samples.
sample_ids = data_labels[:20].index
feature_rows = []
label_rows = []
for audio_id in sample_ids:
    audio_file = f"./data/audio_files/{audio_id}.wav"
    # audio_file = f"/content/gdrive/MyDrive/audio_files/{audio_id}.wav"
    audio, audio_sr = librosa.load(audio_file)  # samples and sample rate
    feature_rows.append(featurize(audio, audio_sr))
    label_rows.append(data_labels[audio_id])
# one row per clip in X, one label per clip in Y
X = np.array(feature_rows)
Y = np.array(label_rows)
print(X.shape, Y.shape)
# the pandas / numpy way (disabled): it produces the wrong dimensions -- X comes out as a 1-D numpy array whose elements are themselves numpy arrays
# def process_audio(row):
# audio_id = row.itemid
# audio_file = f"/content/gdrive/MyDrive/audio_files/{audio_id}.wav"
# audio,audio_sr = librosa.load(audio_file) # file and samplerate
# x = featurize(audio,audio_sr)
# y = row.hasbird
# return x
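# the .to_numpy() call may be the cause of the wrong shape; stacking the per-row arrays (e.g. with np.stack) might give the expected 2-D matrix -- untested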
# X = data_labels[:20].reset_index().apply(process_audio, axis=1).to_numpy()
# Y = data_labels[:20].values
# evaluate different models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut, KFold

# A fixed seed makes kf.split() return the same shuffled folds on every call,
# so all classifiers are scored on identical train/test partitions and the
# comparison is repeatable between runs.
kf = KFold(n_splits=3, shuffle=True, random_state=0)
classifiers = [
    SVC(kernel='poly'),
    LinearSVC(loss='hinge', random_state=0, C=1.0),
    KNeighborsClassifier(),
    RandomForestClassifier(max_features=None),
]
for clf_model in classifiers:
    accuracy = []  # one test-set accuracy per fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        clf_model.fit(X_train, y_train)
        accuracy.append(clf_model.score(X_test, y_test))
    # report the mean accuracy over the folds; the label names the scheme
    # that is actually used (k-fold, not leave-one-out)
    print(f"{kf.get_n_splits()}-fold cross-val accuracy for", clf_model, "is", "\n", np.mean(accuracy))