# Classifying birdsong audio

import matplotlib.pyplot as plt import numpy as np import librosa, librosa.display, librosa.feature import IPython.display as ipd import pandas as pd

# Mount Google Drive only when running inside Colab. The paths used below
# default to the local ./data directory, so outside Colab (e.g. VS Code) the
# missing google.colab package must not abort the script.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not in Colab; the Drive-based paths stay commented out
else:
    drive.mount('/content/gdrive')

# Pick one example recording and decode it.
bird_sound_file = "./data/audio_files/759808e5-f824-401e-9058.wav"
# bird_sound_file = "/content/gdrive/MyDrive/audio_files/759808e5-f824-401e-9058.wav"

# librosa.load returns the decoded samples together with the sample rate.
bird, bird_sr = librosa.load(bird_sound_file)

# Inline audio player (notebook cell output); may not work in vscode.
ipd.Audio(bird_sound_file)

# Notebook cell output: show the raw sample array.
bird

# Synthesize a pure 440 Hz sine tone as a reference signal for playback
# and plotting.
T = 1.5  # duration in seconds
sr = 22050  # sample rate
num_samples = int(T * sr)  # time * sample rate gives us number of samples
freq = 440

# Divide [0, T) into num_samples steps. endpoint=False keeps the spacing at
# exactly T / num_samples; including the endpoint would space the samples
# T / (num_samples - 1) apart and shift the played frequency slightly
# above `freq`.
t = np.linspace(0, T, num_samples, endpoint=False)
A_note = 0.3 * np.sin(2 * np.pi * freq * t)

# Inline audio player (notebook cell output).
ipd.Audio(A_note, rate=sr)

plt.plot(t, A_note)
plt.xlim((0, 4 / freq))  # plot 4 cycles of the sinewave
plt.show()

# visualize the waveforms
plt.figure(figsize=(15, 5))
# librosa 0.10 removed display.waveplot in favour of display.waveshow;
# use the new name when present and fall back to the old one otherwise.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
# Pass the real sample rate so the time axis does not rely on the 22050 Hz default.
draw_wave(bird, sr=bird_sr, alpha=0.5)

# Framing parameters shared by every frame-based feature below.
FRAME_SIZE = 1024  # samples per analysis frame
HOP_LENGTH = 512  # samples between the starts of consecutive frames

def amplitude_envelope(signal, frame_size, hop_length=0):
    """Return the per-frame maximum sample value of *signal*.

    Frames start every ``hop_length`` samples and span ``frame_size``
    samples; frames near the end of the signal are simply shorter.

    Args:
        signal: 1-D array-like of samples (lists are accepted as well as
            NumPy arrays).
        frame_size: Number of samples per frame; must be positive.
        hop_length: Step between frame starts. ``0`` (the default) means
            "use ``frame_size``", i.e. non-overlapping frames.

    Returns:
        A NumPy array with one maximum per frame (empty for an empty signal).

    Raises:
        ValueError: If ``frame_size`` is not positive or ``hop_length`` is
            negative.
    """
    if frame_size <= 0:
        raise ValueError("frame_size must be a positive number of samples")
    if hop_length < 0:
        raise ValueError("hop_length must be non-negative")
    if hop_length == 0:
        hop_length = frame_size

    # Normalise to an ndarray so slices always have a .max() method.
    samples = np.asarray(signal)
    frame_starts = range(0, len(samples), hop_length)
    return np.array(
        [samples[start:start + frame_size].max() for start in frame_starts]
    )

# Amplitude envelope of the recording: one maximum per frame.
bird_ae = amplitude_envelope(bird, FRAME_SIZE, HOP_LENGTH)

# Notebook cell output: index of the frame with the largest envelope value.
np.argmax(bird_ae)

# make the time vector
frames = range(0, bird_ae.size)
# Use the recording's own sample rate instead of librosa's 22050 Hz default.
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)

# visualize the waveform with the envelope drawn on top
plt.figure(figsize=(15, 10))
# plt.subplot(2,1,1)
# librosa 0.10 removed display.waveplot in favour of display.waveshow;
# use the new name when present and fall back to the old one otherwise.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
draw_wave(bird, sr=bird_sr, alpha=0.5)
# plt.subplot(2,1,2)
plt.plot(t, bird_ae, color="r")

# Root-mean-square energy per frame; [0] takes the first row of librosa's
# 2-D result.
bird_rms = librosa.feature.rms(
    y=bird, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH
)[0]
bird_rms  # notebook cell output

# make the time vector
frames = range(len(bird_rms))
# Use the recording's own sample rate instead of librosa's 22050 Hz default.
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)

# visualize the waveform with the RMS curve drawn on top
plt.figure(figsize=(15, 10))
# librosa 0.10 removed display.waveplot in favour of display.waveshow;
# use the new name when present and fall back to the old one otherwise.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
draw_wave(bird, sr=bird_sr, alpha=0.5)
plt.plot(t, bird_rms, color="r")

# Zero-crossing rate per frame; [0] takes the first row of librosa's
# 2-D result.
bird_zcr = librosa.feature.zero_crossing_rate(
    y=bird, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH
)[0]

# make the time vector
frames = range(len(bird_zcr))
# Use the recording's own sample rate instead of librosa's 22050 Hz default.
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)

# visualize the waveform with the zero-crossing rate drawn on top
plt.figure(figsize=(15, 10))
# librosa 0.10 removed display.waveplot in favour of display.waveshow;
# use the new name when present and fall back to the old one otherwise.
draw_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot
draw_wave(bird, sr=bird_sr, alpha=0.5)
plt.plot(t, bird_zcr, color="r")
plt.ylim([0, 1])

def freq_spectrum(signal, sr, freq_ratio=0.5):
    """Return the magnitude spectrum of *signal* with its frequency axis.

    Args:
        signal: 1-D sequence of samples.
        sr: Sample rate of *signal* in Hz.
        freq_ratio: Fraction of the full ``[0, sr)`` spectrum to keep. The
            default of 0.5 keeps the bins below the Nyquist frequency
            (``sr / 2``).

    Returns:
        A ``(freq_bins, magnitudes)`` pair of equal-length arrays.
    """
    magnitudes = np.abs(np.fft.fft(signal))
    n_bins = len(magnitudes)

    # FFT bin k sits at k * sr / N, so the axis must stop short of sr.
    # An endpoint-inclusive linspace would stretch every bin by N / (N - 1).
    freq_bins = np.linspace(0, sr, n_bins, endpoint=False)

    keep = int(n_bins * freq_ratio)
    return freq_bins[:keep], magnitudes[:keep]

# Magnitude spectrum of the whole recording, using freq_spectrum's default
# freq_ratio.
freq_bins, bird_mag_spectrum = freq_spectrum(signal=bird, sr=bird_sr)

# Notebook cell output: show the magnitudes.
bird_mag_spectrum

# Magnitude against frequency.
plt.plot(freq_bins, bird_mag_spectrum)

# Spectral centroid per frame. The sample rate is passed explicitly (as the
# bandwidth computation does) so the result does not silently assume
# librosa's 22050 Hz default.
bird_sc = librosa.feature.spectral_centroid(
    y=bird, sr=bird_sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH
)[0]

# make the time vector
frames = range(len(bird_sc))
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)

# visualize the centroid over time
plt.figure(figsize=(15, 10))
plt.plot(t, bird_sc, color="r")

# Spectral bandwidth per frame; [0] takes the first row of librosa's
# 2-D result.
bird_bw = librosa.feature.spectral_bandwidth(
    y=bird, sr=bird_sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH
)[0]

# make the time vector
frames = range(len(bird_bw))
# Same sample rate as the feature itself, rather than the 22050 Hz default.
t = librosa.frames_to_time(frames, sr=bird_sr, hop_length=HOP_LENGTH)

# visualize the bandwidth over time
plt.figure(figsize=(15, 10))
plt.plot(t, bird_bw, color="r")

def read_labels(path="./data/labels.csv"):
    """Read the labels CSV and return the ``hasbird`` column.

    Args:
        path: Location of the labels CSV. Defaults to the local copy; pass
            e.g. "/content/gdrive/MyDrive/labels.csv" when running from a
            mounted Google Drive.

    Returns:
        A boolean Series with ``itemid`` as index and ``hasbird`` as value.
    """
    df = pd.read_csv(path, index_col="itemid", dtype={"hasbird": bool})
    # we don't need the datasetid
    return df.hasbird

# Load the itemid -> hasbird lookup.
data_labels = read_labels()

# Notebook cell output: the label stored for the first item id.
data_labels.loc[data_labels.index[0]]

def featurize(signal, sr):
    """Turn an audio sample into a flat feature vector.

    The vector is, in order: the maximum spectral centroid, the mean
    zero-crossing rate, the mean RMS energy, and the spectral bandwidth
    sampled at 10 evenly spaced frames across the clip.

    Args:
        signal: Audio samples, as returned by ``librosa.load``.
        sr: Sample rate of *signal* in Hz.

    Returns:
        A 1-D NumPy array of 13 values (3 summary statistics followed by
        10 bandwidth samples).
    """
    # spectral centroid -- sr is passed explicitly, matching the bandwidth
    # call, so both frequency features use the same sample rate.
    sc = librosa.feature.spectral_centroid(
        y=signal, sr=sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH
    )[0]
    max_sc = np.max(sc)

    # bandwidth, sampled at evenly spaced frame indices so that clips of
    # different lengths contribute the same number of values
    bw = librosa.feature.spectral_bandwidth(
        y=signal, sr=sr, n_fft=FRAME_SIZE, hop_length=HOP_LENGTH
    )[0]
    num_windows = 10
    window_i = np.linspace(0, len(bw) - 1, num_windows, dtype=int)
    window_bw = bw[window_i]

    # zero crossing rate
    zcr = librosa.feature.zero_crossing_rate(
        y=signal, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH
    )[0]
    mean_zcr = np.mean(zcr)

    # root mean square
    rms = librosa.feature.rms(
        y=signal, frame_length=FRAME_SIZE, hop_length=HOP_LENGTH
    )[0]
    mean_rms = np.mean(rms)

    return np.hstack((max_sc, mean_zcr, mean_rms, window_bw))

# Build the feature matrix X and the label vector Y with a plain loop.
# Timing notes: ~9s for 20 samples, ~46s for all 100 samples.
features = []
labels = []
for audio_id in data_labels[:20].index:
    audio_file = f"./data/audio_files/{audio_id}.wav"
    # audio_file = f"/content/gdrive/MyDrive/audio_files/{audio_id}.wav"
    audio, audio_sr = librosa.load(audio_file)  # samples and sample rate
    features.append(featurize(audio, audio_sr))
    labels.append(data_labels[audio_id])

X = np.asarray(features)
Y = np.asarray(labels)
print(X.shape, Y.shape)

# the pandas / numpy way but wrong dimensions? X is a numpy array of numpy arrays
# def process_audio(row):
#     audio_id = row.itemid
#     audio_file = f"/content/gdrive/MyDrive/audio_files/{audio_id}.wav"
#     audio,audio_sr = librosa.load(audio_file) # file and samplerate
#     x = featurize(audio,audio_sr)
#     y = row.hasbird
#     return x
# or the .to_numpy() may be causing problems...
# X = data_labels[:20].reset_index().apply(process_audio, axis=1).to_numpy()
# Y = data_labels[:20].values

# evaluate different models
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import LeaveOneOut, KFold

# Shuffled 3-fold split. The seed fixes the fold assignment so that all
# classifiers are scored on the same folds on every run.
kf = KFold(n_splits=3, shuffle=True, random_state=0)

classifiers = [
    SVC(kernel='poly'),
    LinearSVC(loss='hinge', random_state=0, C=1.0),
    KNeighborsClassifier(),
    RandomForestClassifier(max_features=None),
]

for clf_model in classifiers:
    # Held-out accuracy of this classifier on each fold.
    fold_scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        # print(X_train)
        clf_model.fit(X_train, y_train)
        fold_scores.append(clf_model.score(X_test, y_test))
    # The label states the scheme actually used (k-fold, not leave-one-out).
    print(f"{kf.n_splits}-fold cross-val accuracy for", clf_model, "is", "\n", np.mean(fold_scores))