Naïve Bees

# used to change filepaths import os import matplotlib as mpl import matplotlib.pyplot as plt from IPython.display import display %matplotlib inline import pandas as pd import numpy as np # import Image from PIL from PIL import Image from skimage.feature import hog from skimage.color import rgb2gray from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA # import train_test_split from sklearn's model selection module from sklearn.model_selection import train_test_split # import SVC from sklearn's svm module from sklearn.svm import SVC # import accuracy_score from sklearn's metrics module from sklearn.metrics import roc_curve, auc, accuracy_score

# load the labels using pandas; the first CSV column becomes the index,
# which is later used as the image id
labels = pd.read_csv("datasets/labels.csv", index_col=0)

# show the first five rows of the dataframe using head
display(labels.head(5))


def get_image(row_id, root="datasets/"):
    """Load the image for ``row_id`` and return it as a numpy array.

    The image number is converted into the path ``<root>/<row_id>.jpg``,
    the file is opened with PIL and its pixel data is copied into an array.

    Args:
        row_id: Image identifier; formatted into the file name.
        root: Directory that contains the ``.jpg`` files.

    Returns:
        The image contents as a ``numpy`` array.
    """
    filename = "{}.jpg".format(row_id)
    file_path = os.path.join(root, filename)
    # Image.open is lazy; the context manager guarantees the underlying file
    # is closed as soon as the pixel data has been copied into the array.
    with Image.open(file_path) as img:
        return np.array(img)


# subset the dataframe to just Apis (genus is 0.0) and
# get the value of the sixth item in the index
apis_row = labels[labels.genus == 0.0].index[5]

# show the corresponding image of an Apis
plt.imshow(get_image(apis_row))
plt.show()

# subset the dataframe to just Bombus (genus is 1.0) and
# get the value of the sixth item in the index
bombus_row = labels[labels.genus == 1.0].index[5]

# show the corresponding image of a Bombus
plt.imshow(get_image(bombus_row))
plt.show()

# fetch the Bombus example picked in the previous cell as an array
bombus = get_image(bombus_row)

# report the dimensions of the color image
print('Color bombus image has shape: ', bombus.shape)

# collapse the color image to grayscale
gray_bombus = rgb2gray(bombus)

# draw the grayscale version using a gray colormap
plt.imshow(gray_bombus, cmap=mpl.cm.gray)

# report the dimensions again to compare against the color image
print('Grayscale bombus image has shape: ', gray_bombus.shape)

# compute the HOG descriptor of the grayscale bombus image; visualize=True
# makes hog() return a renderable image alongside the feature vector
hog_features, hog_image = hog(
    gray_bombus,
    pixels_per_cell=(16, 16),
    block_norm='L2-Hys',
    visualize=True,
)

# render the HOG visualization with a gray colormap
plt.imshow(hog_image, cmap=mpl.cm.gray)

def create_features(img):
    """Build a single flat feature vector for one image.

    The result is the flattened pixel values of ``img`` followed by the HOG
    descriptor computed on the grayscale conversion of ``img``.
    """
    # raw pixel values, flattened to one dimension
    pixel_values = img.flatten()

    # HOG descriptor is computed on the grayscale version of the image
    hog_descriptor = hog(
        rgb2gray(img),
        pixels_per_cell=(16, 16),
        block_norm='L2-Hys',
    )

    # pixel values first, HOG descriptor after, in one array
    return np.hstack((pixel_values, hog_descriptor))


bombus_features = create_features(bombus)

# print shape of bombus_features
bombus_features.shape

def create_feature_matrix(label_dataframe):
    """Stack the feature vectors of every image listed in a dataframe.

    Each value of ``label_dataframe.index`` is treated as an image id: the
    image is loaded with ``get_image`` and turned into a feature vector with
    ``create_features``. The per-image vectors are combined with ``np.array``.
    """
    per_image_features = [
        create_features(get_image(image_id))
        for image_id in label_dataframe.index
    ]
    # convert the list of arrays into a matrix
    return np.array(per_image_features)


# run create_feature_matrix on our dataframe of images
feature_matrix = create_feature_matrix(labels)

# hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    labels.genus.values,
    test_size=.3,
    random_state=1234123,
)

# wrap the training labels in a Series so the class counts can be inspected
y_train = pd.Series(y_train)
y_train.value_counts()

# report the dimensions of the raw training features
print('Training features matrix shape is: ', X_train.shape)

# learn the scaling statistics from the training features only
ss = StandardScaler().fit(X_train)

# apply the training-set scaling to both splits
train_stand = ss.transform(X_train)
test_stand = ss.transform(X_test)

# report the dimensions of the standardized feature matrices
print('Standardized training features matrix shape is: ', train_stand.shape)
print('Standardized test features matrix shape is: ', test_stand.shape)

# reduce the standardized features to 350 principal components
pca = PCA(n_components=350)

# fit the projection on the training split and project it in one step
X_train = pca.fit_transform(train_stand)

# project the test split with the components learned from training data
X_test = pca.transform(test_stand)

# report the reduced dimensions
print('Training features matrix is: ', X_train.shape)
print('Test features matrix is: ', X_test.shape)

# define a linear support vector classifier; probability=True is set so the
# fitted model can also produce class probabilities via predict_proba
svm = SVC(kernel='linear', probability=True, random_state=42)

# fit the model on the PCA-reduced training features
svm.fit(X_train, y_train)

# generate class predictions for the held-out features
y_pred = svm.predict(X_test)

# calculate accuracy; scikit-learn metrics take (y_true, y_pred) in that
# order. Accuracy is symmetric so the value is unaffected, but keeping the
# documented order avoids errors if the metric is ever swapped for an
# asymmetric one (precision, recall, ...).
accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy is: ', accuracy)

# predict class probabilities for X_test using predict_proba
probabilities = svm.predict_proba(X_test)

# select the probabilities for label 1.0 (second column)
y_proba = probabilities[:, 1]

# calculate false positive rate and true positive rate at different thresholds
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test, y_proba, pos_label=1
)

# calculate AUC: auc(x, y) integrates y over x, so the false positive rate
# (x axis of the ROC curve) must come first. Passing the rates in the
# opposite order integrates the mirrored curve and reports roughly 1 - AUC.
roc_auc = auc(false_positive_rate, true_positive_rate)

plt.title('Receiver Operating Characteristic')

# plot the false positive rate on the x axis and the true positive rate on
# the y axis, labelling the curve with its AUC
roc_plot = plt.plot(
    false_positive_rate,
    true_positive_rate,
    label='AUC = {:0.2f}'.format(roc_auc),
)
plt.legend(loc=0)

# dashed diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot([0, 1], [0, 1], ls='--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate');