# used to change filepaths
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
from IPython.display import display
%matplotlib inline
import pandas as pd
import numpy as np
# import Image from PIL
from PIL import Image
from skimage.feature import hog
from skimage.color import rgb2gray
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# import train_test_split from sklearn's model selection module
from sklearn.model_selection import train_test_split
# import SVC from sklearn's svm module
from sklearn.svm import SVC
# import roc_curve, auc and accuracy_score from sklearn's metrics module
from sklearn.metrics import roc_curve, auc, accuracy_score
# Read the label table; the first CSV column becomes the dataframe index
# (the index values are later used as image ids).
labels = pd.read_csv("datasets/labels.csv", index_col=0)
# Preview the top of the table; head() returns five rows by default.
display(labels.head())
def get_image(row_id, root="datasets/"):
    """
    Load the image that belongs to ``row_id`` and return it as a numpy array.

    The image is read from ``<root>/<row_id>.jpg``.

    Args:
        row_id: Identifier used to build the file name ``"{row_id}.jpg"``.
        root: Directory that contains the image files.

    Returns:
        The image converted to a numpy array via ``np.array``.

    Raises:
        Whatever ``PIL.Image.open`` raises when the file is missing or is
        not a readable image.
    """
    file_path = os.path.join(root, "{}.jpg".format(row_id))
    # Image.open is lazy and keeps the underlying file open. The context
    # manager guarantees the handle is released once the pixel data has been
    # copied into the array, even if the conversion fails.
    with Image.open(file_path) as img:
        return np.array(img)
# subset the dataframe to just Apis (genus is 0.0) and take the sixth index value
apis_row = labels[labels.genus == 0.0].index[5]
# show the corresponding image of an Apis
plt.imshow(get_image(apis_row))
plt.show()

# subset the dataframe to just Bombus (genus is 1.0) and take the sixth index value
bombus_row = labels[labels.genus == 1.0].index[5]
# show the corresponding image of a Bombus
plt.imshow(get_image(bombus_row))
plt.show()

# load the same Bombus image again so we can work with the raw array
bombus = get_image(bombus_row)
# print the shape of the bombus image
print('Color bombus image has shape: ', bombus.shape)

# convert the bombus image to grayscale
gray_bombus = rgb2gray(bombus)
# show the grayscale image
plt.imshow(gray_bombus, cmap=mpl.cm.gray)
# grayscale bombus image only has one channel
print('Grayscale bombus image has shape: ', gray_bombus.shape)
# render this figure now, like the two figures above; without it the next
# imshow call draws over the grayscale image on the same axes
plt.show()

# run HOG on the grayscale bombus image; visualize=True also returns an
# image of the gradient histograms
hog_features, hog_image = hog(
    gray_bombus,
    visualize=True,
    block_norm='L2-Hys',
    pixels_per_cell=(16, 16),
)
# show our hog_image with a gray colormap
plt.imshow(hog_image, cmap=mpl.cm.gray)
# flush the figure so later plots start on fresh axes
plt.show()
def create_features(img):
    """
    Build a single flat feature vector for one image.

    The vector is the flattened pixel values of ``img`` followed by the HOG
    descriptor computed on the grayscale version of ``img``.

    Args:
        img: Image as a numpy array (expected to be a three channel color
            image, since it is passed to ``rgb2gray``).

    Returns:
        One array holding the color features and then the HOG features.
    """
    # HOG descriptor of the grayscale image
    hog_descriptor = hog(
        rgb2gray(img),
        block_norm='L2-Hys',
        pixels_per_cell=(16, 16),
    )
    # raw pixel values first, HOG features after
    return np.hstack((img.flatten(), hog_descriptor))
bombus_features = create_features(bombus)
# print shape of bombus_features; a bare expression is discarded unless it
# is the last line of a notebook cell, so print it explicitly
print(bombus_features.shape)
def create_feature_matrix(label_dataframe):
    """
    Compute the features of every image listed in ``label_dataframe``.

    Each index value of the dataframe is treated as an image id: the image
    is loaded with ``get_image`` and turned into a feature vector with
    ``create_features``.

    Args:
        label_dataframe: Dataframe whose index holds the image ids.

    Returns:
        A numpy array built from the per-image feature vectors, in index
        order.
    """
    per_image_features = [
        create_features(get_image(img_id))
        for img_id in label_dataframe.index
    ]
    # convert the list of arrays into a matrix
    return np.array(per_image_features)
# run create_feature_matrix on our dataframe of images
feature_matrix = create_feature_matrix(labels)

# split the data into training and test sets (30% held out, fixed seed so
# the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    labels.genus.values,
    test_size=.3,
    random_state=1234123,
)

# look at the distribution of labels in the train set; wrapped in display()
# because a bare expression is discarded unless it ends a notebook cell
y_train = pd.Series(y_train)
display(y_train.value_counts())

# get shape of our training features
print('Training features matrix shape is: ', X_train.shape)
# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both splits.
ss = StandardScaler().fit(X_train)
train_stand = ss.transform(X_train)
test_stand = ss.transform(X_test)

# Report the shapes of the standardized matrices.
print('Standardized training features matrix shape is: ', train_stand.shape)
print('Standardized test features matrix shape is: ', test_stand.shape)
# Instantiate a PCA object with 350 components. PCA rejects n_components
# larger than min(n_samples, n_features), so cap the value at what the
# training matrix supports instead of failing on smaller datasets.
# random_state only matters for the randomized/arpack solvers; it is set so
# this step is seeded like the split and the classifier.
pca = PCA(n_components=min(350, *train_stand.shape), random_state=42)
# fit on the standardized training features and project them
X_train = pca.fit_transform(train_stand)
# project the standardized test features with the components learned above
X_test = pca.transform(test_stand)
# look at new shape
print('Training features matrix is: ', X_train.shape)
print('Test features matrix is: ', X_test.shape)
# define support vector classifier; probability=True is required for the
# predict_proba call used later for the ROC curve
svm = SVC(kernel='linear', probability=True, random_state=42)
# fit model
svm.fit(X_train, y_train)
# generate predictions
y_pred = svm.predict(X_test)
# calculate accuracy; arguments follow the documented (y_true, y_pred) order
accuracy = accuracy_score(y_test, y_pred)
print('Model accuracy is: ', accuracy)
# predict probabilities for X_test using predict_proba
probabilities = svm.predict_proba(X_test)
# select the probabilities for label 1.0
# NOTE(review): column 1 is the second entry of svm.classes_; this assumes
# the classes are ordered [0.0, 1.0] -- confirm against the label values
y_proba = probabilities[:, 1]
# calculate false positive rate and true positive rate at different thresholds
false_positive_rate, true_positive_rate, thresholds = roc_curve(
    y_test, y_proba, pos_label=1
)
# calculate AUC: auc(x, y) integrates y over x, so the false positive rate
# (x axis of the ROC curve) must come first
roc_auc = auc(false_positive_rate, true_positive_rate)
plt.title('Receiver Operating Characteristic')
# plot the false positive rate on the x axis and the true positive rate on the y axis
roc_plot = plt.plot(
    false_positive_rate,
    true_positive_rate,
    label='AUC = {:0.2f}'.format(roc_auc),
)
plt.legend(loc=0)
# diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], ls='--')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
# render the figure explicitly, as done for the earlier plots
plt.show()