from sklearn.datasets import fetch_openml
# Download the MNIST digits from OpenML (70,000 rows of 784 pixel values).
# as_frame=True pins the return type to pandas objects: the rest of this file
# relies on DataFrame accessors such as ``.iloc``, and without the flag the
# container type depends on the installed scikit-learn version.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)
mnist.keys()  # notebook-style inspection; produces no output when run as a script
images = mnist['data']  # one row per image, one column per pixel
print("Images array has the following shape: {}".format(images.shape))
print("One image has the following shape: {}".format(images.iloc[0].shape))
Images array has the following shape: (70000, 784)
One image has the following shape: (784,)
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's global random state so that the random image picks and the
# generated noise used throughout this file are repeatable between runs.
np.random.seed(0)
def plot_imgs(images):
    """Draw four randomly selected images next to each other.

    Rows of ``images`` are accessed positionally through ``.iloc`` and each
    selected row is reshaped to 28x28 before drawing, so every row must hold
    784 values. The rows are sampled with NumPy's global random state and the
    same row may be selected more than once.
    """
    n_panels = 4
    picks = np.random.choice(range(0, len(images)), n_panels)
    _, panels = plt.subplots(1, n_panels, sharey=True, figsize=(12, 4), dpi=200)
    for panel, row in zip(panels, picks):
        pixels = np.array(images.iloc[row]).reshape(28, 28)
        panel.imshow(pixels, cmap='binary')
    # Hide ticks and frames so only the digits are visible.
    for panel in panels:
        panel.axis('off')
# Preview four random images from the dataset.
plot_imgs(images)
# NOTE(review): `noise`, `y_train` and `y_test` are first assigned further down
# this file, so running the file top to bottom raises NameError on the lines
# below. The recorded output that follows also shows 1-D shapes for
# y_train / y_test, which does not match the 2-D image slices assigned later -
# presumably it was captured from an earlier notebook state; confirm the
# intended cell order.
print(noise.shape)
print(y_train.shape)
print(y_test.shape)
(60000, 784)
(60000,)
(10000,)
# The original images are the prediction targets; the model inputs are noisy
# copies of those same images.
y = images
y_train, y_test = y[:60000], y[60000:]  # first 60,000 rows train, the rest test

# Training inputs: every pixel gets an independent integer offset in [0, 500).
noise = np.random.randint(low=0, high=500, size=(len(y_train), 784))
X_train = y_train + noise

# Test inputs get freshly drawn noise so the model is evaluated on corruption
# it has not seen during fitting.
noise = np.random.randint(low=0, high=500, size=(len(y_test), 784))
X_test = y_test + noise
def plot_imgs_with_labels(images, labels):
    """Show four random (input, target) image pairs on a 2x4 grid.

    Each pair occupies two neighbouring columns: the row taken from ``images``
    on the left and the row at the same position in ``labels`` on the right.
    Both arguments are indexed with ``.iloc`` and every selected row is
    reshaped to 28x28. Rows are sampled with NumPy's global random state and
    may repeat. Only the top row of the grid is titled.
    """
    picks = np.random.choice(range(0, len(images)), 4)
    _, grid = plt.subplots(2, 4, figsize=(12, 8), sharey=True, dpi=200)

    for slot, row in enumerate(picks):
        # Two pairs per grid row: slots 0-1 fill the top row, 2-3 the bottom.
        grid_row, pair_in_row = divmod(slot, 2)
        left_col = 2 * pair_in_row
        for offset, source in enumerate((images, labels)):
            pixels = np.array(source.iloc[row]).reshape(28, 28)
            grid[grid_row, left_col + offset].imshow(pixels, cmap='binary')

    for panel in grid.flat:
        panel.axis('off')

    for col in range(4):
        caption = 'Dirty image' if col % 2 == 0 else 'Clean image'
        grid[0, col].set_title(caption)
# Visual sanity check: noisy training inputs beside the images they were
# derived from.
plot_imgs_with_labels(X_train, y_train)
from sklearn.neighbors import KNeighborsClassifier
# n_jobs=-1 asks scikit-learn to use all available processors for the
# neighbour search.
knn_clf = KNeighborsClassifier(n_jobs=-1)
# The targets are whole images (y_train is a slice of `images`), so this is a
# multi-output fit with one output per pixel column.
knn_clf.fit(X_train, y_train)
def plot_imgs_with_predictions(images, labels, model):
    """Plot eight random samples as (input, prediction, target) triplets.

    The figure is a 4x6 grid made of two side-by-side groups of three columns
    (input image, predicted image, true image). Samples 0-3 fill the rows of
    the left group and samples 4-7 the rows of the right group.

    Parameters
    ----------
    images : indexable with ``.iloc``; each row holds the 784 pixel values of
        one model input.
    labels : indexable with ``.iloc``; each row holds the 784 pixel values of
        the corresponding true image.
    model : object whose ``predict`` accepts a 2-D array with one sample per
        row and returns one 784-value prediction per sample.

    Rows are sampled with NumPy's global random state and may repeat.
    """
    n_rows, n_groups = 4, 2
    n_samples = n_rows * n_groups

    # Twelve indexes are drawn, exactly as before, so the global random stream
    # consumed by this function is unchanged; only the first eight are shown.
    indexes = np.random.choice(range(0, len(images)), 12)[:n_samples]

    inputs = np.array(images.iloc[indexes])
    targets = np.array(labels.iloc[indexes])
    # One batched call instead of one predict() call per displayed sample.
    predictions = np.asarray(model.predict(inputs))

    fig, axs = plt.subplots(n_rows, 3 * n_groups, figsize=(12, 8), sharey=True)
    for sample in range(n_samples):
        group, row = divmod(sample, n_rows)
        first_col = 3 * group
        # All three panels of a triplet use the SAME sample; previously the
        # last two rows of the right group showed the input and target of
        # sample 5 next to the predictions for samples 6 and 7.
        triplet = (inputs[sample], predictions[sample], targets[sample])
        for offset, pixels in enumerate(triplet):
            axs[row, first_col + offset].imshow(
                np.asarray(pixels).reshape(28, 28), cmap='binary'
            )

    # Column captions go on the top row only.
    for group in range(n_groups):
        first_col = 3 * group
        axs[0, first_col].set_title('input image')
        axs[0, first_col + 1].set_title('predicted image')
        axs[0, first_col + 2].set_title('true image')

    for ax in axs.flat:
        ax.axis('off')
# Compare noisy test inputs, model predictions and true images visually.
plot_imgs_with_predictions(X_test, y_test, knn_clf)
# Array copy of the test targets for plain positional indexing below.
y_test_np = np.array(y_test)
# Reference case: a prediction identical to its target gives an all-zero
# difference image.
perfect_prediction = y_test_np[0]
perfect_diff = y_test_np[0] - perfect_prediction
plt.imshow(perfect_diff.reshape(28, 28), cmap='binary')
plt.title('Perfect image prediction')
plt.show()
# Inspect one random test image in detail: target, noisy input, model output
# and the difference between target and output.
img = np.random.randint(0, len(y_test_np))  # position of a random test image

# Select only the needed row instead of converting the entire test frame to an
# array twice just to read a single image from it.
model_input = np.array(X_test.iloc[img])
predicted = knn_clf.predict([model_input])
diff = y_test_np[img] - predicted

fig, axs = plt.subplots(1, 4, figsize=(12, 3), sharey=True)
axs[0].imshow(y_test_np[img].reshape(28, 28), cmap='binary')
axs[0].set_title('Test image')
axs[1].imshow(model_input.reshape(28, 28), cmap='binary')
axs[1].set_title('model input')
axs[2].imshow(predicted.reshape(28, 28), cmap='binary')
axs[2].set_title('model output')
# Negative differences (prediction above the target value) are clipped to 0,
# so only pixels that the prediction under-estimates show up here.
axs[3].imshow(np.maximum(diff, 0).reshape(28, 28), cmap='binary')
axs[3].set_title('Difference')
plt.show()
from sklearn.metrics import mean_squared_error
# `predicted` was computed for test image `img`, so it has to be scored
# against that same image's target rather than against test image 0.
mse = mean_squared_error(y_test_np[img], predicted.flatten())  # flatten the single-sample prediction to 1-D
print("Root mean squared error is: {:.2f}".format(np.sqrt(mse)))
Root mean squared error is: 66.22