# Stratified Sampling
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
def display_distribution(dataset):
    """
    Displays the distribution of dogs and cats in the supplied dataset.

    Labels equal to 0 are counted as dogs and labels equal to 1 as cats.
    Any other label is left out of both counts but still contributes to
    the total used for the percentages.

    Args:
        dataset: Array-like of numeric labels.

    Returns:
        None. The summary line is printed to standard output.
    """
    labels = np.asarray(dataset)
    # Use the element count rather than len(): it matches what is being
    # counted even when the input has more than one dimension.
    total = labels.size

    dogs = int(np.count_nonzero(labels == 0))
    cats = int(np.count_nonzero(labels == 1))

    # An empty dataset has no meaningful ratio; report 0% instead of
    # raising ZeroDivisionError.
    percentage_of_dogs = dogs / total * 100 if total else 0.0
    percentage_of_cats = cats / total * 100 if total else 0.0

    print(
        "Dataset distribution: "
        f"{dogs} dogs ({percentage_of_dogs:.2f}%). "
        f"{cats} cats ({percentage_of_cats:.2f}%). "
    )
# Deliberately imbalanced toy dataset: label 0 is a dog, label 1 is a cat.
DOGS = 80
CATS = 20

# DOGS zeros followed by CATS ones, as a single 1-D integer array.
dataset = np.repeat([0, 1], [DOGS, CATS])
display_distribution(dataset)

# Plain random split: the class ratio in each part is left to chance.
train, test = train_test_split(dataset, test_size=0.2)
display_distribution(train)
display_distribution(test)

# Stratified split: the labels themselves are passed as the stratification
# target so that both parts keep the dataset's class ratio.
train, test = train_test_split(
    dataset,
    test_size=0.2,
    stratify=dataset,
)
display_distribution(train)
display_distribution(test)