import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from sklearn.datasets import make_classification
# Generate a synthetic imbalanced binary dataset: 1000 samples with 10
# features (4 informative, 2 redundant), a 95/5 class split, no label
# noise (flip_y=0), and a fixed seed for reproducibility.
X, y = make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=4,
    n_redundant=2,
    n_classes=2,
    weights=[0.95, 0.05],
    flip_y=0,
    random_state=42,
)
# Function to plot the class distribution
def plot_distribution(y, title):
counts = Counter(y)
plt.figure(figsize=(8, 5))
sns.barplot(x=list(counts.keys()), y=list(counts.values()))
plt.title(title)
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.xticks(list(counts.keys()))
for i, count in enumerate(counts.values()):
plt.text(i, count + 10, str(count), ha='center')
plt.show()
# Visualize the original distribution
# Counter(y) reports the per-class frequencies; presumably ~950/50 given
# the weights=[0.95, 0.05] used above -- confirm against the actual output.
print(f"Original dataset shape: {Counter(y)}")
plot_distribution(y, 'Before Resampling: Original Class Distribution')
# Run to view results
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
# Run to view results
# Plot original distribution for comparison
plot_distribution(y, 'Before: Original Class Distribution')
# Apply Random Oversampling
# Duplicates minority-class samples at random until the classes are
# balanced (imblearn's default sampling strategy); seeded for reproducibility.
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)
print(f"Shape after Random Oversampling: {Counter(y_ros)}")
plot_distribution(y_ros, 'After: Random Oversampling')
# Run to view results
# Plot original distribution for comparison
plot_distribution(y, 'Before: Original Class Distribution')
# Apply SMOTE
# SMOTE synthesizes new minority samples by interpolating between existing
# minority neighbors, rather than duplicating rows like RandomOverSampler.
smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)
print(f"Shape after SMOTE: {Counter(y_smote)}")
plot_distribution(y_smote, 'After: SMOTE')
# Run to view results
# Plot original distribution for comparison
plot_distribution(y, 'Before: Original Class Distribution')
# Apply ADASYN
# ADASYN is an adaptive variant of SMOTE: it generates more synthetic
# samples in minority regions that are harder to learn, so the resulting
# class counts may not be exactly equal.
adasyn = ADASYN(random_state=42)
X_adasyn, y_adasyn = adasyn.fit_resample(X, y)
print(f"Shape after ADASYN: {Counter(y_adasyn)}")
plot_distribution(y_adasyn, 'After: ADASYN')
# Run to view results
# Plot original distribution for comparison
plot_distribution(y, 'Before: Original Class Distribution')
# Apply Random Undersampling
# Discards majority-class samples at random down to the minority count --
# balances the classes at the cost of throwing away training data.
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)
print(f"Shape after Random Undersampling: {Counter(y_rus)}")
plot_distribution(y_rus, 'After: Random Undersampling')
# Run to view results
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import classification_report
# Split data into training and testing sets
# stratify=y preserves the 95/5 class ratio in both train and test splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
# --- Model 1: No balancing (Baseline) ---
# Plain logistic regression on the imbalanced data; expected to favor the
# majority class -- compare its minority-class recall with the models below.
print("--- Baseline Model (No Balancing) ---")
base_model = LogisticRegression(solver='liblinear', random_state=42)
base_model.fit(X_train, y_train)
y_pred_base = base_model.predict(X_test)
print(classification_report(y_test, y_pred_base))
# --- Model 2: Using SMOTE + Logistic Regression ---
# NOTE: SMOTE is applied to the *training split only*, so the test set keeps
# its original imbalance -- this is the correct order; resampling before the
# split would leak synthetic samples into the evaluation.
print("--- SMOTE + Logistic Regression ---")
smote_pipe = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote_pipe.fit_resample(X_train, y_train)
smote_model = LogisticRegression(solver='liblinear', random_state=42)
smote_model.fit(X_train_smote, y_train_smote)
y_pred_smote = smote_model.predict(X_test)
print(classification_report(y_test, y_pred_smote))
# --- Model 3: Using BalancedRandomForestClassifier ---
# Ensemble that handles imbalance internally (per-tree balanced bootstrap),
# so no explicit resampling step is needed before fitting.
print("--- Balanced Random Forest Classifier ---")
brf = BalancedRandomForestClassifier(random_state=42)
brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)
print(classification_report(y_test, y_pred_brf))
# Run to view results
from IPython.display import IFrame
# Embed a SlideShare deck. This renders only in a Jupyter/IPython frontend
# (as the last expression of a notebook cell); in a plain script the bare
# expression is evaluated and discarded with no visible effect.
IFrame(src="https://www.slideshare.net/slideshow/embed_code/key/2EUHY79COopc3N?hostedIn=slideshare&page=upload", width=476, height=400)
# Run to view results