Tutorial 2 U18CO084
import numpy as np
import seaborn as sns
import os
import warnings
import pandas as pd
import matplotlib.pyplot as plt
# Read the tutorial dataset into a DataFrame, then compute summary statistics
# for it (the summary is only displayed when run as a notebook cell).
df = pd.read_csv(filepath_or_buffer='/work/Histograms.csv')
df.describe()
Data Cleaning: Filling in missing values — single-value imputation (mean)
# Mean imputation: replace missing entries of column 'A' with the column mean.
mean_val = df['A'].mean()
# NOTE: plain assignment binds a second name to the same DataFrame (no copy is
# made), so the fill below is also visible through `df`. Use `df.copy()` here
# if an independent frame is wanted.
df_mean = df
# Assign the filled column back rather than calling fillna(inplace=True) on the
# `df_mean['A']` selection: with pandas Copy-on-Write the chained in-place form
# only updates a temporary Series and leaves the frame unchanged.
df_mean['A'] = df_mean['A'].fillna(value=mean_val)
# Per-column count of values that are still missing.
df_mean.isna().sum()
Data Cleaning: Filling in missing values — single-value imputation (median)
# Median imputation: replace missing entries of column 'B' with the column median.
median_val = df['B'].median()
# NOTE: plain assignment binds a second name to the same DataFrame (no copy is
# made), so the fill below is also visible through `df`. Use `df.copy()` here
# if an independent frame is wanted.
df_median = df
# Assign the filled column back rather than calling fillna(inplace=True) on the
# `df_median['B']` selection: with pandas Copy-on-Write the chained in-place
# form only updates a temporary Series and leaves the frame unchanged.
df_median['B'] = df_median['B'].fillna(value=median_val)
# Per-column count of values that are still missing.
df_median.isna().sum()
Filling in missing values: single-value imputation — mode
# Mode imputation: replace missing entries of column 'C' with its most
# frequent value.
# value_counts() orders its result by descending frequency, so the first index
# label is the most common non-missing value; this reads it directly instead of
# round-tripping through dict -> keys -> list.
mode_val = df['C'].value_counts().index[0]
# NOTE: plain assignment binds a second name to the same DataFrame (no copy is
# made), so the fill below is also visible through `df`. Use `df.copy()` here
# if an independent frame is wanted.
df_mode = df
# Assign the filled column back rather than calling fillna(inplace=True) on the
# `df_mode['C']` selection: with pandas Copy-on-Write the chained in-place form
# only updates a temporary Series and leaves the frame unchanged.
df_mode['C'] = df_mode['C'].fillna(value=mode_val)
# Per-column count of values that are still missing.
df_mode.isna().sum()
Filling in missing values: global constant replacement — let the replacement value be 40
# Global-constant imputation: replace missing entries of column 'D' with 40.
# NOTE: plain assignment binds a second name to the same DataFrame (no copy is
# made), so the fill below is also visible through `df`. Use `df.copy()` here
# if an independent frame is wanted.
df_glob = df
# Assign the filled column back rather than calling fillna(inplace=True) on the
# `df_glob['D']` selection: with pandas Copy-on-Write the chained in-place form
# only updates a temporary Series and leaves the frame unchanged.
df_glob['D'] = df_glob['D'].fillna(value=40)
# Per-column count of values that are still missing.
df_glob.isna().sum()
Imputation of missing values - KNN Imputation
from sklearn.impute import KNNImputer

# Impute every column except the last with a 5-neighbour KNN imputer; the last
# column is set aside before fitting and re-attached unchanged afterwards.
impu = KNNImputer(n_neighbors=5)
data = df.to_numpy()
target = data[:, -1]
data = impu.fit_transform(data[:, :-1])
# column_stack turns the 1-D `target` into a trailing column of the 2-D result.
final_data = np.column_stack((data, target))
imputation_knn_df = pd.DataFrame(final_data, columns=df.columns)
# Remaining missing-value counts and summary statistics of the imputed frame.
imputation_knn_df.isna().sum()
imputation_knn_df.describe()
# Outlier detection and removal
sns.set_style('whitegrid')
# Histogram of column 'A' with 25 bins and no KDE overlay. histplot is the
# replacement seaborn provides for the deprecated distplot, which is scheduled
# for removal.
sns.histplot(df['A'], kde=False, color='red', bins=25)