import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Load the data set and take a first look at its contents.
df = pd.read_csv('data.csv')
df.head(3)
df.info()

# One histogram per column in a 2x2 grid: (column, bar colour, panel title).
histogram_specs = [
    ('Moderate Positive Skew', 'blue', 'Positive Bias Moderate (right bias)'),
    # This panel previously reused the "Moderate" title from the first panel.
    ('Highly Positive Skew', 'c', 'High Positive Bias (right bias)'),
    ('Moderate Negative Skew', 'indigo', 'Negative Bias Moderate (left bias)'),
    ('Highly Negative Skew', 'tab:brown', 'High Negative Bias (left bias)'),
]

fig, axes = plt.subplots(2, 2, figsize=(8, 7))
fig.suptitle('Data Set Graph', fontsize=16)
# axes.flat walks the grid row by row, matching the order of the specs above.
for ax, (column, color, title) in zip(axes.flat, histogram_specs):
    sns.histplot(ax=ax, data=df[column], color=color, alpha=0.5)
    ax.set_title(title)
fig.tight_layout()
plt.show()

# Skewness of every column.
df.skew()
def diagnostic_graph(data_set, variable, bins=27):
    """Show a histogram with KDE and a normal Q-Q plot side by side.

    Args:
        data_set: Table holding the data. It is handed to ``sns.histplot``
            as ``data`` and indexed as ``data_set[variable]`` for the
            Q-Q plot.
        variable: Name of the column to diagnose; also used as the title
            of the histogram panel.
        bins: Number of histogram bins. Defaults to 27, the value that
            used to be hard-coded.
    """
    plt.figure(figsize=(15, 6))

    # Left panel: distribution of the variable with a KDE overlay.
    plt.subplot(1, 2, 1)
    plt.title(variable)
    sns.histplot(data=data_set, x=variable, kde=True, bins=bins)

    # Right panel: probability plot of the variable against dist='norm'.
    plt.subplot(1, 2, 2)
    stats.probplot(data_set[variable], dist='norm', plot=plt)
    # Title is set after probplot so this is the one left on the panel.
    plt.title('Q-Q graph')

    plt.show()
# Baseline diagnostics for the moderately right-skewed column.
diagnostic_graph(df, 'Moderate Positive Skew')

# Transformed columns are added to a copy so ``df`` itself is left unchanged.
df_transformed = df.copy()

# --- Square-root transformation ---
moderate_positive = df['Moderate Positive Skew']
df_transformed['Sqrt_transformation'] = np.sqrt(moderate_positive)
df_transformed.head(3)
diagnostic_graph(df_transformed, 'Sqrt_transformation')

# --- Log transformation: "before" plot, transform, "after" plot ---
diagnostic_graph(df_transformed, 'Highly Positive Skew')
# NOTE(review): np.log needs strictly positive input; presumably this
# column has no zeros or negatives -- confirm against the data.
highly_positive = df['Highly Positive Skew']
df_transformed['Log_transformation'] = np.log(highly_positive)
df_transformed.head(3)
diagnostic_graph(df_transformed, 'Log_transformation')

# --- Cube-root transformation: "before" plot, transform, "after" plot ---
diagnostic_graph(df_transformed, 'Moderate Positive Skew')
# The column is labelled 'Cubic_transformation' but holds the cube root.
df_transformed['Cubic_transformation'] = np.cbrt(moderate_positive)
diagnostic_graph(df_transformed, 'Cubic_transformation')
from sklearn.preprocessing import PowerTransformer
df_pt = df.copy()

# Yeo-Johnson power transformer, fitted on every column of the copy.
p_scaler = PowerTransformer(method='yeo-johnson')

# Wrap the transformed values in a DataFrame, labelling the columns with
# the names of the frame that was transformed rather than a hand-typed
# list, so labels cannot drift from the data.
df_yjt = pd.DataFrame(p_scaler.fit_transform(df_pt), columns=df_pt.columns)

# Transformed data set
df_yjt.head()
df_yjt.describe()

# Same column before and after the transformation.
diagnostic_graph(df_pt, 'Moderate Negative Skew')
diagnostic_graph(df_yjt, 'Moderate Negative Skew')
# Box-Cox transformer, without standardising the output.
pt = PowerTransformer(method='box-cox', standardize=False)

# Only positive data: just the first two columns are transformed.
box_cox_columns = ['Moderate Positive Skew', 'Highly Positive Skew']
# Fit the Box-Cox transformer itself. The previous code called
# ``p_scaler.fit_transform`` here, which applied Yeo-Johnson instead and
# left ``pt`` unused.
df_bxcx = pd.DataFrame(
    pt.fit_transform(df_pt.iloc[:, 0:2]),
    columns=box_cox_columns,
)
df_bxcx.head(3)

# Same column before and after the transformation.
diagnostic_graph(df_pt, 'Highly Positive Skew')
diagnostic_graph(df_bxcx, 'Highly Positive Skew')