import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the diabetes CSV into a pandas DataFrame; `dataset` is the table used by
# every later step in this script.
dataset= pd.read_csv('/work/diabetes.csv')
# First six rows of the table.
# NOTE(review): as a bare expression this is only displayed when it is the last
# statement of a notebook cell; cell boundaries are not visible here - confirm.
dataset.head(6)
# Print the DataFrame summary (columns, non-null counts, dtypes, memory usage).
dataset.info()
# Draw one histogram per column on a 12x12 inch figure, then render it.
dataset.hist(figsize=(12,12))
plt.show()
# Import scipy.stats (for probplot) and pylab, used to generate the Q-Q plots
import scipy.stats as stats
import pylab
# Feature names: every column label of `dataset`, collected into a plain list
columns = [*dataset.columns.values]
columns
def make_qqplot(data, features):
    """Draw a normal Q-Q (probability) plot for each requested column.

    For every name in ``features`` the name is printed in bold (ANSI escape
    codes), then ``scipy.stats.probplot`` plots that column of ``data``
    against the quantiles of a normal distribution and the figure is shown.

    Args:
        data: Table indexable by column name (e.g. a pandas DataFrame).
        features: Iterable of column names in ``data`` to plot.

    Returns:
        None. The output is the printed headings and the displayed figures.

    Raises:
        KeyError: If a name in ``features`` is not a column of ``data``.
    """
    for value in features:
        print(f"\033[1m{value}\033[0m\n")
        # Read from the `data` argument, not the notebook-level `dataset`
        # global, so the function works for whatever table the caller passes.
        stats.probplot(data[value], dist="norm", plot=pylab)
        plt.show()
# Draw a normal Q-Q plot for every feature listed in `columns`
make_qqplot(dataset,columns)
# Import the Shapiro-Wilk normality test from SciPy
from scipy.stats import shapiro
def normtest_shapiro(data, features, alpha=.05):
    """Report, per column, the outcome of the Shapiro-Wilk normality test.

    For each name in ``features`` the p-value of ``scipy.stats.shapiro`` on
    that column of ``data`` is compared with ``alpha`` and one line is
    printed: the position of the feature in ``features``, its name in bold
    (ANSI escape codes) and the verdict.

    The null hypothesis of the test is that the sample comes from a normal
    distribution, so "is normally distributed" means only that the test did
    not reject normality at the chosen significance level.

    Args:
        data: Table indexable by column name (e.g. a pandas DataFrame).
        features: Iterable of column names in ``data`` to test.
        alpha: Significance level; normality is rejected when
            ``pvalue <= alpha``. Defaults to 0.05.

    Returns:
        None. The verdicts are printed.

    Raises:
        KeyError: If a name in ``features`` is not a column of ``data``.
    """
    # Iterate the `features` argument and read from `data`; the function must
    # not depend on the notebook-level `columns` / `dataset` globals.
    for index, value in enumerate(features):
        pval = shapiro(data[value]).pvalue
        if pval > alpha:
            verdict = "is normally distributed"
        elif pval <= alpha:
            # Includes pval == alpha, which previously printed nothing.
            verdict = "is Not normally distributed"
        else:
            # Both comparisons are False only for a NaN p-value.
            verdict = "could not be tested (p-value is NaN)"
        print(index, f"\033[1m{value}\033[0m {verdict}")
# Run the Shapiro-Wilk check for every feature listed in `columns`
normtest_shapiro(dataset,columns)
# Import the D'Agostino-Pearson omnibus normality test from SciPy
from scipy.stats import normaltest
def normtest_dAgostino(data, features, alpha=.05):
    """Report, per column, the outcome of D'Agostino-Pearson's normality test.

    For each name in ``features`` the p-value of ``scipy.stats.normaltest``
    on that column of ``data`` is compared with ``alpha`` and one line is
    printed: the position of the feature in ``features``, its name in bold
    (ANSI escape codes) and the verdict.

    The null hypothesis of the test is that the sample comes from a normal
    distribution, so "is normally distributed" means only that the test did
    not reject normality at the chosen significance level.

    Args:
        data: Table indexable by column name (e.g. a pandas DataFrame).
        features: Iterable of column names in ``data`` to test.
        alpha: Significance level; normality is rejected when
            ``pvalue <= alpha``. Defaults to 0.05.

    Returns:
        None. The verdicts are printed.

    Raises:
        KeyError: If a name in ``features`` is not a column of ``data``.
    """
    # Iterate the `features` argument and read from `data`; the function must
    # not depend on the notebook-level `columns` / `dataset` globals.
    for index, value in enumerate(features):
        pval = normaltest(data[value]).pvalue
        if pval > alpha:
            verdict = "is normally distributed"
        elif pval <= alpha:
            # Includes pval == alpha, which previously printed nothing.
            verdict = "is Not normally distributed"
        else:
            # Both comparisons are False only for a NaN p-value (normaltest
            # propagates NaN input values by default).
            verdict = "could not be tested (p-value is NaN)"
        print(index, f"\033[1m{value}\033[0m {verdict}")
# Run the D'Agostino-Pearson check for every feature listed in `columns`
normtest_dAgostino(dataset,columns)