#Importing useful libraries
import pandas as pd
import numpy as np
from google.colab import files
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
import pandas.util.testing as tm
# Load the GI/GL table from the CSV export in the working directory.
df = pd.read_csv('GI_GL_corr (1).csv')
# Records with at least one missing value (rendered when this expression is
# the last statement of a notebook cell).
df.loc[df.isna().any(axis=1)]
# Keep only the complete records for the rest of the analysis.
df1 = df.dropna(axis=0, how='any')
# Load helper functions by executing the Utility_tools notebook with the
# IPython %run magic (this line is only valid inside IPython/Jupyter).
# NOTE(review): reduce_memory_usage is presumably defined by that notebook — confirm.
%run Utility_tools.ipynb
# The return value is discarded here.
# NOTE(review): presumably the helper changes df1's column dtypes in place and
# prints the before/after memory figures — confirm against Utility_tools.
reduce_memory_usage(df1)
Memory usage before: 0.05 MB
Memory usage now : 0.03 MB
Memory usage decreased by 37.5%
# Correlation coefficients: Pearson and Spearman matrices, displayed together.
# Only numeric/boolean columns are correlated. pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead, while
# older versions dropped them implicitly, so selecting them explicitly gives
# the same matrices on both.
# NOTE(review): the first column looks like a non-numeric label (every loop
# in this notebook skips it) — confirm.
df1_numeric = df1.select_dtypes(include=['number', 'bool'])
display(
    df1_numeric.corr(method='pearson'),
    df1_numeric.corr(method='spearman'),
)
# Visual normality check: one Q-Q plot per column of df1, skipping the first
# column.
for column in df1.columns[1:]:
    # line='45' draws the 45-degree reference line; fit=True lets statsmodels
    # fit the distribution parameters before forming the quantiles.
    figure = qqplot(df1[column], fit=True, line='45')
    axes = plt.gca()
    figure.set_size_inches(15, 8)
    axes.set_ylabel(f'Sample Quantiles of the {column} column', fontsize=13)
    axes.set_xlabel('Theoretical Quantiles', fontsize=13)
    plt.show()
# Shapiro-Wilk normality test for every column of df1 after the first one;
# only the p-value (second element of the result) is reported.
from scipy.stats import shapiro
for column in df1.columns[1:]:
    p_value = shapiro(df1[column])[1]
    print(f'P-value for {column} column: {p_value}')
P-value for glycemic_index(GI) column: 6.483703600679291e-06
P-value for glycemic_load column: 4.006680958337093e-30
# Scatter plot of glycemic load (y) against glycemic index (x) for df1.
sns.scatterplot(
    data=df1,
    x='glycemic_index(GI)',
    y='glycemic_load',
)
plt.show()
# Repeat the analysis only for the products with a low GI and a high GL.
# Cut-offs used for the subset: GI <= 55 is "low", GL >= 20 is "high".
LOW_GI_MAX = 55
HIGH_GL_MIN = 20
is_low_gi = df1['glycemic_index(GI)'] <= LOW_GI_MAX
is_high_gl = df1['glycemic_load'] >= HIGH_GL_MIN
low_gi_high_gl = df1.loc[is_low_gi & is_high_gl]
# Correlate numeric/boolean columns only. pandas >= 2.0 raises on non-numeric
# columns in DataFrame.corr(), whereas older versions dropped them silently,
# so the resulting matrix is the same on both.
low_gi_high_gl.select_dtypes(include=['number', 'bool']).corr()
#The correlation has considerably weakened
# Visual normality check on the low-GI / high-GL subset: one Q-Q plot per
# column, skipping the first column.
for column in low_gi_high_gl.columns[1:]:
    # line='45' draws the 45-degree reference line; fit=True lets statsmodels
    # fit the distribution parameters before forming the quantiles.
    figure = qqplot(low_gi_high_gl[column], fit=True, line='45')
    axes = plt.gca()
    figure.set_size_inches(15, 8)
    axes.set_ylabel(f'Sample Quantiles of the {column} column', fontsize=13)
    axes.set_xlabel('Theoretical Quantiles', fontsize=13)
    plt.show()
# Shapiro-Wilk normality test on the low-GI / high-GL subset, for every
# column after the first one; only the p-value is reported.
for column in low_gi_high_gl.columns[1:]:
    p_value = shapiro(low_gi_high_gl[column])[1]
    print(f'P-value for {column} column: {p_value}')
P-value for glycemic_index(GI) column: 2.6479616280994378e-05
P-value for glycemic_load column: 2.0963564395515277e-07