# School name; the placeholder print below is meant to display it.
school: str = "Platzi"
# print(____)  # Fill in the variable declared above.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# agrega librerías que necesites
# Locations of the two semicolon-separated wine-quality CSV files.
url_wine_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
url_wine_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Download each file into its own DataFrame.
red = pd.read_csv(url_wine_red, sep=";")
white = pd.read_csv(url_wine_white, sep=";")

# Bare expressions such as the ones below only display a value when the
# code is executed interactively (for example as notebook cells).
red.head(5)
white.head(5)

# Tag every row with the colour of the file it came from so the two sets
# can still be told apart after they are combined.
red = red.assign(category='red')
white = white.assign(category='white')

red.shape
white.shape
# Stack the red and white samples into one frame with a fresh 0..n-1 index.
# ``DataFrame.append`` no longer exists in pandas 2.0+, so ``pd.concat`` is
# used; with ``ignore_index=True`` it yields the same combined frame.
total_wine = pd.concat([red, white], ignore_index=True)

# Structural sanity checks (displayed only in an interactive session):
# size, first rows, column dtypes, missing-value counts, summary statistics.
total_wine.shape
total_wine.head(5)
total_wine.dtypes
total_wine.isnull().sum()
total_wine.describe()
# Work with the raw quality score on its own.
quality = total_wine['quality']
quality.describe()
quality.value_counts()

# Bar chart with one bar per distinct quality score.
# The Series is passed as ``x=`` explicitly: newer seaborn releases accept
# only ``data`` positionally, so a positional Series is no longer treated
# as the variable to count.
countplot_quality = sns.countplot(x=quality)
countplot_quality.set_title('Histograma de quality')
def calidad(x):
    """Map a numeric quality score to a coarse quality label.

    Args:
        x: The quality score to classify.

    Returns:
        ``'Poor'`` for scores 3, 4 or 5, ``'Medium'`` for a score of 6 and
        ``'High'`` for any other value.
    """
    # NOTE: a wider 'Medium' band (5-7, with only 3-4 counted as 'Poor')
    # was previously considered and is not in effect.
    if x in (3, 4, 5):
        return 'Poor'
    if x == 6:
        return 'Medium'
    return 'High'
# Derive the coarse quality label for every wine and store it as a
# pandas categorical column. ``calidad`` is passed directly; wrapping it
# in a lambda added nothing.
total_wine['quality_category'] = (
    total_wine['quality'].apply(calidad).astype('category')
)
total_wine.info()

# Count of wines per label, shown in Poor -> Medium -> High order.
# The column is passed as ``x=`` because newer seaborn releases accept only
# ``data`` positionally.
countplot_quality_category = sns.countplot(
    x=total_wine['quality_category'],
    order=['Poor', 'Medium', 'High'],
)
countplot_quality_category.set_title('Histograma de quality category')
# One figure per feature: histogram, overall box plot, and box plot split
# by quality label, all sharing the same x-axis.
# The figure size does not change between iterations, so it is set once.
sns.set(rc={'figure.figsize': (11.7, 8.27)})

# Only the first 11 columns are plotted (presumably the measured
# properties, with the quality/label columns after them; verify against
# the dataset's column order).
for feature in total_wine.columns[:11]:
    f, (ax_hist, ax_boxt, ax_box) = plt.subplots(
        3, sharex=True, gridspec_kw={"height_ratios": (.5, .5, .5)}
    )
    # Values are passed as ``x=`` explicitly: newer seaborn releases treat a
    # positional Series as ``data``, which would not draw the box plot
    # along the shared x-axis.
    sns.histplot(x=total_wine[feature], ax=ax_hist)
    sns.boxplot(x=total_wine[feature], ax=ax_boxt)
    sns.boxplot(x=feature, y='quality_category', data=total_wine, ax=ax_box)
    ax_hist.set_title(feature, size=22)
    # The bottom panel carries the x label; blank it on the middle one.
    ax_boxt.set(xlabel='')
def _select_quality_category(wines, label):
    """Return the rows of *wines* whose ``quality_category`` equals *label*.

    The result is re-indexed from 0 and gains a 1-based ``New_ID`` column
    that identifies each row within its category.
    """
    subset = wines[wines['quality_category'] == label].reset_index(drop=True)
    subset['New_ID'] = subset.index + 1
    return subset


def _drop_iqr_outliers(frame, feature_columns, whisker=1.5):
    """Return the rows of *frame* lying inside the IQR fences of every feature.

    For each column in *feature_columns* the fences are
    ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, computed on the whole
    of *frame*. A row is kept only if its value is strictly between both
    fences for all of the features; rows keep their index labels.
    """
    features = frame[feature_columns]
    q1 = features.quantile(0.25)
    q3 = features.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - whisker * iqr
    upper = q3 + whisker * iqr
    # Compare every column against its own fences, then require all to pass.
    inside = (
        features.gt(lower, axis='columns') & features.lt(upper, axis='columns')
    ).all(axis='columns')
    return frame[inside]


# Only the first 11 columns are screened for outliers (presumably the
# measured properties; verify against the dataset's column order).
outlier_feature_columns = total_wine.columns[:11]

# Naming: ``*_or`` is the full subset of one quality label, the unsuffixed
# name is a separate object holding the same rows, and ``*_wo`` is the
# subset with outlier rows removed.
total_wine_poor_or = _select_quality_category(total_wine, 'Poor')
total_wine_poor = total_wine_poor_or.copy()
total_wine_poor_wo = _drop_iqr_outliers(total_wine_poor_or, outlier_feature_columns)

total_wine_medium_or = _select_quality_category(total_wine, 'Medium')
total_wine_medium = total_wine_medium_or.copy()
total_wine_medium_wo = _drop_iqr_outliers(total_wine_medium_or, outlier_feature_columns)

total_wine_high_or = _select_quality_category(total_wine, 'High')
total_wine_high = total_wine_high_or.copy()
total_wine_high_wo = _drop_iqr_outliers(total_wine_high_or, outlier_feature_columns)

# Recombine the cleaned subsets with a fresh 0..n-1 index.
# ``DataFrame.append`` no longer exists in pandas 2.0+, hence ``pd.concat``.
total_wine_final = pd.concat(
    [total_wine_poor_wo, total_wine_medium_wo, total_wine_high_wo],
    ignore_index=True,
)
# One figure per feature comparing the outlier-filtered data with the
# unfiltered data: histogram, overall box plot and per-label box plot of the
# filtered frame, then the per-label box plot of the unfiltered frame.
# The figure size does not change between iterations, so it is set once.
sns.set(rc={'figure.figsize': (11.7, 8.27)})

# Only the first 11 columns are plotted; both frames are addressed by the
# same column name.
for feature in total_wine_final.columns[:11]:
    f, (ax_hist, ax_boxt, ax_box, ax_boxo) = plt.subplots(
        4, sharex=True, gridspec_kw={"height_ratios": (.5, .5, .5, .5)}
    )
    # Values are passed as ``x=`` explicitly: newer seaborn releases treat a
    # positional Series as ``data``, which would not draw the box plot
    # along the shared x-axis.
    sns.histplot(x=total_wine_final[feature], ax=ax_hist)
    sns.boxplot(x=total_wine_final[feature], ax=ax_boxt)
    sns.boxplot(x=feature, y='quality_category', data=total_wine_final, ax=ax_box)
    sns.boxplot(x=feature, y='quality_category', data=total_wine, ax=ax_boxo)
    # Only the bottom panel keeps its x label.
    ax_boxt.set(xlabel='')
    ax_box.set(xlabel='')
    ax_hist.set_title(feature, size=22)
# Correlation matrix of the filtered data. ``New_ID`` is only a row
# identifier, so it is dropped first. Non-numeric columns (the string
# ``category`` and the categorical ``quality_category``) are excluded
# explicitly because ``DataFrame.corr`` in pandas 2.0+ raises on them
# instead of silently skipping them.
correlation_matriz = (
    total_wine_final.drop(['New_ID'], axis=1)
    .select_dtypes(include='number')
    .corr()
)
correlation_matriz
sns.heatmap(correlation_matriz)

# Correlation of every numeric column with ``quality``, strongest positive
# first.
correlation_matriz['quality'].sort_values(ascending=False)

# Subset of features examined against quality (presumably the ones most
# correlated with it; confirm against the ranking above). The two label
# columns are kept so the pair plot can colour points by quality label.
correlated_quality = total_wine_final[
    ['alcohol', 'density', 'chlorides', 'volatile acidity', 'quality_category', 'category']
]
correlation_matriz_quality = correlated_quality.select_dtypes(include='number').corr()
correlation_matriz_quality
sns.heatmap(correlation_matriz_quality)
sns.pairplot(correlated_quality, hue='quality_category')
# For each selected feature draw a two-panel figure: a bar plot on top and a
# violin plot below, both grouped by quality label in Poor -> Medium -> High
# order. The four figures differ only in the feature plotted and in the
# y-range of the bar panel, so they are generated from one table.
sns.set(rc={'figure.figsize': (4, 4)})

quality_order = ['Poor', 'Medium', 'High']

# feature -> (lower, upper) y-limits applied to the bar panel only.
bar_ylim_by_feature = {
    'alcohol': (7, 13),
    'density': (0.990, 0.999),
    'chlorides': (0.02, 0.07),
    'volatile acidity': (0.1, 0.5),
}

for feature, (y_low, y_high) in bar_ylim_by_feature.items():
    f, (ax_bar, ax_vio) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.6, .6)}
    )
    sns.barplot(x='quality_category', y=feature, data=total_wine_final,
                ax=ax_bar, order=quality_order)
    sns.violinplot(x='quality_category', y=feature, data=total_wine_final,
                   ax=ax_vio, order=quality_order)
    # The violin panel underneath already labels the shared x-axis.
    ax_bar.set(xlabel='')
    ax_bar.set_ylim(y_low, y_high)