# Name of the school; echoed to stdout right after being declared.
school = "Platzi"
print(school)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# agrega librerías que necesites
# Remote locations of the two wine-quality CSV files. They are used as a
# fallback when the files are not present in the working directory.
url_wine_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
url_wine_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'


def _read_wine_csv(local_path, fallback_url):
    """Read a ';'-delimited wine CSV, preferring the local copy.

    Args:
        local_path: Path of the CSV file on disk.
        fallback_url: URL read only when ``local_path`` does not exist.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        return pd.read_csv(local_path, delimiter=";")
    except FileNotFoundError:
        # No local copy: read the same dataset from its declared URL.
        return pd.read_csv(fallback_url, delimiter=";")


red = _read_wine_csv('winequality-red.csv', url_wine_red)
white = _read_wine_csv('winequality-white.csv', url_wine_white)
red.head(5)
white.head(5)
# Tag each frame with its wine colour so rows remain distinguishable once
# the two datasets are stacked into a single frame.
red['category'] = 'red'
red.head()
white['category'] = 'white'
white.head()

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the rows the same way, and ignore_index=True renumbers
# them 0..n-1 exactly as the previous append call did.
total_wine = pd.concat([red, white], ignore_index=True)
total_wine

# Basic shape and schema overview of the combined dataset.
np.shape(total_wine)
print(f"Filas: {total_wine.shape[0]}")
print(f"Columnas: {total_wine.shape[1]}")
total_wine.info()
total_wine.describe()
# Descriptive statistics of the quality column, printed one per line.
quality = total_wine['quality']
for _label, _stat in (
    ("Valor de la moda: ", quality.mode()),
    ("Valor de la media: ", quality.mean()),
    ("Valor de la mediana: ", quality.median()),
    ("Valor Maximo: ", quality.max()),
    ("Valor Minimo: ", quality.min()),
):
    print(f"{_label}{_stat!s}")
quality

# Distinct quality scores and how many rows carry each of them.
values, distribucion = np.unique(quality, return_counts=True)
(values, distribucion)

# Plot the counts per score, first with matplotlib and then with seaborn.
plt.bar(values, distribucion)
sns.set_theme(style='darkgrid')
sns.countplot(data=total_wine, x='quality')
plt.title('Distribución de la Calidad del Vino')
def _quality_band(score):
    """Return 'High' for scores >= 7, 'Medium' for exactly 6, else 'Poor'."""
    if score >= 7:
        return 'High'
    if score == 6:
        return 'Medium'
    return 'Poor'


# Derive a three-level label from the numeric quality score.
total_wine['quality_category'] = total_wine['quality'].apply(_quality_band)
total_wine.tail()
# NOTE: This chart is empty.
# The chart was probably not set up properly in the notebook.
# Rescale every numeric column as (value - median) / standard deviation so
# all features can share a single axis in the box plot below.
# Only numeric columns are used: median() and std() are not defined for the
# text/categorical label columns and pandas >= 2.0 raises on them. Selecting
# by dtype also makes this independent of when 'quality_category' is
# converted to a categorical dtype.
_numeric_wine = total_wine.select_dtypes(include='number')
df_outliers = (_numeric_wine - _numeric_wine.median()) / _numeric_wine.std()

# Store the quality label as a pandas categorical and re-inspect the schema.
total_wine.quality_category = total_wine.quality_category.astype('category')
total_wine.info()

# Count of wines per quality label, one panel per wine colour.
sns.set_theme(style='darkgrid')
sns.catplot(x="quality_category", hue="quality_category", col="category", kind="count", data=total_wine)

# Horizontal box plot of the rescaled features.
plt.figure(figsize=(10,10))
sns.boxplot(data=df_outliers,dodge=True, orient='h')
plt.title('Distribución de los Outliers por cada categoria')
# 3. Remove the outliers in the next cell if necessary.
# Pairwise correlation between the numeric columns. The label columns
# ('category', 'quality_category') are excluded explicitly because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
wine_correlation = total_wine.select_dtypes(include='number').corr()
wine_correlation

# Heat map of the full matrix. Correlation coefficients span [-1, 1], so the
# colour scale covers that whole range; a lower bound of 0.5 would paint every
# weak or negative correlation with the same colour.
plt.figure(figsize=(10,10))
sns.heatmap(wine_correlation,annot=True,cmap='coolwarm',linewidths=5,linecolor='black',vmin=-1,vmax=1,cbar=True)
plt.title('Mapa de Calor de Correlacion de las variables')

# Features ranked by their correlation with quality, strongest positive first.
wine_correlation[['quality']].sort_values(by='quality', ascending = False)
# Subset of features chosen for a closer look at their relation to quality,
# plus the quality score itself and the two label columns.
total_wine_qcorr = total_wine[["fixed acidity", "chlorides","alcohol","citric acid",'quality','category','quality_category']]
total_wine_qcorr.head()
total_wine_qcorr.describe()

# Pairwise scatter plots, coloured first by numeric score and then by label.
sns.pairplot(total_wine_qcorr,
hue= 'quality', palette='dark', kind = 'scatter');
sns.pairplot(total_wine_qcorr,
hue= 'quality_category', palette='dark', kind = 'scatter');

# Correlation matrix of the subset. Label columns are dropped first because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
_qcorr_matrix = total_wine_qcorr.select_dtypes(include='number').corr()
sns.set(rc={'figure.figsize':(10,10)})
sns.heatmap(_qcorr_matrix,annot=True,linewidths=5,linecolor='white',vmin=-1,vmax=1)
sns.set_theme(style='ticks')
plt.title('Mapa de Calor de las variales con mayor correlacion con Quality ')

# Flatten the correlation matrix into a Series indexed by (column, row)
# pairs. The raw data frame was being unstacked here before, which yields
# the original observations rather than correlation values.
valores_correlacion = _qcorr_matrix.unstack()
valores_correlacion
def _scatter_panel(x, y, hue, title, xlabel, ylabel):
    """Draw one scatter plot of total_wine_qcorr with labels and a side legend."""
    sns.set_theme(style='darkgrid')
    sns.scatterplot(data=total_wine_qcorr, x=x, y=y, hue=hue)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    # Anchor the legend just outside the right edge of the axes.
    plt.legend(loc='center', bbox_to_anchor=(1.12, 0.5))


# One entry per plot, drawn in this order.
for _panel in (
    dict(x='alcohol', y='quality', hue='quality',
         title='Correlación entre Calidad y Alcohol',
         xlabel='alcohol', ylabel='quality'),
    dict(x='quality_category', y='alcohol', hue='quality_category',
         title='Correlación entre Categoria de Calidad y Alcohol',
         xlabel='quality category', ylabel='alcohol'),
    dict(x='citric acid', y='quality', hue='quality',
         title='Correlación entre Calidad y Acido Citrico',
         xlabel='citric acid', ylabel='quality'),
    dict(x='chlorides', y='quality', hue='quality',
         title='Correlación entre Calidad y Chlorides',
         xlabel='chlorides', ylabel='quality'),
):
    _scatter_panel(**_panel)
# Positive-correlation plots
# Quality Category - Alcohol
# Side-by-side bar plot (left) and violin plot (right) of alcohol per
# quality label, each with its own fixed y-range.
fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2)
for _draw, _axis, _y_range in (
    (sns.barplot, ax1, (0, 16)),
    (sns.violinplot, ax2, (0, 16)),
):
    _draw(data=total_wine_qcorr, x='quality_category', y='alcohol', ax=_axis)
    _axis.set_ylim(*_y_range)
fig.tight_layout()
# Quality Category - Citric Acid
# Side-by-side bar plot (left) and violin plot (right) of citric acid per
# quality label, each with its own fixed y-range.
fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2)
for _draw, _axis, _y_range in (
    (sns.barplot, ax1, (0, 1)),
    (sns.violinplot, ax2, (-0.5, 2)),
):
    _draw(data=total_wine_qcorr, x='quality_category', y='citric acid', ax=_axis)
    _axis.set_ylim(*_y_range)
fig.tight_layout()
# Negative-correlation plots
# Quality Category - Chlorides
# Side-by-side bar plot (left) and violin plot (right) of chlorides per
# quality label, each with its own fixed y-range.
fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2)
for _draw, _axis, _y_range in (
    (sns.barplot, ax1, (0, 0.08)),
    (sns.violinplot, ax2, (-0.05, 0.675)),
):
    _draw(data=total_wine_qcorr, x='quality_category', y='chlorides', ax=_axis)
    _axis.set_ylim(*_y_range)
fig.tight_layout()
# Quality Category - Fixed Acidity
# Side-by-side bar plot (left) and violin plot (right) of fixed acidity per
# quality label, each with its own fixed y-range.
fig, [ax1, ax2] = plt.subplots(nrows=1, ncols=2)
for _draw, _axis, _y_range in (
    (sns.barplot, ax1, (0, 9)),
    (sns.violinplot, ax2, (-1, 17)),
):
    _draw(data=total_wine_qcorr, x='quality_category', y='fixed acidity', ax=_axis)
    _axis.set_ylim(*_y_range)
fig.tight_layout()