# Warm-up exercise: keep the school name in a variable and echo it back.
school = "Platzi"
print(school)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# agrega librerías que necesites
# Locations of the red and white wine-quality CSV files (semicolon-separated).
url_wine_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
url_wine_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Download each file into its own DataFrame (red first, then white).
red, white = (pd.read_csv(url, sep=";") for url in (url_wine_red, url_wine_white))

# Preview the first five rows of each frame (only rendered in a notebook/REPL).
red.head(5)
white.head(5)
# Tag every row with its wine colour so the origin is still known after stacking.
red = red.assign(Category='red')
white = white.assign(Category='white')

# Stack both frames vertically and renumber the rows 0..n-1.
# concat is used (not merge): this appends rows rather than joining on a key.
total_wine = pd.concat((red, white), ignore_index=True)

# Quick inspection of the combined dataset: sample rows, dimensions,
# column dtypes / non-null counts, and summary statistics.
total_wine.head(5)
total_wine.shape
total_wine.info()
total_wine.describe()
# The 'quality' score column is the variable studied in the rest of the script.
quality = total_wine['quality']

# Summary statistics and per-score frequencies (rendered only in a notebook/REPL).
quality.describe()
quality.value_counts()

# Horizontal bar chart with the number of wines for each quality score.
sns.set_theme(style="darkgrid")
sns.countplot(y='quality', data=total_wine)
plt.show()
def _quality_label(score):
    """Return 'Poor' for scores <= 5, 'High' for scores >= 7, and 'Medium' otherwise."""
    if score <= 5:
        return 'Poor'
    if score >= 7:
        return 'High'
    return 'Medium'


# Bucket every numeric score into one of the three labels above.
total_wine['quality_category'] = total_wine['quality'].map(_quality_label)
total_wine.tail()

# Convert the label column to the pandas 'category' dtype.
total_wine['quality_category'] = total_wine['quality_category'].astype('category')
total_wine.info()

# Horizontal bar chart with the number of wines in each quality bucket.
sns.set_theme(style="dark")
sns.countplot(y='quality_category', data=total_wine)
plt.xlabel('Wine')
plt.ylabel('Quality')
plt.show()
# Share of high-quality (score >= 7) and medium-quality (score == 6) wines,
# expressed as whole-number percentages of all rows.
n_wines = len(quality)

# Summing a boolean mask counts the rows that satisfy the condition.
high = int((quality >= 7).sum())
medium = int((quality == 6).sum())

por_high = round(high * 100 / n_wines)
por_medium = round(medium * 100 / n_wines)

print('Los vinos de alta calidad solamente representan un',por_high,'% del total.\nMientras que la mayoría son los de calidad media con su',por_medium,'%')
# --- Dispersion measures for the quality score ---

# Standard deviation (rendered only in a notebook/REPL).
quality.std()

# Range: distance between the largest and smallest score.
rango = quality.max() - quality.min()
rango

# Five-number summary: minimum, first quartile, median, third quartile, maximum.
min_val = quality.quantile(0)
q1 = quality.quantile(0.25)
median = quality.median()
q3 = quality.quantile(0.75)
max_val = quality.quantile(1)
print(min_val, q1, median, q3, max_val)

# Interquartile range: width of the middle 50% of the scores.
iqr = q3 - q1
iqr

# Lower / upper fences at 1.5 * IQR beyond the quartiles.
minlimit = q1 - 1.5*iqr
maxlimit = q3 + 1.5*iqr
print(minlimit, maxlimit)
# Histogram of the quality scores, grouped into five bins.
plt.hist(quality, histtype='bar', bins=5)
plt.xlabel('Quality')
plt.ylabel('Wine')
plt.show()

# Box plot of the quality scores with the median and the mean drawn as vertical lines.
plt.figure(figsize=(8, 4))
box_ax = sns.boxplot(x='quality', data=total_wine)
box_ax.axvline(quality.median(), linestyle='-', color='magenta', label='median')
box_ax.axvline(quality.mean(), linestyle='--', color='orange', label='mean')
box_ax.legend()
plt.show()
# Count the outliers of the quality score.
# The cut-offs are the 1.5 * IQR fences computed earlier in this script
# (minlimit / maxlimit) instead of hand-copied literals, so the count stays
# correct if the underlying data -- and therefore the quartiles -- change.
out_max = int((quality > maxlimit).sum())   # scores above the upper fence
out_min = int((quality < minlimit).sum())   # scores below the lower fence
out = out_max + out_min

# Outliers as a whole-number percentage of all rows.
outpor = round(out * 100 / len(quality))
print('Hay',out, 'valores atípicos, el', outpor,'% del total')

# 3. Remove the outliers in the next cell if necessary.
# (The author chose not to remove them.)
print('No :v')
# Correlation analysis.
# total_wine also holds 'Category' (strings) and 'quality_category' (categorical).
# Recent pandas versions no longer drop such columns silently in DataFrame.corr()
# and raise instead, so the numeric columns are selected explicitly. This form
# also works on older pandas releases that lack the numeric_only argument.
corr_matrix = total_wine.select_dtypes(include='number').corr()
corr_matrix

# Full correlation heatmap; the colour scale is clipped to [-0.5, 0.5].
plt.figure(figsize=(16,16))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=5, linecolor='aquamarine',
            vmin=-0.5, vmax=0.5)
plt.show()

# Correlation of every numeric feature with 'quality', strongest positive first.
corr_matrix[['quality']].sort_values(by='quality', ascending=False)

# Reduced frame: drop 'quality' itself and the features listed below, then plot
# the correlations among the remaining numeric columns.
wine_corr = total_wine.drop(['quality', 'fixed acidity', 'citric acid', 'residual sugar', 'free sulfur dioxide', 'total sulfur dioxide', 'pH', 'sulphates'], axis=1)
sns.heatmap(wine_corr.select_dtypes(include='number').corr(), annot=True, cmap='Spectral', linewidths=2, linecolor='indigo',
            vmin=-0.5, vmax=0.5)
plt.show()
# Alcohol vs quality category: bar plot (left) and violin plot (right).
# The theme is applied BEFORE the figure is created: matplotlib reads the style
# settings when axes are built, so setting the theme afterwards would leave this
# figure with whatever theme was active previously.
sns.set_theme(style='darkgrid', palette='deep', font_scale=1)
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
sns.barplot(data=total_wine, x='quality_category', y='alcohol', ax=axs[0])
sns.violinplot(data=total_wine, x='quality_category', y='alcohol', ax=axs[1])
f.tight_layout()
# Show the figure explicitly, like the other plots in this script.
plt.show()
# Salt (chlorides) vs quality category: bar plot (left) and violin plot (right).
# The theme is applied BEFORE the figure is created: matplotlib reads the style
# settings when axes are built, so setting the theme afterwards would not affect
# the axes of this figure.
sns.set_theme(style='darkgrid', palette='deep', font_scale=1)
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
sns.barplot(data=total_wine, x='quality_category', y='chlorides', ax=axs[0])
sns.violinplot(data=total_wine, x='quality_category', y='chlorides', ax=axs[1])
f.tight_layout()
# Show the figure explicitly, like the other plots in this script.
plt.show()
# Acetic acid (volatile acidity) vs quality category: bar plot (left) and violin plot (right).
# The theme is applied BEFORE the figure is created: matplotlib reads the style
# settings when axes are built, so setting the theme afterwards would not affect
# the axes of this figure.
sns.set_theme(style='darkgrid', palette='deep', font_scale=1)
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
sns.barplot(data=total_wine, x='quality_category', y='volatile acidity', ax=axs[0])
sns.violinplot(data=total_wine, x='quality_category', y='volatile acidity', ax=axs[1])
f.tight_layout()
# Show the figure explicitly, like the other plots in this script.
plt.show()
# Density vs quality category: bar plot (left) and violin plot (right).
# The theme is applied BEFORE the figure is created: matplotlib reads the style
# settings when axes are built, so setting the theme afterwards would not affect
# the axes of this figure.
sns.set_theme(style='darkgrid', palette='deep', font_scale=1)
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
sns.barplot(data=total_wine, x='quality_category', y='density', ax=axs[0])
# Restrict the bar plot's y-axis to [smallest observed density, 1] instead of
# starting at zero.
axs[0].axis(ymin=total_wine['density'].min(), ymax=1)
sns.violinplot(data=total_wine, x='quality_category', y='density', ax=axs[1])
f.tight_layout()
# Show the figure explicitly, like the other plots in this script.
plt.show()
# Density relationship: residual sugar against alcohol, one panel per wine
# colour, points coloured by quality bucket.
sns.set_theme(style='darkgrid', palette='deep', font_scale=1)
sns.relplot(
    data=total_wine,
    x='alcohol',
    y='residual sugar',
    col='Category',
    hue='quality_category',
)
# Limit the y-axis from the smallest observed sugar value up to 25.
sugar_floor = total_wine['residual sugar'].min()
plt.ylim(sugar_floor, 25)
plt.show()
# Residual sugar vs quality category: bar plot (left) and violin plot (right).
# The theme is applied BEFORE the figure is created: matplotlib reads the style
# settings when axes are built, so setting the theme afterwards would not affect
# the axes of this figure.
sns.set_theme(style='darkgrid', palette='deep', font_scale=1)
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
sns.barplot(data=total_wine, x='quality_category', y='residual sugar', ax=axs[0])
sns.violinplot(data=total_wine, x='quality_category', y='residual sugar', ax=axs[1])
f.tight_layout()
# Show the figure explicitly, like the other plots in this script.
plt.show()
# Residual sugar by wine type (red vs white): bar plot (left) and violin plot (right).
# NOTE: this heading used to be a bare line of text without '#', which is a
# SyntaxError and stopped the whole script from parsing.
sugar = ["#BC2C0D", "#F9E79F"]
# The theme and the two-colour palette are applied BEFORE the figure is created:
# matplotlib reads these settings when axes are built, so setting them afterwards
# may not reach the axes of this figure.
sns.set_theme(style='darkgrid', palette=sugar, font_scale=1)
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
sns.barplot(data=total_wine, x='Category', y='residual sugar', ax=axs[0])
# Restrict the bar plot's y-axis to [smallest observed sugar value, 10].
axs[0].axis(ymin=total_wine['residual sugar'].min(), ymax=10)
sns.violinplot(data=total_wine, x='Category', y='residual sugar', ax=axs[1])
f.tight_layout()
# Show the figure explicitly, like the other plots in this script.
plt.show()
# Residual sugar per quality bucket, split into one panel per wine colour.
# The same chart is drawn twice: first as violins, then as bars.
flatui = ["#009BFF", "#FF9300", "#00FF74"]
for plot_kind in ('violin', 'bar'):
    sns.set(style='darkgrid', palette=flatui, font_scale=1)
    sns.catplot(
        data=total_wine,
        x='quality_category',
        y='residual sugar',
        kind=plot_kind,
        col='Category',
        dodge=True,
    )
    plt.show()
# Requires the third-party `session_info` package
# (in a notebook: `!pip install session_info`).
# NOTE(review): show() is presumably used to report the versions of the
# packages loaded in this session for reproducibility -- confirm against the
# session_info documentation.
import session_info
session_info.show()