# Name of the school; echoed right away to confirm the variable is defined.
school = "Platzi"
print(school)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Locations of the two wine-quality CSV files (one per wine colour).
url_wine_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
url_wine_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Both files are parsed with ';' as the field separator.
red = pd.read_csv(url_wine_red, sep=";")
white = pd.read_csv(url_wine_white, sep=";")

# Notebook-style preview of the first five rows of each frame.
red.head(n=5)
white.head(n=5)
# Tag every row with its wine colour so the origin survives the merge below.
red['category'] = 'red'
white['category'] = 'white'

# Stack the red rows on top of the white rows, discarding the original
# per-file row labels in favour of a fresh 0..n-1 index.
total_wine = pd.concat(objs=[red, white], ignore_index=True)

# Notebook-style inspection of the combined frame.
total_wine
total_wine.head()
total_wine.shape
total_wine.info()
total_wine.describe()
# The 'quality' column, inspected on its own.
quality = total_wine['quality']
quality.describe()
quality.unique()

# Bar chart with the number of wines per quality score.
sns.set_theme(style="darkgrid")
# NOTE: grap_quality stores whatever `.set(...)` returns, not the Axes itself.
grap_quality = sns.countplot(data=total_wine, x="quality").set(
    title='Grafica de la calidad para todos los vinos'
)
# Bucket the numeric quality score into three labels:
#   score < 5  -> 'Poor'
#   score < 8  -> 'Medium'
#   otherwise  -> 'High'
total_wine['quality_category'] = total_wine['quality'].map(
    lambda score: 'Poor' if score < 5 else 'Medium' if score < 8 else 'High'
)
total_wine.tail()

# Store the labels as a pandas categorical instead of plain strings.
total_wine['quality_category'] = total_wine['quality_category'].astype('category')
total_wine.info()

# Bar chart with the number of wines per quality bucket.
sns.set_theme(style="darkgrid")
# NOTE: grap_quality stores whatever `.set(...)` returns, not the Axes itself.
grap_quality = sns.countplot(data=total_wine, x="quality_category").set(
    title='Grafica de la calidad para todos los vinos por categoria'
)
# One boxplot per feature column of total_wine. The section headings below
# were bare text lines (not valid Python); they are now comments.

# 0. Fixed acidity
fa = total_wine['fixed acidity']
sns.boxplot(x=fa).set_title('Boxplot de la acidez fija');

# 1. Volatile acidity
va = total_wine['volatile acidity']
sns.boxplot(x=va, color='aqua').set_title('Boxplot de la acidez volátil');

# 2. Citric acid
ac = total_wine['citric acid']
sns.boxplot(x=ac, color='lime').set_title('Boxplot del ácido cítrico');

# 3. Residual sugar
rs = total_wine['residual sugar']
sns.boxplot(x=rs, color='salmon').set_title('Boxplot del azúcar residual ');

# 4. Chlorides
cloro = total_wine['chlorides']
sns.boxplot(x=cloro, color='gold').set_title('Boxplot de los cloruros');

# 5. Free sulfur dioxide
f_sul_d = total_wine['free sulfur dioxide']
sns.boxplot(x=f_sul_d, color='crimson').set_title('Boxplot del dióxido de azufre libre');

# 6. Total sulfur dioxide
t_sul_d = total_wine['total sulfur dioxide']
sns.boxplot(x=t_sul_d, color='teal').set_title('Boxplot del dióxido de azufre total');

# 7. Density
density = total_wine['density']
sns.boxplot(x=density, color='cyan').set_title('Boxplot de la densidad ');

# 8. pH
ph = total_wine['pH']
sns.boxplot(x=ph, color='r').set_title('Boxplot del pH');

# 9. Sulphates
sul = total_wine['sulphates']
sns.boxplot(x=sul, color='g').set_title('Boxplot de los sulfatos');
# 10. Alcohol
# Plot the 'alcohol' column itself (this section previously re-drew the
# sulphates boxplot under the alcohol heading).
alcohol = total_wine['alcohol']
sns.boxplot(x=alcohol, color='b').set_title('Boxplot del alcohol');
# Summary statistics of the full data set before any cleaning.
total_wine.describe()

# Work on an independent copy so the outlier removal leaves total_wine intact.
total_wine_copy = total_wine.copy(deep=True)
# Outlier removal: helper that returns the index labels of the outlier rows.
def outliers(df, col, factor=1.5):
    """Return the index labels of the rows whose ``col`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR``, where Q1 and Q3 are the 0.25 and 0.75 quantiles of
    ``df[col]`` and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to inspect.
        col: Name of the column in which to look for outliers.
        factor: Multiplier applied to the IQR to build both fences.
            Defaults to 1.5.

    Returns:
        The subset of ``df.index`` (a pandas Index, usable with ``list.extend``
        and ``DataFrame.drop``) holding the labels of the outlier rows.
    """
    values = df[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    is_outlier = (values < lower_fence) | (values > upper_fence)
    return df.index[is_outlier]
# Collect the index labels of the outliers found in each column of interest.
# A label appears once per column in which the row is an outlier, so the list
# may contain duplicates.
index_list = []
for columna in ['volatile acidity', 'residual sugar', 'chlorides']:
    index_list.extend(outliers(total_wine_copy, columna))

# Report how many labels were collected (duplicates included) and list them.
print(len(index_list))
print(index_list, '\n')
def remove(df, ls):
    """Return a new DataFrame without the rows whose index labels are in ``ls``.

    Args:
        df: DataFrame to filter; it is not modified.
        ls: Iterable of index labels to drop. Duplicates are allowed and are
            collapsed before dropping.

    Returns:
        A new DataFrame containing every row of ``df`` whose label is not in
        ``ls``.

    Raises:
        KeyError: If a label in ``ls`` is not present in ``df.index``
            (default behaviour of ``DataFrame.drop``).
    """
    unique_labels = sorted(set(ls))
    return df.drop(index=unique_labels)
# Drop every collected outlier row from the working copy.
total_wine_copy_clean = remove(total_wine_copy, index_list)

# Notebook-style inspection of the cleaned frame.
total_wine_copy_clean.shape
total_wine_copy_clean.describe()
total_wine_copy_clean

# Number of rows removed, and the share of the original frame they represent.
datos_eliminados = total_wine_copy.shape[0] - total_wine_copy_clean.shape[0]
porcentaje_datos_eliminados = datos_eliminados / total_wine_copy.shape[0] * 100
print(
    f'Datos del dataframe elimandos: {datos_eliminados}, '
    f'procentaje de los datos del dataframe elimnados: {porcentaje_datos_eliminados}'
)
# Pairwise scatter plots of total_wine (lower triangle only).
sns.pairplot(data=total_wine, height=2, corner=True);

# Keep the first 12 columns of total_wine for the correlation heatmap.
df_for_heatmap = total_wine.iloc[:, 0:12]
df_for_heatmap

# Enlarge the default figure, then draw the annotated correlation matrix.
sns.set_theme(rc={'figure.figsize': (15, 8)})
correlation_matrix = df_for_heatmap.corr()
sns.heatmap(correlation_matrix, annot=True);
# Rank the numeric columns by their correlation with 'quality'.
# total_wine also holds non-numeric columns ('category', 'quality_category');
# DataFrame.corr() raises on those in pandas >= 2.0, so restrict the frame to
# numeric columns first.
total_wine.select_dtypes(include='number').corr()[['quality']].sort_values(by='quality', ascending=False)

# Scatter plot of alcohol against quality with marginal distributions.
sns.jointplot(x="alcohol", y="quality", data=total_wine, kind="scatter");
# Bar plot and violin plot of each feature, grouped by quality_category.
# The section headings below were bare text lines (not valid Python); they
# are now comments.

# 0. Fixed acidity
sns.barplot(x="quality_category", y="fixed acidity", data=total_wine).set_title('Grafíca de barras de la acidez fija');
sns.violinplot(x="quality_category", y="fixed acidity", data=total_wine).set_title('Grafíca de violín de la acidez fija');

# 1. Volatile acidity
sns.barplot(x="quality_category", y="volatile acidity", data=total_wine).set_title('Grafíca de barras de la acidez volátil');
g1 = sns.violinplot(x="quality_category", y="volatile acidity", data=total_wine)
g1.set_title('Grafíca de violin de la acidez volátil')
g1.set_ylim(0, 1.5);

# 2. Citric acid
sns.barplot(x="quality_category", y="citric acid", data=total_wine).set_title('Grafíca de barras del ácido cítrico');
g2 = sns.violinplot(x="quality_category", y="citric acid", data=total_wine)
g2.set_title('Grafíca de violin del ácido cítrico')
g2.set_ylim(-0.25, 1)

# 3. Residual sugar
sns.barplot(x="quality_category", y="residual sugar", data=total_wine).set_title('Grafíca de barras del azúcar residual');
g3 = sns.violinplot(x="quality_category", y="residual sugar", data=total_wine)
g3.set_ylim(0, 25)
g3.set_title('Grafíca de violin del azúcar residual');

# 4. Chlorides
sns.barplot(x="quality_category", y="chlorides", data=total_wine).set_title('Grafíca de barras de los cloruros');
g4 = sns.violinplot(x="quality_category", y="chlorides", data=total_wine)
g4.set_ylim(0, .20)
g4.set_title('Grafíca de violin de los cloruros');

# 5. Free sulfur dioxide
sns.barplot(x="quality_category", y="free sulfur dioxide", data=total_wine).set_title('Grafíca de barras del dióxido de azufre libre');
g5 = sns.violinplot(x="quality_category", y="free sulfur dioxide", data=total_wine)
g5.set_ylim(0, 160)
g5.set_title('Grafíca de violin del dióxido de azufre libre ');

# 6. Total sulfur dioxide
sns.barplot(x="quality_category", y="total sulfur dioxide", data=total_wine).set_title('Grafíca de barras del dióxido de azufre total');
g6 = sns.violinplot(x="quality_category", y="total sulfur dioxide", data=total_wine)
g6.set_ylim(0, 320)
g6.set_title('Grafíca de violin del dióxido de azufre total');
# 7. Density
sns.barplot(x="quality_category", y="density", data=total_wine).set_title('Grafíca de barras de la densidad ');
g7 = sns.violinplot(x="quality_category", y="density", data=total_wine)
g7.set_ylim(0.985, 1.005)
# Title names the plotted variable (it used to read "dióxido de la densidad",
# left over from the sulfur dioxide sections).
g7.set_title('Grafíca de violin de la densidad ');

# 8. pH
sns.barplot(x="quality_category", y="pH", data=total_wine).set_title('Grafíca de barras del pH ');
g8 = sns.violinplot(x="quality_category", y="pH", data=total_wine)
# Same title correction: pH is not a "dióxido".
g8.set_title('Grafíca de violin del pH ');
# Section headings below were bare text lines (not valid Python); they are
# now comments.

# 9. Sulphates
sns.barplot(x="quality_category", y="sulphates", data=total_wine).set_title('Grafíca de barras de los sulfatos ');
g9 = sns.violinplot(x="quality_category", y="sulphates", data=total_wine)
g9.set_ylim(0, 1.25)
g9.set_title('Grafíca de violin de los sulfatos');

# 10. Alcohol
sns.barplot(x="quality_category", y="alcohol", data=total_wine).set_title('Grafíca de barras del alcohol ');
g10 = sns.violinplot(x="quality_category", y="alcohol", data=total_wine)
g10.set_title('Grafíca de violin del alcohol');