import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# agrega librerías que necesites
# --- Load the red and white wine-quality CSVs (semicolon-delimited) ---------
url_wine_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
url_wine_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
red = pd.read_csv(url_wine_red, delimiter=";")
white = pd.read_csv(url_wine_white, delimiter=";")

# read_csv already returns a DataFrame, so these names are plain aliases.
# Re-wrapping with pd.DataFrame(...) can yield a separate object (depending on
# the pandas version) that would miss the 'category' column added below.
wine_red = red
wine_white = white
# Bare expressions like the two below only render in an interactive session.
wine_red.head(5)
wine_white.head(5)

# Tag each row with its wine type before the two frames are combined.
wine_red['category'] = 'wine_red'
wine_white['category'] = 'wine_white'

# --- Combine both data sets ---------------------------------------------------
total_wine = pd.concat([wine_white, wine_red], ignore_index=True)
# Both historical names refer to the same frame, so columns added through one
# name are visible through the other.
total_wines = total_wine
total_wine

rows_wine_red, columns_wine_red = wine_red.shape
rows_wine_white, columns_wine_white = wine_white.shape
print(rows_wine_red)
print(columns_wine_red)
print(rows_wine_white)
print(columns_wine_white)

# --- Quality score overview ---------------------------------------------------
total_wines.info()
total_wines.describe()
quality_wine = total_wines['quality']
quality_wine
quality_wine.unique()
count_points = quality_wine.value_counts()
print(count_points)

sns.set_theme(style="whitegrid")
# Start a fresh figure so successive plots do not pile up on the same Axes.
plt.figure()
# Pass the Series as ``x`` explicitly; the first positional argument of
# countplot is not guaranteed to be ``x`` across seaborn versions.
sns.countplot(x=quality_wine).set(title='Calidad en Vinos')

# Bucket the numeric score: <5 -> Poor, 5..7 -> Medium, >=8 -> High.
total_wines['quality_category'] = total_wines['quality'].apply(
    lambda x: 'Poor' if x < 5 else ('Medium' if x < 8 else 'High')
).astype('category')
total_wines.tail()
total_wines.info()

sns.set_theme(style="whitegrid")
plt.figure()
graph_quality = sns.countplot(x="quality_category", data=total_wines).set(title='Categorías de calidad en los Vinos')

# --- One box plot per physico-chemical feature --------------------------------
ph = total_wines['pH']
density = total_wines['density']
alcohol = total_wines['alcohol']
residual_sugar = total_wines['residual sugar']
citric_acid = total_wines['citric acid']
volatile_acidity = total_wines['volatile acidity']
fixed_acidity = total_wines['fixed acidity']
chlorides = total_wines['chlorides']
free_sulfur_dioxide = total_wines['free sulfur dioxide']
total_sulfure_dioxide = total_wines['total sulfur dioxide']
sulphates = total_wines['sulphates']

for _series, _color, _title in (
    (ph, 'yellow', 'Distribución del pH'),
    (density, 'red', 'Distribución de la Densidad'),
    (alcohol, 'cyan', 'Distribución del Alcohol'),
    (residual_sugar, 'brown', 'Distribución del Azúcar'),
    (citric_acid, 'yellow', 'Distribución del Ácido Cítrico'),
    (volatile_acidity, 'red', 'Distribución de la Acidez Volátil'),
    (fixed_acidity, 'grey', 'Distribución del Ácido Fijo'),
    (chlorides, 'red', 'Distribución de los Cloruros'),
    (free_sulfur_dioxide, 'blue', 'Distribución del Dióxido de Azufre Libre'),
    (total_sulfure_dioxide, 'cyan', 'Distribución de Azufre Total'),
    (sulphates, 'green', 'Distribución de Sulfatos'),
):
    # Each box plot gets its own figure instead of overdrawing the previous one.
    plt.figure()
    sns.boxplot(x=_series, color=_color).set_title(_title)

# Independent copy used for outlier removal, leaving total_wines untouched.
total_wines_copy = total_wines.copy()
def outliers(df: pd.DataFrame, columna: str, factor: float = 1.5) -> pd.Index:
    """Return the index labels of rows whose ``columna`` value is an IQR outlier.

    A value is an outlier when it lies strictly below ``Q1 - factor * IQR`` or
    strictly above ``Q3 + factor * IQR``, where Q1/Q3 are the 25th/75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: Frame that contains the column to inspect.
        columna: Label of the column to inspect.
        factor: Multiplier applied to the IQR to build both fences.
            Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        The subset of ``df.index`` holding the labels of the outlier rows,
        in the same order as they appear in ``df``.
    """
    valores = df[columna]
    q1 = valores.quantile(0.25)
    q3 = valores.quantile(0.75)
    iqr = q3 - q1
    limite_inferior = q1 - factor * iqr
    limite_superior = q3 + factor * iqr
    # Strict comparisons: values sitting exactly on a fence are kept.
    fuera_de_rango = (valores < limite_inferior) | (valores > limite_superior)
    return df.index[fuera_de_rango]
# Collect the index labels of every outlier found in the screened columns.
# A row that is an outlier in more than one column appears once per column,
# so the printed length can exceed the number of distinct rows.
indices_list = []
for columna in ['volatile acidity', 'residual sugar', 'chlorides']:
    indices_list.extend(outliers(total_wines_copy, columna))
print(len(indices_list))
print(indices_list, '\n')
def remove(df: pd.DataFrame, ls) -> pd.DataFrame:
    """Return a copy of ``df`` without the rows whose index labels are in ``ls``.

    Args:
        df: Frame to filter; it is not modified.
        ls: Iterable of index labels to drop. Duplicates are allowed and are
            collapsed before dropping.

    Returns:
        A new frame with the listed rows removed.

    Raises:
        KeyError: If a label in ``ls`` is not present in ``df.index``
            (raised by ``DataFrame.drop``).
    """
    # Deduplicate and sort so the labels handed to drop() are deterministic.
    etiquetas = sorted(set(ls))
    return df.drop(etiquetas)
# --- Drop the flagged outlier rows and report how much data was lost ----------
total_wines_copy_clean = remove(total_wines_copy, indices_list)
# Bare expressions like the three below only render in an interactive session.
total_wines_copy_clean.shape
total_wines_copy_clean
total_wines_copy_clean.describe()
total_deleted_data = len(total_wines_copy) - len(total_wines_copy_clean)
print(total_deleted_data)
porc_deleted_data = total_deleted_data / len(total_wines_copy) * 100
print(porc_deleted_data)

# --- Pairwise relationships and correlations ----------------------------------
sns.pairplot(total_wines, corner=True, height=2)

# Correlations are only defined for numeric columns; selecting them by dtype
# keeps the label columns out regardless of their position in the frame.
df_heatmap = total_wines.select_dtypes(include='number')
df_heatmap
sns.set(rc={'figure.figsize': (10, 8)})
# Fresh figure so the heatmap is not drawn over an earlier plot.
plt.figure()
correlaciones = df_heatmap.corr()
sns.heatmap(correlaciones, annot=True)
# Features ranked by their correlation with the quality score.
correlaciones[['quality']].sort_values(by='quality', ascending=False)

sns.jointplot(x="alcohol", y="quality", data=total_wines, kind="scatter")

# --- Feature distribution per quality category: bar plot + violin plot --------
for _columna, _sufijo in (
    ('alcohol', 'del Alcohol'),
    ('citric acid', 'del Ácido Cítrico'),
    ('free sulfur dioxide', 'del Dióxido de Azufre Libre'),
):
    plt.figure()
    sns.barplot(x="quality_category", y=_columna, data=total_wines).set_title(
        f'Gráfica de barras {_sufijo}'
    )
    # The violin plot gets its own figure and a title naming the right chart type.
    plt.figure()
    sns.violinplot(x="quality_category", y=_columna, data=total_wines).set_title(
        f'Gráfica de violín {_sufijo}'
    )