import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# add any other libraries you need
# Source CSVs of the UCI wine-quality dataset (semicolon-separated).
url_wine_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
url_wine_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
# Download both tables with the same parser settings.
red, white = (
    pd.read_csv(source_url, sep=";")
    for source_url in (url_wine_red, url_wine_white)
)
# Preview the first five rows of the red-wine table.
red.head(5)
fixed acidityfloat64
volatile acidityfloat64
0
7.4
0.7
1
7.8
0.88
2
7.8
0.76
3
11.2
0.28
4
7.4
0.7
# Preview the first five rows of the white-wine table.
white.head(5)
fixed acidityfloat64
volatile acidityfloat64
0
7
0.27
1
6.3
0.3
2
8.1
0.28
3
7.2
0.23
4
7.2
0.23
# Tag every row with its wine colour so the origin survives the merge.
red['category'] = 'red'
white['category'] = 'white'
# Stack both tables into one frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and also works on older pandas releases.
total_wine = pd.concat([red, white], ignore_index=True)
total_wine
fixed acidityfloat64
3.8 - 15.9
volatile acidityfloat64
0.08 - 1.58
0
7.4
0.7
1
7.8
0.88
2
7.8
0.76
3
11.2
0.28
4
7.4
0.7
5
7.4
0.66
6
7.9
0.6
7
7.3
0.65
8
7.8
0.58
9
7.5
0.5
# Dimensions of the combined table as a (rows, columns) tuple.
total_wine.shape
# Print each column's dtype and non-null count, plus the frame's memory usage.
total_wine.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 6497 non-null float64
1 volatile acidity 6497 non-null float64
2 citric acid 6497 non-null float64
3 residual sugar 6497 non-null float64
4 chlorides 6497 non-null float64
5 free sulfur dioxide 6497 non-null float64
6 total sulfur dioxide 6497 non-null float64
7 density 6497 non-null float64
8 pH 6497 non-null float64
9 sulphates 6497 non-null float64
10 alcohol 6497 non-null float64
11 quality 6497 non-null int64
12 category 6497 non-null object
dtypes: float64(11), int64(1), object(1)
memory usage: 660.0+ KB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
total_wine.describe()
fixed acidityfloat64
volatile acidityfloat64
count
6497
6497
mean
7.215307065
0.3396659997
std
1.296433758
0.1646364741
min
3.8
0.08
25%
6.4
0.23
50%
7
0.29
75%
7.7
0.4
max
15.9
1.58
# Keep the quality score column under a short name for the summaries below.
quality = total_wine['quality']
# Summary statistics of the quality scores.
quality.describe()
# Number of wines at each quality score.
quality.value_counts()
# Bar chart of how many wines received each quality score.
plt.figure(figsize=(15, 9))
sns.countplot(data=total_wine, x='quality')
# show must be *called*; the bare attribute `plt.show` is a no-op expression
# that leaves the figure open for later plotting calls to draw on.
plt.show()
# Bucket the quality score: below 5 -> "poor", above 7 -> "high",
# everything else -> "medium".
quality_scores = total_wine['quality']
total_wine['quality_category'] = np.select(
    [quality_scores < 5, quality_scores > 7],
    ["poor", "high"],
    default="medium",
)
# Inspect the last rows to check the new column.
total_wine.tail()
fixed acidityfloat64
volatile acidityfloat64
6493
6.6
0.32
6492
6.2
0.21
6494
6.5
0.24
6495
5.5
0.29
6496
6
0.21
# Store the quality bucket as a pandas categorical rather than plain strings.
total_wine['quality_category'] = total_wine['quality_category'].astype('category')
# Re-print the column overview to confirm the new dtype.
total_wine.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 6497 non-null float64
1 volatile acidity 6497 non-null float64
2 citric acid 6497 non-null float64
3 residual sugar 6497 non-null float64
4 chlorides 6497 non-null float64
5 free sulfur dioxide 6497 non-null float64
6 total sulfur dioxide 6497 non-null float64
7 density 6497 non-null float64
8 pH 6497 non-null float64
9 sulphates 6497 non-null float64
10 alcohol 6497 non-null float64
11 quality 6497 non-null int64
12 category 6497 non-null object
13 quality_category 6497 non-null category
dtypes: category(1), float64(11), int64(1), object(1)
memory usage: 666.5+ KB
# Bar chart of how many wines fall into each quality bucket.
plt.figure(figsize=(15, 9))
sns.countplot(data=total_wine, x='quality_category')
# show must be *called*; the bare attribute `plt.show` is a no-op expression
# that leaves the figure open for later plotting calls to draw on.
plt.show()
# Scatter plots coloured by quality bucket. Each one gets its own figure:
# sns.scatterplot draws on the current axes, so two back-to-back calls would
# otherwise overlay unrelated variables on a single chart.
plt.figure()
sns.scatterplot(data=total_wine, x='residual sugar', y='alcohol', hue="quality_category")
plt.show()

plt.figure()
sns.scatterplot(data=total_wine, x='fixed acidity', y='volatile acidity', hue="quality_category")
plt.show()
# Horizontal box plots of selected measurements, one box per quality bucket.
# Each entry is (column to plot, figure height in inches); the width is always 15.
for feature, fig_height in (('density', 8), ('citric acid', 9), ('residual sugar', 9)):
    plt.figure(figsize=(15, fig_height))
    sns.boxplot(data=total_wine, x=feature, y='quality_category')
    plt.show()
from sklearn.preprocessing import StandardScaler
# Pairwise scatter/histogram grid for a hand-picked subset of measurements.
pairplot_columns = ['fixed acidity', 'chlorides', 'total sulfur dioxide', 'density', 'alcohol']
sns.pairplot(total_wine[pairplot_columns])
# List every column name currently in the combined table.
total_wine.columns
# Numeric columns used for scaling and for both heatmap axes, in display order.
feature_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

# Standardise the selected columns with scikit-learn's StandardScaler.
scaler = StandardScaler()
scaled = scaler.fit_transform(total_wine[feature_columns])
scaled.T

# np.cov treats each row as one variable, hence the transpose.
covariance_matrix = np.cov(scaled.T)
covariance_matrix

# Annotated heatmap of the covariance matrix of the standardised columns.
plt.figure(figsize=(10, 10))
sns.set(font_scale=1.5)
hm = sns.heatmap(
    covariance_matrix,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 12},
    yticklabels=feature_columns,
    xticklabels=feature_columns,
)
# Correlation of every numeric column with quality, strongest positive first.
# The frame also holds the string column 'category' and the categorical
# 'quality_category'; DataFrame.corr() raises on those in pandas >= 2.0, so
# restrict to numeric columns explicitly (works on older pandas as well).
total_wine.select_dtypes(include='number').corr()[['quality']].sort_values(by='quality', ascending=False)
qualityfloat64
-0.3058579060694202 - 1.0
quality
1
alcohol
0.44431852
citric acid
0.08553171718
free sulfur dioxide
0.05546305862
sulphates
0.03848544588
pH
0.01950570371
residual sugar
-0.03698048459
total sulfur dioxide
-0.04138545386
fixed acidity
-0.07674320791
chlorides
-0.2006655004
# Mean quality (with a standard-deviation band) as a function of a measurement.
# sns.relplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty figure behind and its
# size is ignored. The intended 10x5 inch plot area is requested through
# height/aspect instead (width = height * aspect).
# NOTE(review): `ci` is deprecated since seaborn 0.12 in favour of
# errorbar="sd" -- switch once the minimum seaborn version is confirmed.
sns.relplot(x="volatile acidity", y="quality", kind="line", ci="sd",
            data=total_wine, height=5, aspect=2)
#plt.xlim(0,1.2)
plt.show()

# Same view for alcohol, with one line per wine colour.
sns.relplot(x="alcohol", y="quality", kind="line", ci="sd",
            data=total_wine, hue="category", height=5, aspect=2)
plt.show()
# Bar plot and violin plot of selected measurements per quality bucket.
# Every chart gets its own figure and an explicit show(): barplot/violinplot
# draw on the current axes, so back-to-back calls would otherwise pile all six
# charts onto one.
# Each entry is (column to plot, lower y-axis limit or None for the default).
for feature, y_lower in (("density", 0.98), ("alcohol", None), ("volatile acidity", None)):
    for draw_plot in (sns.barplot, sns.violinplot):
        plt.figure()
        draw_plot(data=total_wine, y=feature, x="quality_category")
        if y_lower is not None:
            # Density values sit in a narrow band; start the axis near it.
            plt.ylim(y_lower)
        plt.show()