import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Locations of the two semicolon-separated wine-quality CSV files.
url_wine_red = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
url_wine_white = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'

# Read each file into its own DataFrame.
red, white = (
    pd.read_csv(csv_url, sep=";") for csv_url in (url_wine_red, url_wine_white)
)

# Peek at the first five rows of each table (bare expressions: they produce
# no output when the file is executed as a plain script).
red.head()
white.head()
# Tag every row with its wine type so the two tables stay distinguishable
# once they are stacked into a single frame.
red['category'] = 'red'
white['category'] = 'white'

# Stack red on top of white and renumber the index from zero.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0 (where the old call raises AttributeError).
total_wine = pd.concat([red, white], ignore_index=True)

# Inspect the combined frame: contents, shape, column dtypes, summary stats.
# (Only info() prints by itself; the bare expressions produce no output when
# the file is executed as a plain script.)
total_wine
total_wine.shape
total_wine.info()
total_wine.describe()
# Summarise the quality score: descriptive statistics and per-score counts.
quality = total_wine['quality']
quality.describe()
quality.value_counts()

# Count plot with one bar per distinct quality score.
plt.figure(figsize=(15, 9))
sns.countplot(data=total_wine, x='quality')
# plt.show has to be *called*: the bare attribute reference used previously
# was a no-op, so the figure was never displayed at this point.
plt.show()
# Bucket the numeric quality score into three bands:
#   quality < 5 -> "poor",  quality > 7 -> "high",  anything else -> "medium".
# np.select evaluates the conditions column-wise instead of calling a Python
# lambda once per row; the first matching condition wins.
total_wine['quality_category'] = np.select(
    [total_wine['quality'] < 5, total_wine['quality'] > 7],
    ["poor", "high"],
    default="medium",
)
total_wine.tail()

# Store the band as a pandas categorical column.
total_wine['quality_category'] = total_wine['quality_category'].astype('category')
total_wine.info()

# Count plot with one bar per quality band.
plt.figure(figsize=(15, 9))
sns.countplot(data=total_wine, x='quality_category')
# plt.show has to be *called*: the bare attribute reference used previously
# was a no-op, so the figure was never displayed at this point.
plt.show()
# Each scatter plot gets its own figure and its own plt.show(); without a
# fresh figure, sns.scatterplot draws onto whatever axes are current, so the
# two plots (and anything drawn earlier and not yet shown) end up overlapping.

# Residual sugar vs. alcohol, coloured by quality band.
plt.figure()
sns.scatterplot(data=total_wine, x='residual sugar', y='alcohol', hue="quality_category")
plt.show()

# Fixed acidity vs. volatile acidity, coloured by quality band.
plt.figure()
sns.scatterplot(data=total_wine, x='fixed acidity', y='volatile acidity', hue="quality_category")
plt.show()
# Box plots of three measurements (x-axis) against quality_category (y-axis).
# Each entry pairs the measurement column with the size of its figure.
for measurement, figure_size in (
    ('density', (15, 8)),
    ('citric acid', (15, 9)),
    ('residual sugar', (15, 9)),
):
    plt.figure(figsize=figure_size)
    sns.boxplot(data=total_wine, x=measurement, y='quality_category')
    plt.show()
from sklearn.preprocessing import StandardScaler
# Pairwise plots for a subset of five measurement columns.
pairplot_columns = [
    'fixed acidity',
    'chlorides',
    'total sulfur dioxide',
    'density',
    'alcohol',
]
sns.pairplot(total_wine[pairplot_columns])

# Column index of the combined frame (bare expression: no output when the
# file is executed as a plain script).
total_wine.columns
# Columns fed to the scaler: the eleven measurements plus the quality score.
scaled_columns = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

# Fit a StandardScaler on those columns and transform them in one step.
scaler = StandardScaler()
scaled = scaler.fit_transform(total_wine[scaled_columns])
scaled.T

# Covariance between the scaled columns; rowvar=False tells np.cov that each
# column (not each row) is one variable, which is equivalent to passing the
# transposed array.
covariance_matrix = np.cov(scaled, rowvar=False)
covariance_matrix
# Tick labels shared by both heatmap axes, listed in the same order as the
# columns that were scaled to build covariance_matrix.
heatmap_labels = [
    'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
    'pH', 'sulphates', 'alcohol', 'quality',
]

plt.figure(figsize=(10, 10))
# Applies seaborn's default theme with a larger font scale; this is a global
# setting, so it also affects every plot drawn afterwards.
sns.set(font_scale=1.5)

# Annotated square-cell heatmap of the covariance matrix, two decimals per cell.
hm = sns.heatmap(
    covariance_matrix,
    xticklabels=heatmap_labels,
    yticklabels=heatmap_labels,
    annot=True,
    fmt='.2f',
    annot_kws={'size': 12},
    square=True,
    cbar=True,
)
# Correlation of every numeric column with quality, largest value first.
# Numeric columns are selected explicitly because total_wine also holds the
# string 'category' column: pandas >= 2.0 raises on it inside corr() instead
# of silently dropping it as older versions did.
total_wine.select_dtypes(include='number').corr()[['quality']].sort_values(by='quality', ascending=False)
# sns.relplot is a figure-level function: it builds its own figure, so a
# preceding plt.figure(figsize=(10, 5)) only left an empty extra figure behind
# and did not size the plot.  The 10x5 inch size is requested through
# height=5 / aspect=2 instead (width = height * aspect).
# NOTE(review): newer seaborn releases spell ci="sd" as errorbar="sd" --
# confirm against the installed seaborn version before switching.

# Mean quality along volatile acidity, with a standard-deviation band.
sns.relplot(
    data=total_wine,
    x="volatile acidity",
    y="quality",
    kind="line",
    ci="sd",
    height=5,
    aspect=2,
)
# Optional x-axis restriction left disabled by the author: plt.xlim(0, 1.2)
plt.show()

# Mean quality along alcohol, one line per wine category.
sns.relplot(
    data=total_wine,
    x="alcohol",
    y="quality",
    kind="line",
    ci="sd",
    hue="category",
    height=5,
    aspect=2,
)
plt.show()
# For each measurement draw a bar plot and a violin plot against the quality
# band.  Every plot gets its own figure and its own plt.show(); previously all
# six were drawn onto the same axes and overlapped each other.
for measurement in ("density", "alcohol", "volatile acidity"):
    for draw_plot in (sns.barplot, sns.violinplot):
        plt.figure()
        draw_plot(data=total_wine, x="quality_category", y=measurement)
        if measurement == "density":
            # Start the y-axis at 0.98 (presumably because density values sit
            # in a narrow range and would look identical on a zero-based axis).
            plt.ylim(bottom=0.98)
        plt.show()