import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from scipy.stats import probplot
from scipy.stats import pearsonr
# Load the wine-quality dataset; the path is relative to the working directory.
df_raw = pd.read_csv("../data/winequalityN.csv")

# Quick structural overview. These bare expressions only render their value
# in an interactive session (notebook / REPL); in a plain script they are
# evaluated and discarded.
df_raw.columns
df_raw.head()
df_raw.dtypes
df_raw.describe()

# Missing values per column before imputation.
df_raw.isnull().sum()

# Fill gaps by linear interpolation, restricted to the numeric columns.
# Non-numeric columns cannot be interpolated linearly, and recent pandas
# versions warn (or raise) when asked to interpolate object columns.
numeric_columns = df_raw.select_dtypes(include=np.number).columns
df_raw[numeric_columns] = df_raw[numeric_columns].interpolate()

# Missing values per column after imputation.
df_raw.isnull().sum()

# Default figure size for every seaborn/matplotlib figure created below.
sns.set(rc={'figure.figsize': (14, 8)})

sns.countplot(x='quality', data=df_raw).set(title='Wine quality distribution')
plt.show()

sns.pairplot(df_raw)
plt.show()

# Correlate numeric columns only: since pandas 2.0, DataFrame.corr raises on
# non-numeric columns instead of silently skipping them.
# NOTE(review): presumably 'type' is a text column -- confirm against the CSV.
sns.heatmap(df_raw.select_dtypes(include=np.number).corr(), annot=True, fmt='.2f', linewidths=2)
plt.show()
def plot_feature_distribution(df: pd.DataFrame, feature: str) -> None:
    """Plot the distribution of one column and print its skewness.

    Draws a density histogram with a KDE overlay next to a probability plot
    (``scipy.stats.probplot`` with its default distribution), shows the
    figure, then prints the sample skewness of the column.

    Args:
        df: Frame that holds the column to inspect.
        feature: Name of a numeric column of ``df``.
    """
    # Drop missing values once so the histogram, the probability plot and the
    # skewness all describe the same sample; with the default nan_policy a
    # single NaN would make skew() return nan.
    values = df[feature].dropna()

    _, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Histogram
    sns.histplot(values, kde=True, stat='density', linewidth=0, ax=axes[0]).set(title=f'{feature} distribution')

    # Probability plot
    probplot(values, plot=axes[1])

    plt.show()
    print(f'{feature} skewness = {skew(values)}')
# Every numeric column gets its own distribution figure and skewness line.
numeric_features = list(df_raw.select_dtypes(include=np.number).columns)
for feature in numeric_features:
    plot_feature_distribution(df_raw, feature)
def get_correlation(feature_1: str, feature_2: str, df: pd.DataFrame) -> None:
    """Print the Pearson correlation between two columns and its p-value.

    Rows where either column is missing are left out of the computation,
    because ``pearsonr`` cannot produce a meaningful result from NaN input.

    Args:
        feature_1: Name of the first numeric column of ``df``.
        feature_2: Name of the second numeric column of ``df``.
        df: Frame that holds both columns.
    """
    first, second = df[feature_1], df[feature_2]

    # Pairwise deletion: keep only rows where both values are present.
    complete = first.notna() & second.notna()
    pearson_corr, p_value = pearsonr(first[complete], second[complete])

    print(f"Correlation between {feature_1} and {feature_2} is {pearson_corr}")
    print(f"P-value of this correlation is {p_value}")
# Alcohol content per quality score (outlier markers hidden), followed by the
# Pearson correlation between the same two columns.
alcohol_by_quality = sns.boxplot(data=df_raw, x='quality', y='alcohol', showfliers=False)
alcohol_by_quality.set(title="Alcohol vs quality")
plt.show()

get_correlation('quality', 'alcohol', df_raw)
def convert_quality_numerical_to_categorical(value: int) -> str:
    """Map a numeric quality score to a coarse quality label.

    Args:
        value: Numeric quality score.

    Returns:
        ``'low'`` for scores up to and including 5, ``'medium'`` for scores
        above 5 up to and including 7, and ``'high'`` for everything else.
        A value that compares false against both thresholds (such as NaN)
        therefore also ends up as ``'high'``.
    """
    if value <= 5:
        return 'low'
    # Reaching this point already implies value > 5 (or an unordered value).
    if value <= 7:
        return 'medium'
    return 'high'
# Work on a copy: a plain assignment would only alias df_raw, so adding the
# label column below would silently modify the raw frame as well.
df_wine = df_raw.copy()

# Derive the categorical label from the numeric score. 'quality' itself is
# kept because the per-group summary below still reports it.
df_wine['quality_label'] = df_wine['quality'].apply(convert_quality_numerical_to_categorical)
df_wine.head()
print(df_wine['quality_label'].value_counts())

# Descriptive statistics of a few attributes, one table per quality group,
# rounded to two decimals.
subset_attr = ['alcohol', 'density', 'volatile acidity', 'quality']
low, medium, high = (
    df_wine.loc[df_wine['quality_label'] == label, subset_attr].describe().round(2)
    for label in ('low', 'medium', 'high')
)

# Side-by-side comparison table; as a bare expression it is only displayed in
# an interactive session.
pd.concat([low, medium, high], axis=1, keys=['Low Quality Wine', 'Medium Quality Wine', 'High Quality Wine'])
# DataFrame.hist draws one histogram per numeric column and returns the grid
# of Axes (not a Figure), hence the name.
hist_axes = df_wine.hist(
    bins=15,
    color='fuchsia',
    edgecolor='darkmagenta',
    linewidth=1.0,
    xlabelsize=10,
    ylabelsize=10,
    xrot=45,
    yrot=0,
    figsize=(10, 9),
    grid=False,
)

# Figure-level title: plt.title would only retitle the last subplot and
# overwrite the column name shown there.
plt.suptitle('Numeric features distribution')

# rect is given in normalized figure coordinates, so it must stay inside the
# unit square or the subplots are laid out beyond the canvas; the top margin
# leaves room for the figure title.
plt.tight_layout(rect=(0, 0, 1, 0.96))
plt.show()
fig, ax = plt.subplots(1, 1, figsize=(14, 8))

# Correlate numeric columns only: 'quality_label' holds strings, and since
# pandas 2.0 DataFrame.corr raises on non-numeric columns instead of silently
# skipping them.
numeric_corr = df_wine.select_dtypes(include=np.number).corr()
hm = sns.heatmap(numeric_corr, ax=ax, cmap="bwr", annot=True, fmt='.2f', linewidths=0.5)

# Shrink the axes area slightly so the figure title does not overlap the map.
fig.subplots_adjust(top=0.93)
fig.suptitle('Wine Attributes and their Correlation Heatmap', fontsize=14, fontweight='bold')
plt.show()
# Number of wines per quality score, split by the 'type' column.
fig = plt.figure(figsize=(16, 8))
sns.countplot(data=df_wine, x="quality", hue="type").set(title="Type of wine frequency distribution")
plt.show()
# 3-D scatter: residual sugar (x) against free (y) and total (z) sulfur dioxide.
fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot(1, 1, 1, projection='3d')

xscale, yscale, zscale = (
    df_wine[column]
    for column in ('residual sugar', 'free sulfur dioxide', 'total sulfur dioxide')
)
ax.scatter(xscale, yscale, zscale, s=50, alpha=0.6, edgecolors='w')

ax.set(
    xlabel='Residual Sugar',
    ylabel='free sulfur dioxide',
    zlabel='Total sulfur dioxide',
)
plt.title("Correlation between free/total sulfur dioxide and residual sugar")
plt.show()
# Bubble chart: fixed acidity (x) against free sulfur dioxide (y), with the
# marker area scaled by total sulfur dioxide.
fig = plt.figure(figsize=(16, 12))

bubble_area = df_wine['total sulfur dioxide'] * 2
plt.scatter(
    df_wine['fixed acidity'],
    df_wine['free sulfur dioxide'],
    s=bubble_area,
    alpha=0.4,
    edgecolors='w',
)

plt.xlabel('Fixed Acidity')
plt.ylabel('free sulfur dioxide')
plt.title('Wine free sulfur dioxide Content - Fixed Acidity - total sulfur dioxide', y=1.05)
plt.show()