# System
import warnings
# Data manipulation
import pandas as pd
import numpy as np
# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns
# Silence every warning for the rest of the session
warnings.filterwarnings(action='ignore')
# Use a white-grid Seaborn theme with a serif font for all figures
sns.set_theme(font='serif', style='whitegrid')
# Load the semicolon-separated white wine dataset
data = pd.read_csv('../datasets/raw/white_wine.csv', sep=';')
# Preview the first 5 rows
data.head(5)
# Summary statistics, one row per column
data.describe().transpose()
# Column dtypes and non-null counts
data.info()
# Missing values per column
data.isna().sum()
# Number of fully duplicated rows before cleaning
data.duplicated().sum()
# Keep only the first occurrence of each duplicated row
data = data.drop_duplicates(keep='first')
# Number of duplicated rows remaining after cleaning
data.duplicated().sum()
# Open a 10 x 6 inch figure for the count plot
plt.figure(figsize=(10, 6))
# One bar per quality grade, showing how many wines received it
ax = sns.countplot(x='quality', data=data, color='lightblue')
# Title and axis labels
ax.set_title('Distribution of the Quality Variable')
ax.set_xlabel('Quality')
ax.set_ylabel('Count')
# Height of the tallest bar
max_height = max(bar.get_height() for bar in ax.patches)
# Single pass over the bars: highlight the tallest one(s) and write the
# count just above the top edge of every bar
for bar in ax.patches:
    bar_height = bar.get_height()
    if bar_height == max_height:
        bar.set_color('salmon')
    ax.annotate(
        f'{int(bar_height)}',
        (bar.get_x() + bar.get_width() / 2., bar_height),
        ha='center', va='center', xytext=(0, 5), textcoords='offset points',
    )
# Dashed, semi-transparent grid lines
ax.grid(linestyle='--', alpha=0.6)
# Fit everything inside the figure and render it
plt.tight_layout()
plt.show()
# Open a 10 x 6 inch figure for the scatter plot
plt.figure(figsize=(10, 6))
# One point per wine: alcohol content on the x-axis, quality on the y-axis
scatter_ax = sns.scatterplot(x='alcohol', y='quality', data=data, color='salmon')
# Title and axis labels
scatter_ax.set_title('Quality vs Alcohol Content')
scatter_ax.set_xlabel('Alcohol Content')
scatter_ax.set_ylabel('Quality')
# Dashed, semi-transparent grid lines
scatter_ax.grid(linestyle='--', alpha=0.6)
# Fit everything inside the figure and render it
plt.tight_layout()
plt.show()
# Collect the quality grades in ascending order so the panels read from the
# lowest grade to the highest (unique() alone returns them in order of first
# appearance in the data)
qualities = np.sort(data['quality'].unique())
# Size the grid from the number of grades instead of assuming at most nine:
# three panels per row and five inches of height per row, which gives the
# 15 x 15 inch figure whenever three rows are needed
n_cols = 3
n_rows = max(1, -(-len(qualities) // n_cols))  # ceiling division, at least one row
# squeeze=False keeps `axes` two-dimensional even for a single row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
axes = axes.flatten()
# Plot the distribution of alcohol content for each quality
for ax, quality in zip(axes, qualities):
    sns.histplot(data[data['quality'] == quality], x='alcohol', kde=True, bins=30, ax=ax, color='lightblue')
    ax.set_title(f'Quality: {quality}')
    ax.set_xlabel('Alcohol Content')
    ax.set_ylabel('Count')
    ax.grid(linestyle='--', alpha=0.6)
# Remove any empty subplots; slicing also works when there are no grades at all
for unused_ax in axes[len(qualities):]:
    fig.delaxes(unused_ax)
# Adjust layout
plt.tight_layout()
plt.show()
# Open a 10 x 10 inch figure for the heatmap
plt.figure(figsize=(10, 10))
# Pairwise correlation between the columns of the dataset
corr_matrix = data.corr()
# Annotated heatmap with two-decimal labels and thin lines between cells
heatmap_ax = sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
# Title
heatmap_ax.set_title('Correlation Heatmap')
# Fit everything inside the figure and render it
plt.tight_layout()
plt.show()