import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
# Load the dataset from 'vinos_tintos.csv' into a DataFrame.
wine = pd.read_csv('vinos_tintos.csv')
# Preview the first rows.
wine.head()
fixed acidityfloat64
volatile acidityfloat64
0
7.4
0.7
1
7.8
0.88
2
7.8
0.76
3
11.2
0.28
4
7.4
0.7
# Print the column names, non-null counts and dtypes of the DataFrame.
wine.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 fixed acidity 1594 non-null float64
1 volatile acidity 1599 non-null float64
2 citric acid 1599 non-null float64
3 residual sugar 1594 non-null float64
4 chlorides 1599 non-null float64
5 free sulfur dioxide 1599 non-null float64
6 total sulfur dioxide 1599 non-null float64
7 density 1599 non-null float64
8 pH 1599 non-null float64
9 sulphates 1599 non-null float64
10 alcohol 1594 non-null float64
11 success 1599 non-null float64
12 country 1599 non-null object
13 pricing 1599 non-null object
dtypes: float64(12), object(2)
memory usage: 175.0+ KB
# Count the missing values per column before cleaning.
wine.isnull().sum()
# Rows with missing values are dropped rather than imputed because there are
# so few of them.
na_cols = ['residual sugar', 'alcohol', 'fixed acidity']
# A single dropna over the listed columns removes every row that is missing
# any of them (equivalent to filtering with notna() column by column).
wine = wine.dropna(subset=na_cols)
# Re-count the missing values after the drop.
wine.isnull().sum()
# Show the rows that duplicate an earlier row.
wine[wine.duplicated()]
fixed acidityfloat64
volatile acidityfloat64
736
7.7
0.965
902
7.4
0.635
1481
8.2
0.28
# Discard the duplicated observations (3), keeping the first occurrence.
wine = wine.drop_duplicates(keep='first')
# Dimensions of the cleaned dataset.
wine.shape
# Summary statistics of the numeric columns, rounded to two decimals.
numeric_summary = wine.describe()
numeric_summary.round(2)
fixed acidityfloat64
volatile acidityfloat64
count
1581.0
1581.0
mean
8.32
0.53
std
1.75
0.18
min
4.6
0.12
25%
7.1
0.39
50%
7.9
0.52
75%
9.2
0.64
max
15.9
1.58
# Summary (count, unique, top, freq) of the object-typed columns.
wine.describe(include = object)
countryobject
pricingobject
count
1581
1581
unique
4
3
top
Italy
Budget
freq
954
653
# Number of wines per country label before the labels are normalised.
sns.countplot(data=wine, x='country')
# Flag every row whose country contains 'spa' (case-insensitive).
spain_data = wine['country'].str.contains('spa', case=False)
spain_data
# Flagged rows get the single label 'Spain'; all other rows keep their value.
wine['country'] = np.where(spain_data, 'Spain', wine['country'])
# Same count plot after normalising the labels.
sns.countplot(data=wine, x='country')
# Number of wines per pricing tier.
sns.countplot(data=wine, x='pricing')
# Distribution of the target variable: box plot, then histogram with a KDE.
sns.boxplot(data=wine, y='success')
sns.displot(data=wine, x='success', kde=True)
# Relationship between alcohol and success: scatter, then scatter with a fit.
sns.relplot(data=wine, x='alcohol', y='success', kind='scatter')
sns.regplot(data=wine, x='alcohol', y='success')
# Pairwise plots across the dataset.
sns.pairplot(data=wine)
# Alcohol by country: bar plot, then box plot.
sns.barplot(data=wine, x='country', y='alcohol')
sns.boxplot(data=wine, x='country', y='alcohol')
# Pairwise correlations between the numeric columns, rounded to two decimals.
# The text columns ('country', 'pricing') are excluded explicitly because
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
corr = wine.select_dtypes(include='number').corr().round(2)
plt.figure(figsize=(10, 8))
# Annotated heatmap of the correlation matrix; the colour scale is capped at 0.8.
sns.heatmap(corr, vmax=0.8, linewidths=0.01, annot=True)
# Linear fit of success on alcohol over the whole dataset.
sns.lmplot(x='alcohol', y='success', data=wine,
           scatter_kws={'s': 5, 'alpha': 0.3})
# Same fit, with one facet per country.
alcohol_success_plot = sns.lmplot(x='alcohol', y='success', col='country', data=wine,
                                  scatter_kws={'s': 5, 'alpha': 0.3})
sns.despine()
axes = alcohol_success_plot.axes.flatten()
# Title each facet by the country it actually shows rather than by position:
# without col_order the facet order follows the data, so positional titles
# could end up on the wrong panel.
facet_titles = {'Spain': 'Vinos Españoles', 'Italy': 'Vinos Italianos'}
for country, title in facet_titles.items():
    facet_ax = alcohol_success_plot.axes_dict.get(country)
    if facet_ax is not None:
        facet_ax.set_title(title)
# Relabel the x ticks of the first facet using percent formatting.
vals = axes[0].get_xticks()
axes[0].set_xticks(list(vals))
axes[0].set_xticklabels(['{:,.1%}'.format(x) for x in vals])
plt.suptitle('Más alcohol, más éxito, en Italia como en España! \n ¡Podemos predecir el éxito de un vino!',
             fontsize=24, ha='center', va='top', color='#000088', y=1.2)
plt.show()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Single-feature design matrix: the double brackets keep X two-dimensional,
# which is the shape used when there are several feature columns.
X = wine[['alcohol']]
y = wine['success']
# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)
# Report the shape of each split.
print('* Datos de entrenamiento:')
for label, part in (('X_train', X_train), ('y_train', y_train)):
    print(f'{label}: {part.shape}')
print('\n* Datos de testeo:')
for label, part in (('X_test', X_test), ('y_test', y_test)):
    print(f'{label}: {part.shape}')
* Datos de entrenamiento:
X_train: (1264, 1)
y_train: (1264,)
* Datos de testeo:
X_test: (317, 1)
y_test: (317,)
# The chosen algorithm is a linear regression.
reg = LinearRegression()
# Fit the model on the training split.
reg.fit(X_train, y_train)
# Predict success for an alcohol value of 0.12. The input is a one-row
# DataFrame with the training column name, so scikit-learn does not warn
# that X lacks the feature names the model was fitted with.
reg.predict(pd.DataFrame({'alcohol': [0.12]}))
/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
warnings.warn(
# Predictions on the held-out test split.
y_pred = reg.predict(X_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Error metrics. scikit-learn's signature is (y_true, y_pred); these two
# metrics are symmetric, so the values are unchanged, but the documented
# order keeps the code correct if an asymmetric metric is added later.
MAE = mean_absolute_error(y_test, y_pred)
RMSE = sqrt(mean_squared_error(y_test, y_pred))
# Built-in round() works whether the metric is a NumPy scalar or a plain float.
print(f'MAE: {round(MAE, 4)}')
print(f'RMSE: {round(RMSE, 4)}')
MAE: 9.9299
RMSE: 12.3828
# Model using alcohol and volatile acidity as features.
X2 = wine[['alcohol', 'volatile acidity']]
y2 = wine['success']
# Train/test split (20% test, fixed seed).
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=3)
# Fit the model on the training split.
reg2 = LinearRegression()
reg2.fit(X2_train, y2_train)
# Predictions on the test split.
y2_pred = reg2.predict(X2_test)
# Model performance. Arguments follow scikit-learn's (y_true, y_pred) order,
# and built-in round() works for both NumPy scalars and plain floats.
print(f'MAE_2: {round(mean_absolute_error(y2_test, y2_pred), 4)}')
print(f'RMSE_2: {round(sqrt(mean_squared_error(y2_test, y2_pred)), 4)}')
MAE_2: 9.7424
RMSE_2: 12.1189
# Create the encoded variables.
# is_spain: 1 when the country is 'Spain', 0 otherwise.
wine['is_spain'] = (wine['country'] == 'Spain').astype(int)
# pricing_enc: 'Budget' -> 1, 'Medium' -> 2, any other value -> 3.
pricing_conditions = [wine['pricing'] == 'Budget', wine['pricing'] == 'Medium']
wine['pricing_enc'] = np.select(pricing_conditions, [1, 2], default=3)
# Preview the first rows with the new columns.
wine.head()
fixed acidityfloat64
volatile acidityfloat64
0
7.4
0.7
1
7.8
0.88
2
7.8
0.76
3
11.2
0.28
4
7.4
0.7
# Model using every column except the target and the raw text columns.
X3 = wine.drop(['success', 'pricing', 'country'], axis=1)
y3 = wine['success']
# Train/test split (20% test, fixed seed).
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=3)
# Fit the model and predict on the test split.
reg3 = LinearRegression()
reg3.fit(X3_train, y3_train)
y3_pred = reg3.predict(X3_test)
# Model performance. Arguments follow scikit-learn's (y_true, y_pred) order,
# and built-in round() works for both NumPy scalars and plain floats.
print(f'MAE_3: {round(mean_absolute_error(y3_test, y3_pred), 4)}')
print(f'RMSE_3: {round(sqrt(mean_squared_error(y3_test, y3_pred)), 4)}')
MAE_3: 9.2191
RMSE_3: 11.784
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Random forest model on every column except the target and the raw text columns.
X4 = wine.drop(['success', 'pricing', 'country'], axis=1)
y4 = wine['success']
# Train/test split (20% test, fixed seed).
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state=3)
# random_state is fixed so the forest, and therefore the reported metrics,
# are reproducible between runs; the seed matches the one used for the split.
reg4 = RandomForestRegressor(n_estimators=5, max_features=10, random_state=3)
reg4.fit(X4_train, y4_train)
y4_pred = reg4.predict(X4_test)
# Model performance. Arguments follow scikit-learn's (y_true, y_pred) order,
# and built-in round() works for both NumPy scalars and plain floats.
print(f'MAE_4: {round(mean_absolute_error(y4_test, y4_pred), 4)}')
print(f'RMSE_4: {round(sqrt(mean_squared_error(y4_test, y4_pred)), 4)}')
MAE_4: 9.503
RMSE_4: 12.1526
# Grid of hyper-parameter values to iterate over.
param_grid = [{'n_estimators': [10, 50, 100, 150, 200], 'max_features': [5, 10]}]
# Model selection with 10-fold cross-validation, scored by negative MAE.
grid_search_forest = GridSearchCV(reg4, param_grid, cv=10, scoring='neg_mean_absolute_error')
# Fit every candidate model.
grid_search_forest.fit(X4_train, y4_train)
# Best model found. With GridSearchCV's default refit=True it has already
# been refitted on the whole training split, so no extra fit() is needed.
best_forest = grid_search_forest.best_estimator_
best_forest
# Evaluate the best model on the test split.
y4_pred = best_forest.predict(X4_test)
# Arguments follow scikit-learn's (y_true, y_pred) order, and built-in
# round() works for both NumPy scalars and plain floats.
print(f'MAE_4: {round(mean_absolute_error(y4_test, y4_pred), 4)}')
print(f'RMSE_4: {round(sqrt(mean_squared_error(y4_test, y4_pred)), 4)}')
MAE_4: 8.9082
RMSE_4: 11.4242
# Reuse the already-fitted best forest. Refitting here would be redundant
# and, without a fixed seed, could plot importances from a different random
# fit than the model that was evaluated.
bf = best_forest
# Sort the features by importance (ascending) for the horizontal bar chart.
importances = bf.feature_importances_
sorted_idx = importances.argsort()
plt.barh(X4.columns[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")