import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
wine = pd.read_csv('vinos_tintos.csv')
wine.head()
wine.info()
wine.isnull().sum()
# Drop the rows with missing values, since there are so few of them
na_cols = ['residual sugar', 'alcohol', 'fixed acidity']
wine = wine.dropna(subset=na_cols)
wine.isnull().sum()
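# Alternative sketch (not applied here): rather than dropping rows, the missing
# values could be imputed with each column's median, e.g. on a fresh copy:
wine_imputed = pd.read_csv('vinos_tintos.csv')
wine_imputed[na_cols] = wine_imputed[na_cols].fillna(wine_imputed[na_cols].median())
wine_imputed[na_cols].isnull().sum()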
wine[wine.duplicated()]
# Drop the duplicated rows (3 of them)
wine = wine.drop_duplicates()
wine.shape
wine.describe().round(2)
wine.describe(include=object)
sns.countplot(x='country', data=wine)
# Select every row whose country contains 'spa'
spain_data = wine['country'].str.contains('spa', case=False)
spain_data
# If a row contains 'spa', normalize the country to 'Spain'
wine['country'] = np.where(spain_data, 'Spain', wine['country'])
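# Quick check that every 'spa' variant collapsed into a single 'Spain' label
wine['country'].value_counts()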
sns.countplot(x='country', data=wine)
sns.countplot(x='pricing', data=wine)
sns.boxplot(y='success', data=wine)
sns.displot(wine['success'], kde=True)
sns.relplot(x='alcohol', y='success', data=wine)
sns.regplot(x='alcohol', y='success', data=wine)
sns.pairplot(wine)
sns.barplot(x='country', y='alcohol', data=wine)
sns.boxplot(x='country', y='alcohol', data=wine)
corr = wine.corr(numeric_only=True).round(2)  # object columns (country, pricing) are excluded
plt.figure(figsize=(10,8))
sns.heatmap(corr, vmax=0.8, linewidths=0.01, annot=True)
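# The heatmap is easier to read as a ranked list: correlation of each
# numeric feature with 'success'
corr['success'].drop('success').sort_values(ascending=False)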
sns.lmplot(x='alcohol', y='success', data=wine, scatter_kws={'s': 5, 'alpha': 0.3})
alcohol_success_plot = sns.lmplot(x='alcohol', y='success', col='country', data=wine,
                                  scatter_kws={'s': 5, 'alpha': 0.3})
sns.despine()
axes = alcohol_success_plot.axes.flatten()
axes[0].set_title('Spanish Wines')
axes[1].set_title('Italian Wines')
vals = axes[0].get_xticks()
axes[0].set_xticks(vals)
axes[0].set_xticklabels(['{:,.1%}'.format(x) for x in vals])
plt.suptitle('More alcohol, more success, in Italy as in Spain! \n We can predict the success of a wine!',
             fontsize=24, ha='center', va='top', color='#000088', y=1.2)
plt.show()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X = wine[['alcohol']]  # Double brackets: X is usually a matrix of several columns; here it has just one
y = wine['success']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 3)
print('* Training data:')
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print('\n* Test data:')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')
# The chosen algorithm is a linear regression
reg = LinearRegression()
# Fit the model on the training data
reg.fit(X_train, y_train)
# Predicted success for a wine with 12% alcohol (alcohol is stored as a fraction)
reg.predict([[0.12]])
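# The fitted line itself: success ≈ intercept + coefficient * alcohol
print(f'Intercept: {reg.intercept_:.4f}')
print(f'Alcohol coefficient: {reg.coef_[0]:.4f}')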
y_pred = reg.predict(X_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Compute the error metrics (argument order is y_true, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
RMSE = sqrt(mean_squared_error(y_test, y_pred))
print(f'MAE: {MAE:.4f}')
print(f'RMSE: {RMSE:.4f}')
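# To put these errors in context, a naive baseline that always predicts the
# training mean (sketch using sklearn's DummyRegressor)
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)
y_base = baseline.predict(X_test)
print(f'Baseline MAE: {mean_absolute_error(y_test, y_base):.4f}')
print(f'Baseline RMSE: {sqrt(mean_squared_error(y_test, y_base)):.4f}')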
# Model with alcohol and volatile acidity as features
X2 = wine[['alcohol', 'volatile acidity']]
y2 = wine['success']
# Train/test split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state = 3)
# Train the model
reg2 = LinearRegression()
reg2.fit(X2_train, y2_train)
# Predict on the test set
y2_pred = reg2.predict(X2_test)
# Model performance
print(f'MAE_2: {mean_absolute_error(y2_test, y2_pred):.4f}')
print(f'RMSE_2: {sqrt(mean_squared_error(y2_test, y2_pred)):.4f}')
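# R² complements MAE/RMSE: the share of variance in 'success' the model explains
print(f'R²_2: {reg2.score(X2_test, y2_test):.4f}')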
# Create the new encoded variables: a Spain dummy and an ordinal pricing score
wine['is_spain'] = np.where(wine['country'] == 'Spain', 1, 0)
wine['pricing_enc'] = np.where(wine['pricing'] == 'Budget', 1, np.where(wine['pricing'] == 'Medium', 2, 3))  # Budget=1, Medium=2, else 3
wine.head()
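# Alternative sketch (not used below): one-hot encode 'pricing' instead of the
# 1/2/3 mapping, which assumes evenly spaced price tiers
pd.get_dummies(wine['pricing'], prefix='pricing').head()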
X3 = wine.drop(['success', 'pricing', 'country'], axis = 1)
y3 = wine['success']
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state = 3)
reg3 = LinearRegression()
reg3.fit(X3_train, y3_train)
y3_pred = reg3.predict(X3_test)
print(f'MAE_3: {mean_absolute_error(y3_test, y3_pred):.4f}')
print(f'RMSE_3: {sqrt(mean_squared_error(y3_test, y3_pred)):.4f}')
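# Which features drive the linear model? Pair each coefficient with its column
pd.Series(reg3.coef_, index=X3.columns).sort_values()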
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Random Forest regression model
X4 = wine.drop(['success', 'pricing', 'country'], axis = 1)
y4 = wine['success']
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state = 3)
reg4 = RandomForestRegressor(n_estimators=5, max_features=10)
reg4.fit(X4_train, y4_train)
y4_pred = reg4.predict(X4_test)
print(f'MAE_4: {mean_absolute_error(y4_test, y4_pred):.4f}')
print(f'RMSE_4: {sqrt(mean_squared_error(y4_test, y4_pred)):.4f}')
# Grid of hyper-parameter values to try
param_grid = [{'n_estimators': [10,50,100,150,200], 'max_features': [5, 10]}]
# Model selection with 10-fold cross-validation
grid_search_forest = GridSearchCV(reg4, param_grid, cv=10, scoring='neg_mean_absolute_error')
# Fit a model for every parameter combination
grid_search_forest.fit(X4_train, y4_train)
# Best model found
best_forest = grid_search_forest.best_estimator_
best_forest
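# The winning hyper-parameters and the cross-validated score behind them
# (best_score_ is negated because the scoring was neg_mean_absolute_error)
print(grid_search_forest.best_params_)
print(f'Best CV MAE: {-grid_search_forest.best_score_:.4f}')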
# Evaluate the best model (GridSearchCV refits it on the full training set by
# default, so no extra fit is needed)
y4_pred = best_forest.predict(X4_test)
print(f'MAE_4: {mean_absolute_error(y4_test, y4_pred):.4f}')
print(f'RMSE_4: {sqrt(mean_squared_error(y4_test, y4_pred)):.4f}')
bf = best_forest  # already fitted; refitting without a fixed random_state would grow a different forest
sorted_idx = bf.feature_importances_.argsort()
plt.barh(X4.columns[sorted_idx], bf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
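# Impurity-based importances can favor high-cardinality features; permutation
# importance on the test set is a common cross-check (sketch)
from sklearn.inspection import permutation_importance
perm = permutation_importance(bf, X4_test, y4_test, n_repeats=10, random_state=3)
perm_idx = perm.importances_mean.argsort()
plt.figure(figsize=(8, 6))
plt.barh(X4.columns[perm_idx], perm.importances_mean[perm_idx])
plt.xlabel('Permutation Importance (test set)')
plt.show()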