import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
wine = pd.read_csv('vinos_tintos.csv')
wine.head()
wine.info()
wine.isnull().sum()
# Drop the rows with missing values, since there are so few of them
na_cols = ['residual sugar', 'alcohol', 'fixed acidity']
wine = wine.dropna(subset=na_cols)
wine.isnull().sum()
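# Alternative sketch (not applied here): rather than dropping rows, the missing
# values could be imputed with each column's median, e.g. on a fresh copy:
wine_imputed = pd.read_csv('vinos_tintos.csv')
wine_imputed[na_cols] = wine_imputed[na_cols].fillna(wine_imputed[na_cols].median())
wine_imputed[na_cols].isnull().sum()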
wine[wine.duplicated()]
# Drop the duplicated rows (3 of them)
wine = wine.drop_duplicates()
wine.shape
wine.describe().round(2)
wine.describe(include=object)
sns.countplot(x='country', data=wine)
# Select every row whose country contains 'spa'
spain_data = wine['country'].str.contains('spa', case=False)
spain_data
# If a row contains 'spa', normalize the country to 'Spain'
wine['country'] = np.where(spain_data, 'Spain', wine['country'])
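# Quick check that every 'spa' variant collapsed into a single 'Spain' label
wine['country'].value_counts()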
sns.countplot(x='country', data=wine)
sns.countplot(x='pricing', data=wine)
sns.boxplot(y='success', data=wine)
sns.displot(wine['success'], kde=True)
sns.relplot(x='alcohol', y='success', data=wine)
sns.regplot(x='alcohol', y='success', data=wine)
sns.pairplot(wine)
sns.barplot(x='country', y='alcohol', data=wine)
sns.boxplot(x='country', y='alcohol', data=wine)
corr = wine.corr(numeric_only=True).round(2)  # object columns (country, pricing) are excluded
plt.figure(figsize=(10,8))
sns.heatmap(corr, vmax=0.8, linewidths=0.01, annot=True)
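# The heatmap is easier to read as a ranked list: correlation of each
# numeric feature with 'success'
corr['success'].drop('success').sort_values(ascending=False)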
sns.lmplot(x='alcohol', y='success', data=wine, scatter_kws={'s': 5, 'alpha': 0.3})
alcohol_success_plot = sns.lmplot(x='alcohol', y='success', col='country', data=wine,
                                  scatter_kws={'s': 5, 'alpha': 0.3})
sns.despine()
axes = alcohol_success_plot.axes.flatten()
axes[0].set_title('Spanish Wines')
axes[1].set_title('Italian Wines')
vals = axes[0].get_xticks()
axes[0].set_xticks(vals)
axes[0].set_xticklabels(['{:,.1%}'.format(x) for x in vals])
plt.suptitle('More alcohol, more success, in Italy as in Spain! \n We can predict the success of a wine!',
             fontsize=24, ha='center', va='top', color='#000088', y=1.2)
plt.show()
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X = wine[['alcohol']]  # Double brackets: X is usually a matrix of several columns; here it has just one
y = wine['success']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 3)
print('* Training data:')
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print('\n* Test data:')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')
# The chosen algorithm is a linear regression
reg = LinearRegression()
# Fit the model on the training data
reg.fit(X_train, y_train)
# Predicted success for a wine with 12% alcohol (alcohol is stored as a fraction)
reg.predict([[0.12]])
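# The fitted line itself: success ≈ intercept + coefficient * alcohol
print(f'Intercept: {reg.intercept_:.4f}')
print(f'Alcohol coefficient: {reg.coef_[0]:.4f}')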
y_pred = reg.predict(X_test)
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Compute the error metrics (argument order is y_true, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
RMSE = sqrt(mean_squared_error(y_test, y_pred))
print(f'MAE: {MAE:.4f}')
print(f'RMSE: {RMSE:.4f}')
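# To put these errors in context, a naive baseline that always predicts the
# training mean (sketch using sklearn's DummyRegressor)
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy='mean')
baseline.fit(X_train, y_train)
y_base = baseline.predict(X_test)
print(f'Baseline MAE: {mean_absolute_error(y_test, y_base):.4f}')
print(f'Baseline RMSE: {sqrt(mean_squared_error(y_test, y_base)):.4f}')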
# Model with alcohol and volatile acidity as features
X2 = wine[['alcohol', 'volatile acidity']]
y2 = wine['success']
# Train/test split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state = 3)
# Train the model
reg2 = LinearRegression()
reg2.fit(X2_train, y2_train)
# Predict on the test set
y2_pred = reg2.predict(X2_test)
# Model performance
print(f'MAE_2: {mean_absolute_error(y2_test, y2_pred):.4f}')
print(f'RMSE_2: {sqrt(mean_squared_error(y2_test, y2_pred)):.4f}')
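# R² complements MAE/RMSE: the share of variance in 'success' the model explains
print(f'R²_2: {reg2.score(X2_test, y2_test):.4f}')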
# Create the new encoded variables: a Spain dummy and an ordinal pricing score
wine['is_spain'] = np.where(wine['country'] == 'Spain', 1, 0)
wine['pricing_enc'] = np.where(wine['pricing'] == 'Budget', 1, np.where(wine['pricing'] == 'Medium', 2, 3))  # Budget=1, Medium=2, else 3
wine.head()
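# Alternative sketch (not used below): one-hot encode 'pricing' instead of the
# 1/2/3 mapping, which assumes evenly spaced price tiers
pd.get_dummies(wine['pricing'], prefix='pricing').head()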
X3 = wine.drop(['success', 'pricing', 'country'], axis = 1)
y3 = wine['success']
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state = 3)
reg3 = LinearRegression()
reg3.fit(X3_train, y3_train)
y3_pred = reg3.predict(X3_test)
print(f'MAE_3: {mean_absolute_error(y3_test, y3_pred):.4f}')
print(f'RMSE_3: {sqrt(mean_squared_error(y3_test, y3_pred)):.4f}')
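# Which features drive the linear model? Pair each coefficient with its column
pd.Series(reg3.coef_, index=X3.columns).sort_values()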
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Random Forest regression model
X4 = wine.drop(['success', 'pricing', 'country'], axis = 1)
y4 = wine['success']
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state = 3)
reg4 = RandomForestRegressor(n_estimators=5, max_features=10)
reg4.fit(X4_train, y4_train)
y4_pred = reg4.predict(X4_test)
print(f'MAE_4: {mean_absolute_error(y4_test, y4_pred):.4f}')
print(f'RMSE_4: {sqrt(mean_squared_error(y4_test, y4_pred)):.4f}')
# Grid of hyper-parameter values to try
param_grid = [{'n_estimators': [10,50,100,150,200], 'max_features': [5, 10]}]
# Model selection with 10-fold cross-validation
grid_search_forest = GridSearchCV(reg4, param_grid, cv=10, scoring='neg_mean_absolute_error')
# Fit a model for every parameter combination
grid_search_forest.fit(X4_train, y4_train)
# Best model found
best_forest = grid_search_forest.best_estimator_
best_forest
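# The winning hyper-parameters and the cross-validated score behind them
# (best_score_ is negated because the scoring was neg_mean_absolute_error)
print(grid_search_forest.best_params_)
print(f'Best CV MAE: {-grid_search_forest.best_score_:.4f}')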
# Evaluate the best model (GridSearchCV refits it on the full training set by
# default, so no extra fit is needed)
y4_pred = best_forest.predict(X4_test)
print(f'MAE_4: {mean_absolute_error(y4_test, y4_pred):.4f}')
print(f'RMSE_4: {sqrt(mean_squared_error(y4_test, y4_pred)):.4f}')
bf = best_forest  # already fitted; refitting without a fixed random_state would grow a different forest
sorted_idx = bf.feature_importances_.argsort()
plt.barh(X4.columns[sorted_idx], bf.feature_importances_[sorted_idx])
plt.xlabel("Random Forest Feature Importance")
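# Impurity-based importances can favor high-cardinality features; permutation
# importance on the test set is a common cross-check (sketch)
from sklearn.inspection import permutation_importance
perm = permutation_importance(bf, X4_test, y4_test, n_repeats=10, random_state=3)
perm_idx = perm.importances_mean.argsort()
plt.figure(figsize=(8, 6))
plt.barh(X4.columns[perm_idx], perm.importances_mean[perm_idx])
plt.xlabel('Permutation Importance (test set)')
plt.show()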