Big Data aplicada a los negocios

from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Read the wine dataset from the working directory and preview the first rows.
wine = pd.read_csv('vinos_tintos.csv')
wine.head()

fixed acidityfloat64

volatile acidityfloat64

7.4

0.7

7.8

0.88

7.8

0.76

11.2

0.28

7.4

0.7

# Print the frame's column names, non-null counts, dtypes and memory usage.
wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1594 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1594 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1594 non-null   float64
 11  success               1599 non-null   float64
 12  country               1599 non-null   object 
 13  pricing               1599 non-null   object 
dtypes: float64(12), object(2)
memory usage: 175.0+ KB

# Count missing values per column before cleaning.
wine.isnull().sum()

# Only a handful of rows have gaps in these columns, so the rows are dropped
# rather than imputed. dropna(subset=...) removes every row that is missing a
# value in any of the listed columns in a single pass.
na_cols = ['residual sugar', 'alcohol', 'fixed acidity']
wine = wine.dropna(subset=na_cols)

# Re-count missing values to confirm the cleaning.
wine.isnull().sum()

# Show the rows that exactly repeat an earlier row.
wine[wine.duplicated()]

fixed acidityfloat64

volatile acidityfloat64

736

7.7

0.965

902

7.4

0.635

1481

8.2

0.28

# Discard the duplicated observations (3), keeping the first occurrence of
# each, then report the resulting (rows, columns) shape.
wine = wine.drop_duplicates(keep='first')
wine.shape

# Descriptive statistics (count, mean, std, quartiles, min/max), rounded to
# two decimals.
wine.describe().round(2)

fixed acidityfloat64

volatile acidityfloat64

count

1581.0

mean

8.32

0.53

std

1.75

0.18

min

4.6

0.12

25%

7.1

0.39

50%

7.9

0.52

75%

9.2

0.64

max

15.9

1.58

# Summary (count, unique, top, freq) restricted to the object-dtype columns.
wine.describe(include = object)

countryobject

pricingobject

count

1581

unique

top

Italy

Budget

freq

954

653

# Row counts per country label, before the labels are cleaned.
sns.countplot(data=wine, x='country')

# Boolean mask of the rows whose country label contains 'spa',
# ignoring upper/lower case.
spain_data = wine['country'].str.contains('spa', case=False)
spain_data

# Wherever the mask is true, replace the label with 'Spain';
# every other row keeps its current label.
wine['country'] = np.where(spain_data, 'Spain', wine['country'])

# Same chart again, now with the unified labels.
sns.countplot(data=wine, x='country')

# Row counts per pricing category.
sns.countplot(x='pricing' , data=wine)

# Box plot of the 'success' column.
sns.boxplot(y='success', data=wine)

# Distribution of 'success' with a kernel density estimate overlaid.
sns.displot(wine['success'], kde=True)

# Scatter plot of success against alcohol.
sns.relplot(x='alcohol', y='success', data=wine)

# Same relationship with a fitted regression line.
sns.regplot(x='alcohol', y='success', data=wine)

# Grid of pairwise plots across the frame's variables.
sns.pairplot(wine)

# Bar per country showing the aggregated alcohol value.
sns.barplot(x='country', y='alcohol', data=wine)

# Box plot of alcohol for each country.
sns.boxplot(x='country', y='alcohol', data=wine)

# Correlation matrix over the numeric columns only. 'country' and 'pricing'
# hold text, and DataFrame.corr() on recent pandas versions raises when such
# columns are present instead of silently skipping them.
corr = wine.select_dtypes(include='number').corr().round(2)

plt.figure(figsize=(10, 8))
# Annotated heatmap of the matrix; the colour scale is capped at 0.8.
sns.heatmap(corr, vmax=0.8, linewidths=0.01, annot=True)

# Linear fit of success on alcohol over the whole sample, drawn with small,
# translucent scatter markers.
sns.lmplot(x='alcohol', y='success', data=wine, scatter_kws={'s': 5, 'alpha': 0.3})

# Same fit, one panel per country.
alcohol_success_plot = sns.lmplot(
    x='alcohol', y='success', col='country', data=wine,
    scatter_kws={'s': 5, 'alpha': 0.3},
)
sns.despine()

axes = alcohol_success_plot.axes.flatten()

# Title each panel from the country it actually displays rather than from its
# position, so the titles stay correct whatever order the facets are drawn in
# (and a missing facet cannot cause an IndexError).
panel_titles = {'Spain': 'Vinos Españoles', 'Italy': 'Vinos Italianos'}
for ax, country in zip(axes, alcohol_success_plot.col_names):
    if country in panel_titles:
        ax.set_title(panel_titles[country])

# Pin the current tick positions and relabel them as percentages.
# NOTE(review): the '%' format multiplies by 100, so this assumes 'alcohol' is
# stored as a fraction (e.g. 0.12) -- confirm against the data.
vals = axes[0].get_xticks()
axes[0].set_xticks(list(vals))
axes[0].set_xticklabels(['{:,.1%}'.format(x) for x in vals])

plt.suptitle('Más alcohol, más éxito, en Italia como en España! \n ¡Podemos predecir el éxito de un vino!',
             fontsize=24, ha='center', va='top', color='#000088', y=1.2)
plt.show()

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Double brackets keep X as a one-column DataFrame: the feature input is
# normally a matrix of several columns, here it has just one.
X = wine[['alcohol']]
y = wine['success']

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3
)

# Report the shapes of the resulting partitions.
print('* Datos de entrenamiento:')
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print('\n* Datos de testeo:')
print(f'X_test: {X_test.shape}')
print(f'y_test: {y_test.shape}')

* Datos de entrenamiento:
X_train: (1264, 1)
y_train: (1264,)

* Datos de testeo:
X_test: (317, 1)
y_test: (317,)

# The chosen algorithm is a linear regression.
reg = LinearRegression()
# Train the model on the training split.
reg.fit(X_train, y_train)

# Predict success for a single alcohol value. The model was fitted on a
# DataFrame, so the query is passed as a DataFrame with the same column name;
# a bare nested list makes scikit-learn warn that X has no valid feature names.
reg.predict(pd.DataFrame({'alcohol': [0.12]}))

/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/base.py:450: UserWarning: X does not have valid feature names, but LinearRegression was fitted with feature names
  warnings.warn(

# Predictions of the single-feature model on the held-out rows.
y_pred = reg.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error

# Error metrics on the test split. Arguments follow the (y_true, y_pred)
# order of the scikit-learn signatures. Builtin round() is used for both
# values: it works whether the metric comes back as a NumPy scalar or as a
# plain Python float (which has no .round method).
MAE = mean_absolute_error(y_test, y_pred)
RMSE = sqrt(mean_squared_error(y_test, y_pred))
print(f'MAE: {round(MAE, 4)}')
print(f'RMSE: {round(RMSE, 4)}')

MAE: 9.9299
RMSE: 12.3828

# Model using the alcohol and volatile acidity features.
X2 = wine[['alcohol', 'volatile acidity']]
y2 = wine['success']

# Train/test split, seeded like the first model's split.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=3)

# Train the model.
reg2 = LinearRegression()
reg2.fit(X2_train, y2_train)

# Predict on the test split.
y2_pred = reg2.predict(X2_test)

# Model performance. Arguments follow scikit-learn's (y_true, y_pred) order,
# and builtin round() works for both NumPy scalars and plain Python floats.
print(f'MAE_2: {round(mean_absolute_error(y2_test, y2_pred), 4)}')
print(f'RMSE_2: {round(sqrt(mean_squared_error(y2_test, y2_pred)), 4)}')

MAE_2: 9.7424
RMSE_2: 12.1189

# Create the new encoded variables.
# is_spain: 1 when the country is 'Spain', 0 otherwise.
wine['is_spain'] = (wine['country'] == 'Spain').astype(int)

# pricing_enc: 'Budget' -> 1, 'Medium' -> 2, any other value -> 3.
pricing_conditions = [wine['pricing'] == 'Budget', wine['pricing'] == 'Medium']
wine['pricing_enc'] = np.select(pricing_conditions, [1, 2], default=3)

wine.head()

fixed acidityfloat64

volatile acidityfloat64

7.4

0.7

7.8

0.88

7.8

0.76

11.2

0.28

7.4

0.7

# Model using every remaining column as a feature: the target and the two
# text columns are removed from the feature frame.
X3 = wine.drop(['success', 'pricing', 'country'], axis=1)
y3 = wine['success']
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.2, random_state=3)

reg3 = LinearRegression()
reg3.fit(X3_train, y3_train)
y3_pred = reg3.predict(X3_test)

# Test-split errors. Arguments follow scikit-learn's (y_true, y_pred) order,
# and builtin round() works for both NumPy scalars and plain Python floats.
print(f'MAE_3: {round(mean_absolute_error(y3_test, y3_pred), 4)}')
print(f'RMSE_3: {round(sqrt(mean_squared_error(y3_test, y3_pred)), 4)}')

MAE_3: 9.2191
RMSE_3: 11.784

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Random forest model on the same feature set (target and text columns removed).
X4 = wine.drop(['success', 'pricing', 'country'], axis=1)
y4 = wine['success']
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size=0.2, random_state=3)

# The forest is seeded like the split: without a random_state every run grows
# different trees, so the printed errors could not be reproduced.
reg4 = RandomForestRegressor(n_estimators=5, max_features=10, random_state=3)
reg4.fit(X4_train, y4_train)
y4_pred = reg4.predict(X4_test)

# Test-split errors. Arguments follow scikit-learn's (y_true, y_pred) order,
# and builtin round() works for both NumPy scalars and plain Python floats.
print(f'MAE_4: {round(mean_absolute_error(y4_test, y4_pred), 4)}')
print(f'RMSE_4: {round(sqrt(mean_squared_error(y4_test, y4_pred)), 4)}')

MAE_4: 9.503
RMSE_4: 12.1526

# Grid of hyper-parameter values to iterate over.
param_grid = [{'n_estimators': [10, 50, 100, 150, 200], 'max_features': [5, 10]}]
# Model selection with 10-fold cross-validation, scored by (negated) MAE.
grid_search_forest = GridSearchCV(reg4, param_grid, cv=10, scoring='neg_mean_absolute_error')
# Train the candidate models.
grid_search_forest.fit(X4_train, y4_train)

# Best model found by the search.
best_forest = grid_search_forest.best_estimator_
best_forest

# Evaluate the best model on the test split. GridSearchCV (refit=True by
# default) has already refit best_estimator_ on the full training split, so it
# is used as-is instead of being trained a second time.
y4_pred = best_forest.predict(X4_test)

# Arguments follow scikit-learn's (y_true, y_pred) order, and builtin round()
# works for both NumPy scalars and plain Python floats.
print(f'MAE_4: {round(mean_absolute_error(y4_test, y4_pred), 4)}')
print(f'RMSE_4: {round(sqrt(mean_squared_error(y4_test, y4_pred)), 4)}')

MAE_4: 8.9082
RMSE_4: 11.4242

# Horizontal bar chart of the tuned forest's feature importances, smallest
# first. The already-fitted estimator is reused: fitting it again here would
# grow a new forest, and the importances would then describe a different model
# from the one whose test errors were reported.
bf = best_forest
importances = bf.feature_importances_
sorted_idx = importances.argsort()
plt.barh(X4.columns[sorted_idx], importances[sorted_idx])
plt.xlabel("Random Forest Feature Importance")