import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('./inventario.csv')
df.head()
df.dtypes
df.drop('Order', axis=1, inplace=True)
df.head()
df.describe()
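# NOTE: df_train / df_test are used below but not defined above. A minimal sketch of the
# assumed split, based on the later comment about the "active data": historical records
# (with known SoldFlag/SoldCount) are used for training and active records are scored.
# The column name 'File_Type' and its values 'Historical'/'Active' are assumptions here.
df_train = df[df['File_Type'] == 'Historical'].copy()
df_test = df[df['File_Type'] == 'Active'].copy()
df_train = df_train.drop(columns=['File_Type'])
df_test = df_test.drop(columns=['File_Type'])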
# Data types of the training set
df_train.dtypes
# Categorical variables: encode MarketingType as 0/1
df_train.loc[df_train['MarketingType']=='S','MarketingType'] = 0
df_train.loc[df_train['MarketingType']=='D','MarketingType'] = 1
df_test.loc[df_test['MarketingType']=='S','MarketingType'] = 0
df_test.loc[df_test['MarketingType']=='D','MarketingType'] = 1
# Cast to integer
df_train['MarketingType'] = df_train['MarketingType'].astype('int')
df_test['MarketingType'] = df_test['MarketingType'].astype('int')
df_train.hist(figsize=(20,15), bins=100)
plt.show()
# Correlations
corr = df_train.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, vmin=-1,vmax=1)
plt.show()
plt.figure(figsize=(10, 8))
sns.heatmap(np.abs(corr), vmin=0,vmax=1)
plt.show()
# Separate inputs and target
X = df_train.drop(columns=['SoldFlag','SoldCount']) # drop the possible targets
y = df_train[['SoldFlag']]
y.hist()
plt.show()
#!pip install imbalanced-learn
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
X = pd.DataFrame(columns=X.columns, data=X_resampled)
y = pd.DataFrame(columns=y.columns, data=y_resampled)
y.hist()
plt.show()
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
from sklearn import preprocessing
scaler = preprocessing.RobustScaler().fit(X_train.values)
normalized_train_X = scaler.transform(X_train.values)
normalized_val_X = scaler.transform(X_val.values)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
def clasificador(n_var, drop_out):
    model = Sequential()
    model.add(Dense(10, input_dim=n_var, activation='relu'))
    model.add(Dropout(drop_out))
    model.add(Dense(10, activation='relu'))
    model.add(Dropout(drop_out))
    model.add(Dense(1, activation='sigmoid'))  # binary output: probability of SoldFlag = 1
    return model
n_epochs = 100
n_batch = 100
n_var = normalized_train_X.shape[1]
# Should the hyperparameters (epochs, batch size, dropout) be justified?
modelo_cd = clasificador(n_var, 0.1)
modelo_cd.summary()
modelo_cd.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['acc', 'Precision', 'Recall'])
history_cd = modelo_cd.fit(normalized_train_X, y_train, validation_split=0.2, epochs = n_epochs, verbose=1, batch_size=n_batch)
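# Optional: inspect the training curves (a sketch; the history keys assume the metric
# names reported by Keras for the metrics passed to compile() above).
plt.figure(figsize=(10, 4))
plt.plot(history_cd.history['loss'], label='train loss')
plt.plot(history_cd.history['val_loss'], label='val loss')
plt.plot(history_cd.history['acc'], label='train acc')
plt.plot(history_cd.history['val_acc'], label='val acc')
plt.legend()
plt.show()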
y_pred = modelo_cd.predict(normalized_val_X).round()
#!pip install mlxtend
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
fig, ax = plot_confusion_matrix(conf_mat=confusion_matrix(y_val, y_pred),figsize=(6,6), show_absolute=True, show_normed=False, colorbar=False)
plt.title('Confusion matrix')
plt.show()
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred))
# Prepare the active (test) data
X_test = df_test.drop(columns=['SoldFlag','SoldCount']) # not useful for prediction / not present in the active data
normalized_X_test = scaler.transform(X_test.values)
# Class probabilities for the test set
y_test_pred_prob = modelo_cd.predict(normalized_X_test)
# Class predictions for the test set (probabilities rounded at the 0.5 threshold)
y_test_pred = y_test_pred_prob.round()
df_test['SaleProb'] = y_test_pred_prob.round(3).ravel() # attach probabilities to the active data
rankings = df_test['SaleProb'].rank(method='min',ascending=False).values
df_test['Rank'] = rankings # rank by descending sale probability
# Ranking
df_ranks = df_test.sort_values(by=["SaleProb"], ascending=False)[['SaleProb','Rank']]
display(df_ranks)
# Rank products by historical SoldCount and bin the ranks into deciles (0-9)
df_train['RankPercentile'] = df_train['SoldCount'].rank(method='first')
df_train['decile'] = pd.qcut(df_train['RankPercentile'].values, 10).codes
X = df_train.drop(columns=['SoldFlag','SoldCount','RankPercentile','decile']) # not useful for prediction / not present in the active data
y_cant = pd.get_dummies(df_train['decile'])
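# NOTE: modelo_cd_2 and normalized_val_X_2 are used below but never defined above.
# A minimal sketch of the assumed second (decile) classifier, reusing the same pipeline
# as the binary model but with a 10-class softmax output; layer sizes and hyperparameters
# are assumptions carried over from modelo_cd.
X_train_2, X_val_2, y_train_2, y_val_2 = train_test_split(X, y_cant, test_size=0.2, random_state=42)
scaler_2 = preprocessing.RobustScaler().fit(X_train_2.values)
normalized_train_X_2 = scaler_2.transform(X_train_2.values)
normalized_val_X_2 = scaler_2.transform(X_val_2.values)
modelo_cd_2 = Sequential()
modelo_cd_2.add(Dense(10, input_dim=X.shape[1], activation='relu'))
modelo_cd_2.add(Dropout(0.1))
modelo_cd_2.add(Dense(10, activation='relu'))
modelo_cd_2.add(Dropout(0.1))
modelo_cd_2.add(Dense(10, activation='softmax'))  # one output per decile
modelo_cd_2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
modelo_cd_2.fit(normalized_train_X_2, y_train_2.astype('float32'),  # cast dummies to float for Keras
                validation_split=0.2, epochs=n_epochs, verbose=1, batch_size=n_batch)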
y_pred_2 = modelo_cd_2.predict(normalized_val_X_2)
print('predictions for one validation observation:', np.round(y_pred_2[0],3))
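# The 10 outputs are per-decile probabilities (softmax), so the predicted decile is the
# index with the highest probability:
print('predicted decile:', np.argmax(y_pred_2[0]))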