import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as ply
import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import missingno as msno
from functools import reduce
# Libraries for modeling
!pip install feature_engine
from feature_engine.imputation import RandomSampleImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest,chi2,f_classif
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,confusion_matrix,roc_curve,auc,f1_score,plot_roc_curve, plot_confusion_matrix
from sklearn.metrics import mean_absolute_error, r2_score,mean_squared_error
from sklearn.linear_model import SGDClassifier, SGDRegressor
from itertools import cycle
from sklearn import svm, datasets
from sklearn.preprocessing import label_binarize
from scipy import interp
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
# Install the emoji library only so we can run the function library from module 1
!pip install emoji --upgrade
import emoji
pd.set_option('display.float_format', lambda x: '%.5f' % x)
from google.colab import drive
drive.mount("/content/gdrive")
import sys
sys.path.insert(0,'/content/gdrive/MyDrive/Trabajos diplomado Data Science/codigos modulo 2/Practicas/')
import modulo1_libreria as m1
import libreriamodulo2 as m2
cars_data = pd.read_csv('/content/gdrive/MyDrive/Trabajos diplomado Data Science/Examen final/cars_adds.csv')
cars_data.head()
cars_data.shape
cars_data.dtypes
for col in cars_data.columns:
    print(cars_data[col].value_counts(1))
c_features = ['mileage','manufacture_year','engine_displacement','engine_power','stk_year','door_count','seat_count','price_eur']
v_features = ['maker','model','body_type','color_slug','transmission','fuel_type']
d_features = ['date_created','date_last_seen']
m1.etiquetado_vars(cars_data,c_features,v_features,d_features)
cars_data.dtypes
cars_data.describe(np.arange(.1,1,.1))
# Convert the remaining numeric variables to the correct data type
c_features = [x for x in cars_data.columns if x.startswith('c_')]
for col in c_features:
    cars_data[col] = pd.to_numeric(cars_data[col], errors='coerce')
cars_data.describe(np.arange(.1,1,.1))
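# errors='coerce' silently turns non-numeric entries into NaN; a quick check (not in the
# original notebook) of how many missing values the conversion introduced per column:
cars_data[c_features].isna().sum()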
cars_data[[x for x in cars_data.columns if x.startswith('v_')]]
cars_data[[x for x in cars_data.columns if x.startswith('v_')]].dtypes
cars_data.duplicated().sum()
cars_data[c_features].describe(np.arange(.1,1,.1))
cars_data.c_door_count.value_counts()
cars_data.shape
# The maximum reasonable number of doors a car can have is 7.
# Since these are few records that also have many missing values, we drop them.
aux = cars_data[cars_data.c_door_count>7].index
data_cars = cars_data.copy()
data_cars.drop(index=aux,inplace=True)
data_cars.reset_index(drop=True, inplace=True)
data_cars.shape
data_cars.c_seat_count.value_counts(1)
# Unless they are buses, cars do not have more than 10 or 12 seats, even counting certain types of vans.
data_cars[data_cars.c_seat_count>12].shape
aux = data_cars[data_cars.c_seat_count>12].index
data_cars.drop(index=aux,inplace=True)
data_cars.reset_index(drop=True, inplace=True)
data_cars.shape
data_cars.c_stk_year.value_counts()
# The date of the last emissions inspection cannot be later than 2021
aux = data_cars[data_cars.c_stk_year>2021].index
data_cars.drop(index=aux,inplace=True)
data_cars.reset_index(drop=True, inplace=True)
data_cars.shape
data_cars.c_stk_year.value_counts()
ply.histogram(data_cars,x='c_manufacture_year')
# The 10th percentile starts around the year 2000, so we can pick a cutoff year close to that date
data_cars[data_cars.c_manufacture_year<1990]
aux = data_cars[data_cars.c_manufacture_year<1990].index
data_cars.drop(index=aux,inplace=True)
data_cars.reset_index(drop=True, inplace=True)
data_cars.shape
# Restrict the records to manufacture years after 1985
data_cars = data_cars[data_cars['c_manufacture_year']>1985]
ply.histogram(data_cars,x='c_manufacture_year')
data_cars.c_price_eur.describe(np.arange(0.05,1,0.05))
# The price 1295.34 is unbalancing the variable, so it is better to remove it
data_cars = data_cars[data_cars["c_price_eur"] != 1295.34]
# Likewise, there are extreme values at the upper end, so we keep only the bottom 95% of the variable
data_cars = data_cars[data_cars["c_price_eur"] <33004.67]
ply.histogram(data_cars,x='c_price_eur')
data_cars.c_price_eur.describe(np.arange(0.05,1,0.05))
# There is a large gap between the minimum and the 5th percentile, so we trim from the 5th percentile upward
data_cars = data_cars[data_cars["c_price_eur"] >=1099]
ply.histogram(data_cars,x='c_price_eur')
data_cars.c_mileage.describe(np.arange(.05,1,.05))
# Cap mileage at the 95th percentile
data_cars = data_cars[data_cars["c_mileage"] <242391]
ply.histogram(data_cars,x='c_mileage')
# We can see that all numeric variables have outliers
data_cars.c_mileage.describe(np.arange(.02,.2,.02))
ply.histogram(data_cars[data_cars['c_mileage']>100],x='c_mileage')
data_cars = data_cars[data_cars["c_mileage"] >100]
ply.box(data_cars, y = 'c_mileage')
data_cars.c_engine_displacement.describe(np.arange(.05,1,.05))
data_cars = data_cars[data_cars["c_engine_displacement"] < 2980]
ply.histogram(data_cars,x='c_engine_displacement')
data_cars = data_cars[data_cars["c_engine_displacement"] > 1000]
ply.histogram(data_cars,x='c_engine_displacement')
# Use the outliers function, which returns the outliers flagged by each detection technique
m1.outliers(data_cars,cols=['c_engine_displacement','c_engine_power','c_price_eur','c_mileage'])
# The data was quite contaminated and several outliers still remain.
# Since the variables do not follow a normal distribution we cannot impute with the mean,
# so we proceed to remove them.
outliers = m1.outliers(data_cars,cols=['c_engine_displacement','c_engine_power','c_price_eur','c_mileage'])
indices = list(outliers.indices.values)
indices = list(set(reduce(lambda x,y: x+y,indices)))
len(indices)  # total number of outlier indices
# These are records where almost every value in the row is missing
data_cars[data_cars.index.isin(indices)]
data_cars = data_cars[~data_cars.index.isin(indices)].reset_index(drop=True)
data_cars.shape
msno.matrix(data_cars)
msno.heatmap(data_cars)
m1.completitud(data_cars)
# Our imputation rule is to drop every feature with less than 80% complete values,
# but we lower the threshold to 75% so we do not lose as many variables.
cols_missings = ['v_color_slug','c_stk_year','v_fuel_type','c_seat_count',
'v_model','v_body_type','c_door_count']
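# The cols_missings list above was written by hand from the completeness report. A pandas-only
# sketch (not using m1.completitud, whose return format is not shown here) of deriving the same
# kind of list from the 75% threshold, so it stays in sync with the data:
completeness = data_cars.notna().mean()                          # fraction of non-null values per column
low_completeness_cols = completeness[completeness < 0.75].index.tolist()
low_completeness_cols                                            # columns falling below the threshold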
data_cars.drop(cols_missings,axis=1,inplace=True)
# To impute the remaining missing values we do it randomly, since the variables do not follow any particular distribution
# Random imputation of missing values
imputer = RandomSampleImputer()
data_cars = imputer.fit_transform(data_cars)
msno.matrix(data_cars)
# Target variable treatment
m1.clean_vars(data_cars,columns=['v_maker'])
# Inspect the distribution of the target variable
data_cars.v_maker.value_counts(1)
# Since there are too many makers, we create an "other" category to group every category that does not represent at least 1%
others = ['suzuki','honda','mitsubishi','chevrolet','lancia','jeep',
'rover','subaru','chrysler','jaguar','alfa romeo','porsche',
'lexus','dodge','dacia','land rover','isuzu','infinity',
'lotus','hummer','maserati','tesla','bentley','lamborghini']
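# The "others" list above was compiled by hand from value_counts. A small sketch (an alternative,
# not the author's code) deriving the rare categories from the 1% rule directly:
maker_freq = data_cars.v_maker.value_counts(normalize=True)
rare_makers = maker_freq[maker_freq < 0.01].index.tolist()
rare_makers  # should roughly match the hand-written list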
data_cars.v_maker.replace(others,'other',inplace=True)
data_cars.v_maker.value_counts(1)
sns.set(rc={"figure.figsize":(20, 15)})
order = data_cars.v_maker.value_counts(ascending=False).index
sns.countplot(x=data_cars['v_maker'], order=order)
# Encode the maker categories as numbers
label_encoder = LabelEncoder()
target_maker = label_encoder.fit_transform(data_cars.v_maker)
target_maker
data_cars['target_maker'] = target_maker.tolist()
data_cars.head()
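# For reference, a small sketch recovering the mapping between the numeric codes and the
# original maker names from the fitted LabelEncoder (actual codes depend on the data):
maker_mapping = dict(zip(label_encoder.classes_,
                         label_encoder.transform(label_encoder.classes_)))
maker_mapping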
maker = data_cars.v_maker
data_cars.drop('v_maker',axis=1,inplace=True)
data_cars.head()
# Create dummy variables
data_cars = pd.get_dummies(data_cars,columns=['v_transmission'],drop_first=True)
data_cars.describe()
# Extract the year from each of the date-type variables
data_cars['d_date_created'] = pd.to_datetime(data_cars.d_date_created)
data_cars['d_date_last_seen'] = pd.to_datetime(data_cars.d_date_last_seen)
data_cars['c_year_created'] = data_cars.d_date_created.dt.year
data_cars['c_year_last_seen'] = data_cars.d_date_last_seen.dt.year
data_cars.drop(['d_date_created','d_date_last_seen'],axis=1,inplace=True)
# Reorder the dataframe so that target_maker is the last column
cols = ['c_mileage','c_manufacture_year','c_engine_displacement','c_engine_power','c_price_eur','v_transmission_man',
'c_year_created','c_year_last_seen','target_maker']
data_cars = data_cars[cols]
data_cars.shape
X = data_cars.drop(['target_maker'],axis=1)
y = data_cars.target_maker
kbest = SelectKBest(chi2, k=4)
X_best = kbest.fit_transform(X, y)
X_best.shape
kbest.pvalues_
X.head()
cols = kbest.get_support(indices=True)
X_best_feats= X.iloc[:,cols]
X_best_feats
# Selected variables:
# 1 c_mileage
# 2 c_engine_displacement
# 3 c_engine_power
# 4 c_price_eur
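# To make the selection easier to read, a small sketch pairing each feature with its chi2
# score and p-value from the kbest object fitted above:
kbest_report = pd.DataFrame({'feature': X.columns,
                             'score': kbest.scores_,
                             'pvalue': kbest.pvalues_}).sort_values('score', ascending=False)
kbest_report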
# We will use LDA to find the best variables for the 'maker' target
data_cars.head()
lda = LinearDiscriminantAnalysis(n_components=8)
X_lda = lda.fit_transform(X,y)
X_lda
# explained variance ratio of each discriminant component
print(lda.explained_variance_ratio_)
# About 94% of the variance is explained by the first 4 components,
# so we keep only 4 variables.
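# A quick check of the 94% figure above: cumulative explained variance of the LDA components.
np.cumsum(lda.explained_variance_ratio_)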
sns.countplot(x=data_cars.target_maker)
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_best_feats, y, test_size=.8,
random_state=1)
forest=RandomForestClassifier(max_depth=25,random_state=1)
forest.fit(X_train,y_train)
y_pred_rf = forest.predict(X_test)
f1_score(y_test,y_pred_rf,average='micro')
y_pred_train = forest.predict(X_train)
y_pred_proba = forest.predict_proba(X_test)
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_rf)
print("Accuracy: Train=%.4f Test=%.4f" % (accuracy_train,accuracy_test))
cm = confusion_matrix(y_test,y_pred_rf)
sns.heatmap(cm,cmap='crest')
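# Accuracy and micro-F1 hide per-class behavior with this many makers; a small sketch using
# classification_report (an extra import, not in the original list) for per-class precision and recall:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_rf, zero_division=0))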
forest.score(X_train,y_train)
num_clases = data_cars.target_maker.nunique()
num_clases
# Binarize the output
y = label_binarize(target_maker, classes=range(num_clases))  # LabelEncoder codes start at 0
n_classes = y.shape[1]
# Learn to predict each class against the other
classifier = OneVsRestClassifier(svm.SVC(kernel='linear',probability=True,max_iter=1000))
y_score = classifier.fit(X_train, y_train).predict(X_test)
y= data_cars.target_maker
# shuffle and split training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_best_feats, y, test_size=.8,random_state=1)
svm_clf = SVC(kernel='linear', decision_function_shape='ovr', max_iter=1000)  # avoid shadowing the sklearn svm module
svm_clf.fit(X_train, y_train.values.ravel())
ls_scores = cross_val_score(estimator=svm_clf, scoring='accuracy', X=X_train, y=y_train, cv=4, n_jobs=-1)
np.mean(ls_scores), np.std(ls_scores)
param_grid = {"C": [x for x in range(2)],
"kernel":['linear','poly','rbf','sigmoid']}
n_hyper = np.prod([x for x in map(len, param_grid.values())])
n_hyper
clf = RandomizedSearchCV(svm, param_grid,cv=2, error_score=-100, scoring="accuracy",n_iter=1000,n_jobs=-1,verbose=5)
clf.fit(X_train, y_train.values.ravel())
print('Best score: '+str(clf.best_score_))
print('Best estimator: '+str(clf.best_estimator_))
accuracy_score(y_test,classifier.predict(X_test))
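# The ROC-related imports at the top (label_binarize, roc_curve, cycle) suggest a ROC analysis
# for the multiclass problem; a minimal sketch of a macro-averaged one-vs-rest ROC AUC for the
# OvR classifier, assuming every class is present in this test split (otherwise roc_auc_score raises):
y_proba_ovr = classifier.predict_proba(X_test)
roc_auc_score(y_test, y_proba_ovr, multi_class='ovr', average='macro')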
# note: plot_confusion_matrix was removed in scikit-learn 1.2; there, ConfusionMatrixDisplay.from_estimator is the replacement
matrix = plot_confusion_matrix(classifier, X_test, y_test,
                               cmap=plt.cm.Blues,
                               normalize='true')
plt.title('Confusion matrix for OvR classifier')
plt.show()
tree=DecisionTreeClassifier(random_state=1)
tree.fit(X_train,y_train)
y_pred_dt = tree.predict(X_test)
f1_score(y_test,y_pred_dt,average='micro')
cm = confusion_matrix(y_test,y_pred_dt)
sns.heatmap(cm,cmap='viridis')
gnb = GaussianNB()
gnb.fit(X_train,y_train)
accuracy_score(y_pred=gnb.predict(X_test),y_true = y_test)
cm_naive = confusion_matrix(y_pred=gnb.predict(X_test),y_true = y_test)
sns.heatmap(cm_naive,cmap='crest')
f1_score(y_test,gnb.predict(X_test),average='weighted')
# With scaling
X_ = X_best_feats
y_ = target_maker
y_.shape
X_train_, X_test_, y_train_, y_test_, = train_test_split(X_, y_, test_size=0.2, random_state = 0)
sc = StandardScaler()
X_train_ = sc.fit_transform(X_train_)
X_test_ = sc.transform(X_test_)
gnb_scaled = GaussianNB()
gnb_scaled.fit(X_train_, y_train_)
cm_gnb = confusion_matrix(y_test_,gnb_scaled.predict(X_test_))
sns.heatmap(cm_gnb,cmap='BuGn')
accuracy_score(y_test_, gnb_scaled.predict(X_test_))
data_regresion = data_cars.rename(columns={'target_maker':'c_maker','c_price_eur':'target_price_eur'})
X = data_regresion.drop(['target_price_eur'],axis=1)
y = data_regresion['target_price_eur']
sns.histplot(y)
y.describe(np.arange(.1,1,.1))
sns.histplot(np.log(y))
sns.histplot(np.sqrt(y))
from scipy import stats
sns.histplot(stats.boxcox(y)[0])
X_train_r, X_test_r, y_train_r,y_test_r = train_test_split(X,y,random_state=1)
y_train_r
sc_reg = StandardScaler()
X_train_r = sc_reg.fit_transform(X_train_r)
X_test_r = sc_reg.transform(X_test_r)
mlp_reg = MLPRegressor()
mlp_reg.fit(X_train_r,y_train_r)
y_pred_mlp = mlp_reg.predict(X_test_r)
def metricas(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    print(f'R2 score: {r2}')
    print(f'Mean squared error: {mse}')
    print(f'Mean absolute error: {mae}')
metricas(y_test_r,y_pred_mlp)
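# The earlier histograms explored log/sqrt/Box-Cox transforms of the target, but the MLP above
# was fit on the raw price. A minimal sketch (an option to try, not part of the original pipeline)
# of fitting on a log-transformed target and mapping predictions back to the EUR scale automatically:
from sklearn.compose import TransformedTargetRegressor
log_mlp = TransformedTargetRegressor(regressor=MLPRegressor(),
                                     func=np.log1p, inverse_func=np.expm1)
log_mlp.fit(X_train_r, y_train_r)
metricas(y_test_r, log_mlp.predict(X_test_r))  # metrics reported on the original EUR scale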
param_grid = {'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,1)],
'activation': ['relu','tanh','logistic'],
'alpha': [0.0001, 0.05],
'learning_rate': ['constant','adaptive'],
'solver': ['adam']}
cv = RandomizedSearchCV(
mlp_reg,
param_grid,
cv=2, scoring='r2', verbose=5, n_jobs=-1)
cv.fit(X=X_train_r, y=y_train_r)
model_best_mlp = cv.best_estimator_
cv.best_score_
model_best_mlp
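# The tuned model has only been scored via cross-validation so far; a short sketch evaluating it
# on the held-out test set with the same metricas helper used above:
y_pred_best_mlp = model_best_mlp.predict(X_test_r)
metricas(y_test_r, y_pred_best_mlp)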