# Core libraries for data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(context='notebook')

# Load the Titanic dataset from the Stanford CS109 mirror
titanic = pd.read_csv(
    'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv',
    sep=',',
)

# First look at the data
titanic.head(10)

# Remove columns that are not useful for the model
titanic.drop(['Name', 'Fare'], axis=1, inplace=True)

# Shorten the verbose column names
# (Siblings/Spouses Aboard -> SibSp, Parents/Children Aboard -> ParCh)
titanic.columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'ParCh']
titanic.head()

# DataFrame dimensions
titanic.shape

# Column dtypes and non-null counts
titanic.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 887 non-null int64
1 Pclass 887 non-null int64
2 Sex 887 non-null object
3 Age 887 non-null float64
4 SibSp 887 non-null int64
5 ParCh 887 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 41.7+ KB
# One-hot encode Sex; drop_first leaves a single dummy column (Sex_male)
titanic = pd.get_dummies(titanic, columns=['Sex'], drop_first=True)
titanic.dtypes
titanic.head()

# Rename the dummy column back to Sex
titanic.rename(columns={'Sex_male': 'Sex'}, inplace=True)

# Restore the original column order
titanic = titanic[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'ParCh']]
titanic

# Class balance of the target variable
titanic.Survived.value_counts(normalize=True)
# Balance the classes by randomly under-sampling the majority class
from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(random_state=42)

# Features / target split
X_titanic = titanic.drop('Survived', axis=1)
y_titanic = titanic.Survived

# Resample so both classes appear with equal frequency
X_over_titanic, y_over_titanic = undersample.fit_resample(X_titanic, y_titanic)
y_over_titanic.value_counts(normalize=True)

# Hold out 30% of the balanced data for testing, train on the remaining 70%
from sklearn.model_selection import train_test_split
X_train_titanic, X_test_titanic, y_train_titanic, y_test_titanic = train_test_split(
    X_over_titanic, y_over_titanic, test_size=0.30, random_state=42)
# Decision tree tuned with 10-fold cross-validated grid search
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

clf = DecisionTreeClassifier(random_state=42)
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5]}

grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10, return_train_score=True)
grid_search.fit(X_train_titanic, y_train_titanic)

# Report the winning configuration
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)
print("Mejor puntuación de validación cruzada:")
print(grid_search.best_score_)
Mejores hiperparámetros encontrados:
{'criterion': 'gini', 'max_depth': 4}
Mejor puntuación de validación cruzada:
0.7908687943262411
# Decision tree with the optimized hyper-parameters
best_clf = grid_search.best_estimator_

# Predict on both folds
y_train_pred_titanic = best_clf.predict(X_train_titanic)
y_test_pred_titanic = best_clf.predict(X_test_titanic)

# Confusion matrix on the held-out test set
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
cm = confusion_matrix(y_test_titanic, y_test_pred_titanic, labels=best_clf.classes_)
ConfusionMatrixDisplay(cm, display_labels=best_clf.classes_).plot()

# FIX: the original called best_clf.predict twice more here with the same
# inputs; reuse the predictions computed above instead of recomputing them.
y_train_pred = y_train_pred_titanic
y_test_pred = y_test_pred_titanic

from sklearn.metrics import accuracy_score
print('El accuracy en train es:', accuracy_score(y_train_titanic, y_train_pred_titanic))
print('El accuracy en test es:', accuracy_score(y_test_titanic, y_test_pred_titanic))
El accuracy en train es: 0.8179916317991632
El accuracy en test es: 0.8252427184466019
# Feature importances of the tuned tree, as a one-row frame for barplot
feature_scores_titanic = (
    pd.Series(grid_search.best_estimator_.feature_importances_,
              index=X_train_titanic.columns)
    .sort_values(ascending=False)
    .to_frame()
    .T
)
plt.figure(figsize=(12, 4))
sns.barplot(data=feature_scores_titanic)
# Label each bar with its importance value
for index, value in enumerate(feature_scores_titanic.values.flatten()):
    plt.annotate(f'{value:.2f}', xy=(index, value), ha='center', va='bottom')
plt.title("Factores clave en la predicción de la supervivencia en el Titanic")
plt.show()
/home/mazzaroli/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:82: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
plot_data = [np.asarray(s, float) for k, s in iter_data]
# Libraries (re-imported so this section can run on its own)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid', context='notebook')

# Car Evaluation dataset from the UCI repository (file has no header row)
df_car = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data',
    header=None,
)
columns_names = ['price', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'Class']
df_car.columns = columns_names

# Quick inspection
df_car.head()
df_car.shape
df_car.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 price 1728 non-null object
1 maint 1728 non-null object
2 doors 1728 non-null object
3 persons 1728 non-null object
4 lug_boot 1728 non-null object
5 safety 1728 non-null object
6 Class 1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB
# Check for missing values
df_car.isnull().sum()
# FIX: drop_duplicates() without inplace returns a new frame that was being
# discarded, so the original line was a no-op; apply it in place.
df_car.drop_duplicates(inplace=True)
df_car.isnull().sum()
# Helper: frequency and proportion table for a single column
def dist(df, target):
    """Return a DataFrame with the absolute count ('Freq[N]') and the
    relative frequency rounded to 3 decimals ('Prop[%]') of each distinct
    value in ``df[target]``."""
    freq = df[target].value_counts(normalize=False)
    share = df[target].value_counts(normalize=True).round(3)
    return pd.DataFrame({'Freq[N]': freq, 'Prop[%]': share})
# Print the count/proportion table for every column, Class included
for i in columns_names:
    print(' ' * 7, i.upper())
    print(dist(df_car, i))
    print("*" * 23)
PRICE
Freq[N] Prop[%]
vhigh 432 0.25
high 432 0.25
med 432 0.25
low 432 0.25
***********************
MAINT
Freq[N] Prop[%]
vhigh 432 0.25
high 432 0.25
med 432 0.25
low 432 0.25
***********************
DOORS
Freq[N] Prop[%]
2 432 0.25
3 432 0.25
4 432 0.25
5more 432 0.25
***********************
PERSONS
Freq[N] Prop[%]
2 576 0.333
4 576 0.333
more 576 0.333
***********************
LUG_BOOT
Freq[N] Prop[%]
small 576 0.333
med 576 0.333
big 576 0.333
***********************
SAFETY
Freq[N] Prop[%]
low 576 0.333
med 576 0.333
high 576 0.333
***********************
CLASS
Freq[N] Prop[%]
unacc 1210 0.700
acc 384 0.222
good 69 0.040
vgood 65 0.038
***********************
# Class distribution broken down by each predictor, in a 2x3 grid
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
for i, variable in enumerate(columns_names[:-1]):
    # Fill the grid column-by-column: rows alternate, columns advance every 2
    ax = axes[i % 2][i // 2]
    sns.countplot(data=df_car, x='Class', hue=variable, ax=ax)
    ax.set_title(f"Evaluation Classes by {variable} Category")
plt.tight_layout()
plt.show()
from imblearn.under_sampling import RandomUnderSampler

undersample = RandomUnderSampler(random_state=42)

# Features / target split
X_car = df_car.drop('Class', axis=1)
y_car = df_car.Class

# Under-sample so every class has the same count
X_car, y_car = undersample.fit_resample(X_car, y_car)

#!pip install category_encoders
import category_encoders as ce

# Ordinal-encode every predictor column (all are categorical strings)
encoder = ce.OrdinalEncoder(cols=columns_names[:-1])
X_car = encoder.fit_transform(X_car)
X_car.head()
X_car.dtypes
from sklearn.model_selection import train_test_split

# 70/30 train-test split of the balanced, encoded data
X_train_car, X_test_car, y_train_car, y_test_car = train_test_split(
    X_car, y_car, test_size=0.3, random_state=42)
print('X:', X_train_car.shape, X_test_car.shape)
print('y:', y_train_car.shape, y_test_car.shape)
X: (182, 6) (78, 6)
y: (182,) (78,)
from sklearn.tree import DecisionTreeClassifier

# Base decision-tree model
tree_car = DecisionTreeClassifier(random_state=42)

from sklearn.model_selection import GridSearchCV

# Hyper-parameter search space for the tree
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5]}

# 10-fold cross-validated grid search
grid_search = GridSearchCV(tree_car, param_grid=param_grid, cv=10, return_train_score=True)
grid_search.fit(X_train_car, y_train_car)

# Report the winning configuration
print("Mejores hiperparámetros encontrados:")
print(grid_search.best_params_)
print("Mejor puntuación de validación cruzada:")
print(grid_search.best_score_)
Mejores hiperparámetros encontrados:
{'criterion': 'entropy', 'max_depth': 4}
Mejor puntuación de validación cruzada:
0.8026315789473685
# Decision tree with the optimized hyper-parameters
best_tree_car = grid_search.best_estimator_

# Predict on both folds
y_train_pred_tree_car = best_tree_car.predict(X_train_car)
y_test_pred_tree_car = best_tree_car.predict(X_test_car)

from sklearn.metrics import accuracy_score, classification_report
# FIX: train and test were swapped in the original — "train" accuracy was
# computed on the test fold and vice versa, so the printed numbers were
# crossed. Pair each accuracy with its own fold.
train_acc = accuracy_score(y_true=y_train_car, y_pred=y_train_pred_tree_car)
test_acc = accuracy_score(y_true=y_test_car, y_pred=y_test_pred_tree_car)
print("El accuracy en train es:", train_acc)
print("El accuracy en test es:", test_acc)
El accuracy en train es: 0.717948717948718
El accuracy en test es: 0.8131868131868132
# Per-class precision / recall / F1 on the held-out test fold
report = classification_report(y_test_car, y_test_pred_tree_car)
print(report)
precision recall f1-score support
acc 0.77 0.62 0.69 16
good 0.65 0.48 0.55 23
unacc 1.00 0.80 0.89 20
vgood 0.59 1.00 0.75 19
accuracy 0.72 78
macro avg 0.75 0.73 0.72 78
weighted avg 0.75 0.72 0.71 78
# Feature importances of the tuned car tree, as a one-row frame for barplot
feature_scores_car = (
    pd.Series(best_tree_car.feature_importances_, index=X_train_car.columns)
    .sort_values(ascending=False)
    .to_frame()
    .T
)
plt.figure(figsize=(12, 6))
sns.barplot(data=feature_scores_car)
# Label each bar with its importance value
for pos, score in enumerate(feature_scores_car.values.flatten()):
    plt.annotate(f'{score:.2f}', xy=(pos, score), ha='center', va='bottom')
plt.title("Factores clave en la predicción de la calidad de un automovil")
plt.show()
pd.DataFrame(feature_scores_car.T)
/home/mazzaroli/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:82: FutureWarning: iteritems is deprecated and will be removed in a future version. Use .items instead.
plot_data = [np.asarray(s, float) for k, s in iter_data]
# Visualize the tuned tree. FIX: the original refit the estimator inside
# plot_tree; GridSearchCV already refit best_estimator_ on the same training
# data, so the extra fit was redundant — plot the fitted tree directly.
plt.figure(figsize=(12, 8))
from sklearn import tree
tree.plot_tree(best_tree_car);
# Libraries (re-imported so this section can run on its own)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='whitegrid', context='notebook')

# Pima Indians diabetes dataset from a local CSV (no header row)
df_diabetes = pd.read_csv('pima-indians-diabetes.csv', header=None, sep=',')
df_columns = np.array(['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'Class'])
df_diabetes.columns = df_columns

# Quick inspection
df_diabetes.head()
df_diabetes.shape
df_diabetes.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 preg 768 non-null int64
1 plas 768 non-null int64
2 pres 768 non-null int64
3 skin 768 non-null int64
4 test 768 non-null int64
5 mass 768 non-null float64
6 pedi 768 non-null float64
7 age 768 non-null int64
8 Class 768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
# There are no duplicated rows in this dataset; drop_duplicates is a no-op here
df_diabetes.drop_duplicates(inplace=True)
df_diabetes

# Visual missing-value matrix
import missingno as miss
miss.matrix(df_diabetes);

# Columns 1..5 (plas..mass) use 0 as a disguised missing value: count those
# zeros as NaN and report the percentage per column
df_diabetes.iloc[:, 1:6].replace(to_replace=[0], value=np.nan).isna().sum().reset_index(name = 'missing_values').rename(columns={"index": "variable"}).assign( percentage = lambda df_reset: df_reset.missing_values / len(df_diabetes) * 100)

# Pie chart of the target-class proportions
plt.figure(figsize=(7,7))
labels, counts = np.unique(df_diabetes.Class, return_counts=True)
plt.pie(counts, autopct='%1.1f%%', labels=labels)
# FIX: plt.legend was given a set, whose iteration order is arbitrary, so the
# legend texts could attach to the wrong wedge. Use an ordered list matching
# labels [0, 1] (0 = negative, 1 = positive).
plt.legend(['Diabetes Negativo', 'Diabetes positivo'])
plt.title('Proporcion de diabetes')
plt.show()
print(df_diabetes.Class.value_counts())
0 500
1 268
Name: Class, dtype: int64
# Features / target split
X_diabetes = df_diabetes.drop('Class', axis=1)
y_diabetes = df_diabetes.Class

from sklearn.model_selection import train_test_split

# 80/20 train-test split
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(
    X_diabetes, y_diabetes, test_size=.20, random_state=42)
X_train_diabetes.shape, y_train_diabetes.shape
# Random forest tuned via grid search
from sklearn.ensemble import RandomForestClassifier

rfc_diabetes = RandomForestClassifier(random_state=42)

from sklearn.model_selection import GridSearchCV

# FIX: 'mse' and 'mae' are regression-tree criteria; for a classifier every
# fit using them raises KeyError (the original run shows "810 fits failed
# out of a total of 2025" for exactly this reason). Keep only the criteria
# valid for classification trees.
param_grid = {
    'n_estimators': [10, 25, 50],
    'max_depth': [5, 10, 15],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search maximizing accuracy
grid_search = GridSearchCV(estimator=rfc_diabetes, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_diabetes, y_train_diabetes)

# Best-performing model
best_model_diabetes = grid_search.best_estimator_
/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py:378: FitFailedWarning:
810 fits failed out of a total of 2025.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.
Below are more details about the failures:
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 476, in fit
trees = Parallel(
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1085, in __call__
if self.dispatch_one_batch(iterator):
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
self._dispatch(tasks)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 819, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 597, in __init__
self.results = batch()
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 288, in __call__
return [func(*args, **kwargs)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 288, in <listcomp>
return [func(*args, **kwargs)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/utils/fixes.py", line 117, in __call__
return self.function(*args, **kwargs)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 189, in _parallel_build_trees
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 969, in fit
super().fit(
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 390, in fit
criterion = CRITERIA_CLF[self.criterion](
KeyError: 'mse'
--------------------------------------------------------------------------------
405 fits failed with the following error:
Traceback (most recent call last):
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 476, in fit
trees = Parallel(
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1085, in __call__
if self.dispatch_one_batch(iterator):
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 901, in dispatch_one_batch
self._dispatch(tasks)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 819, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/_parallel_backends.py", line 597, in __init__
self.results = batch()
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 288, in __call__
return [func(*args, **kwargs)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 288, in <listcomp>
return [func(*args, **kwargs)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/utils/fixes.py", line 117, in __call__
return self.function(*args, **kwargs)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 189, in _parallel_build_trees
tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 969, in fit
super().fit(
File "/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 390, in fit
criterion = CRITERIA_CLF[self.criterion](
KeyError: 'mae'
warnings.warn(some_fits_failed_message, FitFailedWarning)
/home/mazzaroli/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_search.py:953: UserWarning: One or more of the test scores are non-finite: [ nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan nan nan nan
nan nan nan nan nan nan
nan n