# Import the main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context='notebook')
titanic = pd.read_csv('https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv', sep=',')
# Preview the DataFrame
titanic.head(10)
# Drop the columns we are not interested in
titanic.drop(['Name', 'Fare'], axis=1, inplace=True)
# Rename the columns Siblings/Spouses Aboard and Parents/Children Aboard to SibSp and ParCh
titanic.columns = ['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'ParCh']
titanic.head()
# Check the shape of the object
titanic.shape
# Inspect the data types
titanic.info()
# Encode Sex as a numeric dummy variable
titanic = pd.get_dummies(titanic, columns=['Sex'], drop_first=True)
titanic.dtypes
titanic.head()
# Rename the Sex_male column to Sex
titanic.rename(columns={'Sex_male': 'Sex'}, inplace=True)
# Reorder the columns by name
titanic = titanic[['Survived','Pclass','Sex','Age','SibSp','ParCh']]
titanic
# Check the proportion of the target variable
titanic.Survived.value_counts(normalize=True)
# Import the library to balance the data
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(random_state=42)
# Split into X and y
X_titanic = titanic.drop('Survived', axis=1)
y_titanic = titanic.Survived
# Balance the data via undersampling
X_under_titanic, y_under_titanic = undersample.fit_resample(X_titanic, y_titanic)
y_under_titanic.value_counts(normalize=True)
# Import the function to split the dataset
from sklearn.model_selection import train_test_split
# 30% for test, 70% for train
X_train_titanic, X_test_titanic, y_train_titanic, y_test_titanic = train_test_split(X_under_titanic, y_under_titanic, test_size=0.30, random_state=42)
# Import the libraries to build the model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Define the classifier and the hyperparameter values to try
clf = DecisionTreeClassifier(random_state=42)
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5]}
# Run the hyperparameter search with GridSearchCV
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10, return_train_score=True)
grid_search.fit(X_train_titanic, y_train_titanic)
# Print the results
print("Best hyperparameters found:")
print(grid_search.best_params_)
print("Best cross-validation score:")
print(grid_search.best_score_)
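# Optional sketch: with return_train_score=True, the full grid can be inspected via
# cv_results_ to compare train vs. validation scores and spot overfitting.
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results[['params', 'mean_train_score', 'mean_test_score']].sort_values('mean_test_score', ascending=False).head()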
# Decision tree model with optimized parameters
best_clf = grid_search.best_estimator_
# Predict y
y_train_pred_titanic = best_clf.predict(X_train_titanic)
y_test_pred_titanic = best_clf.predict(X_test_titanic)
# Plot the confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test_titanic, y_test_pred_titanic, labels=best_clf.classes_)
ConfusionMatrixDisplay(cm, display_labels=best_clf.classes_).plot()
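# Optional sketch: a classification report complements the confusion matrix with
# per-class precision, recall and F1.
from sklearn.metrics import classification_report
print(classification_report(y_test_titanic, y_test_pred_titanic))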
from sklearn.metrics import accuracy_score
print('Accuracy on train:', accuracy_score(y_train_titanic, y_train_pred_titanic))
print('Accuracy on test:', accuracy_score(y_test_titanic, y_test_pred_titanic))
feature_scores_titanic = pd.DataFrame(
    pd.Series(best_clf.feature_importances_, index=X_train_titanic.columns)
    .sort_values(ascending=False)
).T
plt.figure(figsize=(12,4))
sns.barplot(data=feature_scores_titanic)
for index, value in enumerate(feature_scores_titanic.values.flatten()):
    plt.annotate(f'{value:.2f}', xy=(index, value), ha='center', va='bottom')
plt.title("Factores clave en la predicción de la supervivencia en el Titanic")
plt.show()
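# Optional sketch: visualize the fitted Titanic tree, mirroring what is done later
# for the car model (class_names assumes 0 = died, 1 = survived).
from sklearn import tree
plt.figure(figsize=(12, 8))
tree.plot_tree(best_clf, feature_names=list(X_train_titanic.columns), class_names=['Died', 'Survived'], filled=True)
plt.show()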
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
df_car = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data', header=None)
columns_names = ['price','maint','doors','persons','lug_boot','safety','Class']
df_car.columns = columns_names
# Preview the DataFrame
df_car.head()
# Shape of the DataFrame
df_car.shape
# Data types
df_car.info()
# Check for missing values
df_car.isnull().sum()
# Drop duplicate rows (drop_duplicates is not in-place, so reassign the result)
df_car = df_car.drop_duplicates()
# Function: count and proportion of the values of a column
def dist(df, target):
    count = df[target].value_counts(normalize=False)
    prop = df[target].value_counts(normalize=True)
    dist = pd.DataFrame({'Freq[N]': count, 'Prop[%]': (prop * 100).round(2)})
    return dist
# Count and proportion for every column
for i in columns_names:
    print(' ' * 7, i.upper())
    print(dist(df_car, i))
    print("*" * 23)
# Plot the Class variable broken down by each attribute
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 8))
for i, variable in enumerate(columns_names[:-1]):
    row = i % 2
    col = i // 2
    sns.countplot(data=df_car, x='Class', hue=variable, ax=axes[row][col])
    axes[row][col].set_title(f"Evaluation Classes by {variable} Category")
plt.tight_layout()
plt.show()
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(random_state=42)
# Split the data into X and y
X_car = df_car.drop('Class',axis=1)
y_car = df_car.Class
# Apply the undersampling
X_car, y_car = undersample.fit_resample(X_car,y_car)
#!pip install category_encoders
import category_encoders as ce
encoder = ce.OrdinalEncoder(cols=columns_names[:-1])
X_car = encoder.fit_transform(X_car)
X_car.head()
X_car.dtypes
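# Optional sketch: OrdinalEncoder assigns integers by order of appearance, not by any
# semantic order; the fitted mapping can be inspected (recent category_encoders
# versions expose it through the `mapping` attribute).
for m in encoder.mapping:
    print(m['col'], dict(m['mapping']))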
from sklearn.model_selection import train_test_split
X_train_car, X_test_car, y_train_car, y_test_car = train_test_split(X_car,y_car,test_size=0.3, random_state=42)
print('X:',X_train_car.shape, X_test_car.shape)
print('y:',y_train_car.shape, y_test_car.shape)
from sklearn.tree import DecisionTreeClassifier
# Instantiate the model
tree_car = DecisionTreeClassifier(random_state=42)
from sklearn.model_selection import GridSearchCV
# Decision tree hyperparameter grid
param_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [2, 3, 4, 5]}
# Run the hyperparameter search with GridSearchCV
grid_search = GridSearchCV(tree_car, param_grid=param_grid, cv=10, return_train_score=True)
grid_search.fit(X_train_car, y_train_car)
# Print the results
print("Best hyperparameters found:")
print(grid_search.best_params_)
print("Best cross-validation score:")
print(grid_search.best_score_)
# Decision tree model with optimized parameters
best_tree_car = grid_search.best_estimator_
# Predict y
y_train_pred_tree_car = best_tree_car.predict(X_train_car)
y_test_pred_tree_car = best_tree_car.predict(X_test_car)
from sklearn.metrics import accuracy_score, classification_report
# Accuracy on train (note: the original code swapped the train and test variables)
train_acc = accuracy_score(y_true=y_train_car, y_pred=y_train_pred_tree_car)
# Accuracy on test
test_acc = accuracy_score(y_true=y_test_car, y_pred=y_test_pred_tree_car)
print("Accuracy on train:", train_acc)
print("Accuracy on test:", test_acc)
print(classification_report(y_test_car,y_test_pred_tree_car))
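# Optional sketch: the same confusion-matrix view used for the Titanic model,
# applied to the car classifier.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm_car = confusion_matrix(y_test_car, y_test_pred_tree_car, labels=best_tree_car.classes_)
ConfusionMatrixDisplay(cm_car, display_labels=best_tree_car.classes_).plot()
plt.show()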
feature_scores_car = pd.DataFrame(
    pd.Series(best_tree_car.feature_importances_, index=X_train_car.columns)
    .sort_values(ascending=False)
).T
plt.figure(figsize=(12,6))
sns.barplot(data=feature_scores_car)
for index, value in enumerate(feature_scores_car.values.flatten()):
    plt.annotate(f'{value:.2f}', xy=(index, value), ha='center', va='bottom')
plt.title("Factores clave en la predicción de la calidad de un automovil")
plt.show()
feature_scores_car.T
from sklearn import tree
plt.figure(figsize=(12, 8))
# best_tree_car is already fitted, so there is no need to refit it here
tree.plot_tree(best_tree_car, feature_names=list(X_train_car.columns), class_names=list(best_tree_car.classes_), filled=True);
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
df_diabetes = pd.read_csv('pima-indians-diabetes.csv', header=None, sep=',')
df_columns = np.array(['preg','plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'Class'])
df_diabetes.columns = df_columns
# Preview the DataFrame
df_diabetes.head()
# Shape of the DataFrame
df_diabetes.shape
# Data types
df_diabetes.info()
# Drop duplicate rows (this dataset has none, so nothing is removed)
df_diabetes.drop_duplicates(inplace=True)
df_diabetes
import missingno as miss
miss.matrix(df_diabetes);
# Columns plas through mass use 0 to encode missing values; count them and their percentage
(df_diabetes.iloc[:, 1:6]
 .replace(to_replace=[0], value=np.nan)
 .isna().sum()
 .reset_index(name='missing_values')
 .rename(columns={"index": "variable"})
 .assign(percentage=lambda df_reset: df_reset.missing_values / len(df_diabetes) * 100))
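# Optional sketch (not applied below): the zero-coded gaps could be imputed with the
# column median before modelling; `cols_with_zeros` and `df_imputed` are names assumed here.
cols_with_zeros = ['plas', 'pres', 'skin', 'test', 'mass']
df_imputed = df_diabetes.copy()
df_imputed[cols_with_zeros] = df_imputed[cols_with_zeros].replace(0, np.nan)
df_imputed[cols_with_zeros] = df_imputed[cols_with_zeros].fillna(df_imputed[cols_with_zeros].median())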
plt.figure(figsize=(7,7))
labels, counts = np.unique(df_diabetes.Class, return_counts=True)
plt.pie(counts, autopct='%1.1f%%',labels=labels)
plt.legend(['Diabetes negative (0)', 'Diabetes positive (1)'])
plt.title('Proportion of diabetes')
plt.show()
print(df_diabetes.Class.value_counts())
# Split into X and y
X_diabetes = df_diabetes.drop('Class',axis=1)
y_diabetes = df_diabetes.Class
# Import the function for the train/test split
from sklearn.model_selection import train_test_split
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(X_diabetes,y_diabetes, test_size=.20, random_state=42)
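# Optional check: the classes are imbalanced (roughly 65/35), so it may be worth
# passing stratify=y_diabetes to train_test_split to preserve the proportions.
print(y_train_diabetes.value_counts(normalize=True))
print(y_test_diabetes.value_counts(normalize=True))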
X_train_diabetes.shape,y_train_diabetes.shape
# Random forest
from sklearn.ensemble import RandomForestClassifier
# Instantiate the model
rfc_diabetes = RandomForestClassifier(random_state=42)
# Hyperparameter optimization
from sklearn.model_selection import GridSearchCV
# Define the hyperparameters and their candidate values
param_grid = {
    'n_estimators': [10, 25, 50],
    'max_depth': [5, 10, 15],
    # 'mse' and 'mae' are regression criteria and would make this grid search fail;
    # 'log_loss' requires scikit-learn >= 1.1
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rfc_diabetes, param_grid=param_grid, cv=5, scoring='accuracy')
# Fit the model with GridSearchCV
grid_search.fit(X_train_diabetes, y_train_diabetes)
# Get the best-performing model
best_model_diabetes = grid_search.best_estimator_
# Best model parameters
grid_search.best_params_
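# For symmetry with the earlier sections, also report the best cross-validation score
print("Best cross-validation score:")
print(grid_search.best_score_)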
from sklearn.metrics import accuracy_score, classification_report
y_train_pred_diabetes = best_model_diabetes.predict(X_train_diabetes)
y_test_pred_diabetes = best_model_diabetes.predict(X_test_diabetes)
accuracy_train_diabetes = accuracy_score(y_train_diabetes,y_train_pred_diabetes)
accuracy_test_diabetes = accuracy_score(y_test_diabetes,y_test_pred_diabetes)
print('Accuracy on train:', accuracy_train_diabetes)
print('Accuracy on test:', accuracy_test_diabetes)
print(classification_report(y_test_diabetes, y_test_pred_diabetes))
feature_scores_diabetes = pd.DataFrame(
    pd.Series(best_model_diabetes.feature_importances_, index=X_train_diabetes.columns)
    .sort_values(ascending=False)
).T
plt.figure(figsize=(12,6))
sns.barplot(data=feature_scores_diabetes)
for index, value in enumerate(feature_scores_diabetes.values.flatten()):
    plt.annotate(f'{value:.2f}', xy=(index, value), ha='center', va='bottom')
plt.title("Factores clave en la predicción de diabetes positivo en pima indians")
plt.show()
feature_scores_diabetes.T
from sklearn.model_selection import cross_val_score, KFold
k = 5
cv = KFold(n_splits=k, shuffle=True, random_state=42)
scores = cross_val_score(best_model_diabetes, X_train_diabetes, y_train_diabetes, cv=cv, scoring='accuracy')
print("")
print("Accuracy scores for each fold:", scores*100)
print("Mean accuracy: ", scores.mean()*100)
print("Standard deviation: ", scores.std()*100)
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn import tree
plt.figure(figsize=(30, 30))
# Get one tree from the Random Forest
tree_index = 0  # index of the desired tree
single_tree = best_model_diabetes.estimators_[tree_index]
# Visualize the tree with plot_tree
tree.plot_tree(single_tree, feature_names=X_train_diabetes.columns, filled=True)
plt.show()
# Reuse the encoded, balanced car data for a Random Forest
X_car
y_car
# Default split: 75% train / 25% test
X_train_car, X_test_car, y_train_car, y_test_car = train_test_split(X_car, y_car, random_state=42)
rfc_car = RandomForestClassifier(n_estimators=5,random_state=42)
rfc_car.fit(X_train_car,y_train_car)
y_pred_test_car = rfc_car.predict(X_test_car)
y_pred_train_car = rfc_car.predict(X_train_car)
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
print('Accuracy on train:', accuracy_score(y_train_car, y_pred_train_car))
print('Accuracy on test:', accuracy_score(y_test_car, y_pred_test_car))
print(classification_report(y_test_car,y_pred_test_car))
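# Optional sketch: use the confusion_matrix import above to inspect the Random
# Forest's per-class errors.
cm_rfc = confusion_matrix(y_test_car, y_pred_test_car, labels=rfc_car.classes_)
print(pd.DataFrame(cm_rfc, index=rfc_car.classes_, columns=rfc_car.classes_))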
feature_scores_car = pd.DataFrame(
    pd.Series(rfc_car.feature_importances_, index=X_train_car.columns)
    .sort_values(ascending=False)
).T
plt.figure(figsize=(12,6))
sns.barplot(data=feature_scores_car)
for index, value in enumerate(feature_scores_car.values.flatten()):
    plt.annotate(f'{value:.2f}', xy=(index, value), ha='center', va='bottom')
plt.title("Factores clave en la predicción de la calidad de un automovil")
plt.show()
feature_scores_car.T