import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, classification_report,
roc_auc_score, roc_curve, accuracy_score)
# (notebook export placeholder) Run to view results
# Load the training CSV into `df` and preview its first five rows
# (the bare expression is only displayed when run as a notebook cell).
df = pd.read_csv(filepath_or_buffer='train.csv')
df.head(n=5)
# (notebook export placeholder) Run to view results
# Work on a copy so the raw training frame `df` is left untouched.
df2 = df.copy()

# Columns removed before modelling. `errors='ignore'` skips any column that
# is not present, which replaces the manual "if c in df2.columns" loop and
# matches how the test set is cleaned further down in this file.
cols_drop = ['Ticket','Name','Cabin']
df2 = df2.drop(columns=cols_drop, errors='ignore')

# Derived features: family size counts siblings/spouses + parents/children
# + the passenger; IsAlone is 1 when that size is exactly 1.
df2['FamilySize'] = df2['SibSp'] + df2['Parch'] + 1
# NOTE(review): IsAlone is created here but does not appear in the feature
# lists passed to the ColumnTransformer below, so it looks unused -- confirm.
df2['IsAlone'] = (df2['FamilySize'] == 1).astype(int)

# Cast Pclass to string so it is treated as a categorical label rather than
# as an ordinal number.
df2['Pclass'] = df2['Pclass'].astype(str)
df2.head()
# (notebook export placeholder) Run to view results
# Separate the label column from the feature frame.
target = 'Survived'
X = df2.drop(columns=[target])
y = df2[target].values

# Columns
num_features = ['Age','Fare','SibSp','Parch','FamilySize']
cat_features = ['Sex','Embarked','Pclass']

# Pipelines
# Numeric columns: fill missing values with the median, then standardise.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output`, and 1.4 removed the old name (passing it raises
# TypeError). Prefer the new keyword and fall back only on older releases.
try:
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:  # scikit-learn < 1.2
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Categorical columns: fill missing values with the mode, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', onehot_encoder)
])

# Only the columns listed in num_features / cat_features are routed through
# a pipeline.
preprocessor = ColumnTransformer([
    ('num', num_pipe, num_features),
    ('cat', cat_pipe, cat_features)
])
# (notebook export placeholder) Run to view results
SEED = 42

# Seed NumPy and TensorFlow too: previously SEED only reached
# train_test_split, so weight initialisation and batch shuffling during
# training differed from run to run.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Split the data (stratified, so both parts keep the class ratio of `y`).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y)

# Fit the preprocessor on the training part only, then apply the same fitted
# transform to the validation part.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# Store both matrices as float32 before handing them to Keras.
X_train_proc = X_train_proc.astype(np.float32)
X_val_proc = X_val_proc.astype(np.float32)
X_train_proc.shape, X_val_proc.shape
# (notebook export placeholder) Run to view results
def build_model(input_dim,
                n_hidden_layers=2,
                n_neurons=16,
                activation='relu',
                learning_rate=0.001,
                dropout_rate=0.0):
    """Build and compile a fully connected binary classifier.

    Args:
        input_dim: Number of input features per sample.
        n_hidden_layers: How many Dense hidden layers to stack.
        n_neurons: Units in each hidden layer.
        activation: Activation used by every hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.
        dropout_rate: When truthy and greater than 0, a Dropout layer with
            this rate follows each hidden layer.

    Returns:
        A compiled ``keras.Model`` with a single sigmoid output, trained with
        binary cross-entropy and reporting accuracy plus an AUC metric
        named ``'auc'``.
    """
    # Decide once whether dropout is active; None and 0 both disable it.
    apply_dropout = bool(dropout_rate) and dropout_rate > 0

    net_input = keras.Input(shape=(input_dim,))
    hidden = net_input
    for _ in range(n_hidden_layers):
        hidden = keras.layers.Dense(n_neurons, activation=activation)(hidden)
        if apply_dropout:
            hidden = keras.layers.Dropout(dropout_rate)(hidden)
    net_output = keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = keras.Model(net_input, net_output)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
    )
    return model
# (notebook export placeholder) Run to view results
# Build the baseline network; its input width is the number of columns
# produced by the preprocessor.
input_dim = X_train_proc.shape[1]
model = build_model(
    input_dim=input_dim,
    n_hidden_layers=2,
    n_neurons=16,
    activation='relu',
    learning_rate=0.001,
    dropout_rate=0.2,
)

# Training configuration.
epochs = 60
batch_size = 32

# Stop once validation AUC has not improved for 6 epochs and roll the
# weights back to the best epoch seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=6,
    restore_best_weights=True,
)
callbacks = [early_stopping]

# Train the model, evaluating on the validation split after every epoch.
history = model.fit(
    X_train_proc,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val_proc, y_val),
    callbacks=callbacks,
    verbose=2,
)
# (notebook export placeholder) Run to view results
# Plot training and validation loss per epoch from the fit history.
plt.figure(figsize=(8,5))
for history_key, legend_label in (('loss', 'train_loss'), ('val_loss', 'val_loss')):
    plt.plot(history.history[history_key], label=legend_label)
plt.title('Loss por época')
plt.xlabel('Época')
plt.ylabel('Loss (binary_crossentropy)')
plt.grid(True)
plt.legend()
plt.show()
# (notebook export placeholder) Run to view results
# Predicted probabilities on the validation split, flattened to 1-D, and
# hard labels obtained with a 0.5 threshold.
y_val_proba = model.predict(X_val_proc).ravel()
y_val_pred = (y_val_proba >= 0.5).astype(int)

# Compute the AUC once; it is printed here and reused in the ROC legend.
val_roc_auc = roc_auc_score(y_val, y_val_proba)
print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("ROC AUC:", val_roc_auc)
print("\nClassification report:\n", classification_report(y_val, y_val_pred))

# Confusion matrix heatmap.
cm = confusion_matrix(y_val, y_val_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de confusión')
plt.xlabel('Predicho')
plt.ylabel('Real')
plt.show()

# ROC curve with the chance diagonal for reference.
fpr, tpr, _ = roc_curve(y_val, y_val_proba)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f'AUC = {val_roc_auc:.3f}')
plt.plot([0,1],[0,1],'k--')
plt.title('ROC Curve')
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.grid(True)
plt.legend()
plt.show()
# (notebook export placeholder) Run to view results
from itertools import product

# Hyper-parameter grid: every combination of these values is trained once.
neuron_list = [16, 32]
layers_list = [1, 2]
activation_list = ['relu','tanh']
lr_list = [1e-3, 1e-4]
batch_list = [32]

results = []
search_space = product(neuron_list, layers_list, activation_list, lr_list, batch_list)
for neurons, layers, act, lr, batch in search_space:
    print(f"Entrenando: neurons={neurons}, layers={layers}, act={act}, lr={lr}, batch={batch}")

    # dropout_rate is not passed, so candidates use build_model's default.
    candidate = build_model(
        input_dim,
        n_hidden_layers=layers,
        n_neurons=neurons,
        activation=act,
        learning_rate=lr,
    )
    stopper = keras.callbacks.EarlyStopping(
        monitor='val_auc', mode='max', patience=5, restore_best_weights=True)
    fit_log = candidate.fit(
        X_train_proc, y_train,
        validation_data=(X_val_proc, y_val),
        epochs=30,
        batch_size=batch,
        callbacks=[stopper],
        verbose=0,
    )

    # Best value of each curve, taken independently over the epochs run.
    curves = fit_log.history
    best_val_auc = max(curves['val_auc'])
    best_val_loss = min(curves['val_loss'])
    results.append({
        'neurons': neurons,
        'layers': layers,
        'activation': act,
        'lr': lr,
        'batch': batch,
        'val_auc': best_val_auc,
        'val_loss': best_val_loss,
    })

# Rank configurations by validation AUC, best first.
res_df = pd.DataFrame(results).sort_values(by='val_auc', ascending=False)
res_df.head(10)
# (notebook export placeholder) Run to view results
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report,
    roc_auc_score,
    roc_curve,
)
import seaborn as sns
import matplotlib.pyplot as plt

# Predictions
# NOTE(review): this scores `model` (the baseline trained earlier); the
# grid-search candidates are never assigned to `model` -- confirm intended.
y_val_proba = model.predict(X_val_proc).ravel()
y_val_pred = (y_val_proba >= 0.5).astype(int)

# Numeric metrics: threshold-based scores use the hard labels, while ROC AUC
# uses the raw probabilities.
label_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
)
for metric_label, metric_fn in label_metrics:
    print(metric_label, metric_fn(y_val, y_val_pred))
val_roc_auc = roc_auc_score(y_val, y_val_proba)
print("ROC AUC:", val_roc_auc)
print("\nClassification report:\n", classification_report(y_val, y_val_pred))

# Confusion matrix
cm = confusion_matrix(y_val, y_val_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.title("Matriz de confusión")
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.show()

# ROC curve
fpr, tpr, _ = roc_curve(y_val, y_val_proba)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, label=f"AUC = {val_roc_auc:.3f}")
plt.plot([0,1],[0,1],'k--')
plt.title("ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.grid(True)
plt.legend()
plt.show()
# (notebook export placeholder) Run to view results
# Preview the test CSV.
# NOTE(review): this rebinds `df`, which previously held the training data;
# re-running the earlier training cells after this one would pick up the
# test frame instead.
df = pd.read_csv(filepath_or_buffer='test.csv')
df.head(n=5)
# (notebook export placeholder) Run to view results
# Read the test file into `test` and keep its id column aside; the feature
# frame derived from `test` later drops/ignores it.
test = pd.read_csv(filepath_or_buffer="test.csv")
test_id = test.loc[:, "PassengerId"]
# (notebook export placeholder) Run to view results
# Apply the same cleaning and feature engineering that was used for the
# training frame, on a copy of the raw test data.
test2 = test.copy()
test2 = test2.drop(columns=["Name","Ticket","Cabin"], errors="ignore")
test2["FamilySize"] = test2["SibSp"] + test2["Parch"] + 1
test2["IsAlone"] = (test2["FamilySize"] == 1).astype(int)
test2["Pclass"] = test2["Pclass"].astype(str)

# 3. Apply the preprocessor fitted on train (transform only, never refit on
#    test), and cast to float32 so the matrix has the same dtype as the
#    train/validation matrices the model was fitted on.
test_proc = preprocessor.transform(test2).astype(np.float32)

# 4. Predictions: probabilities, then hard labels with a 0.5 threshold.
test_proba = model.predict(test_proc).ravel()
test_pred = (test_proba >= 0.5).astype(int)

# 5. Build the submission table and write it without the index column.
submission = pd.DataFrame({
    "PassengerId": test_id,
    "Survived": test_pred
})
submission.to_csv("submission.csv", index=False)
print("Archivo submission.csv creado")
submission.head()
# (notebook export placeholder) Run to view results