import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pydotplus
from io import StringIO
from IPython.display import Image, display
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
1. Análisis y limpieza de los datos
# Load the Titanic CSV into a DataFrame and take a first look at it.
df = pd.read_csv(filepath_or_buffer='./Datasets/Titanic.csv')
df.sample(n=10)  # ten random rows; only rendered when run interactively
df.info()  # prints column names, non-null counts and dtypes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Impute missing ages with the median of the known ages.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(value=age_median)
df[df['Age'].isna()]  # sanity check: this selection should now be empty

# Inspect 'Embarked', then fill its missing entries with the value 'S'.
df['Embarked'].describe()
df['Embarked'] = df['Embarked'].fillna(value='S')
df[df['Embarked'].isna()]  # sanity check: no null 'Embarked' rows should remain
# Text columns with fewer than 10 distinct values are treated as categorical.
categorical_cols = [
    cname
    for cname in df.columns
    if df[cname].dtype == 'object' and df[cname].nunique() < 10
]
# Every int64/float64 column is treated as numerical.
numerical_cols = [
    cname
    for cname in df.columns
    if df[cname].dtype in ('int64', 'float64')
]
my_correct_cols = categorical_cols + numerical_cols

# Keep only the selected columns and inspect the result.
train_predictors = df[my_correct_cols]
train_predictors.shape
train_predictors.sample(n=5)

# Drop the row identifier and the target so that only predictors remain.
train_predictors = train_predictors.drop(columns=['PassengerId', 'Survived'])
train_predictors.head()

# One-hot encode the categorical predictors.
dummy_encoded_train_predictors = pd.get_dummies(train_predictors)
dummy_encoded_train_predictors.head()
# Target vector and feature matrix as plain NumPy arrays.
y_target = df['Survived'].values
x_features = dummy_encoded_train_predictors.values

# Hold out 30% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_features, y_target, test_size=0.3, random_state=1
)

# Seed the tree as well: scikit-learn permutes the candidate features at every
# split, so an unseeded tree can produce a different model (and a different
# accuracy) on each run even though the train/test split is fixed.
model = DecisionTreeClassifier(random_state=1)
model = model.fit(X_train, y_train)

# Mean accuracy on the held-out rows, rounded to three decimals.
model_accuracy = round(model.score(X_test, y_test), 3)
print('Acurracy:%0.3f'%(model_accuracy))
Acurracy:0.757
# Render the fitted tree: export it as Graphviz DOT text into an in-memory
# buffer, then let pydotplus turn that into a PNG written to the working directory.
out = StringIO()
export_graphviz(model, out_file=out)
graph = pydotplus.graph_from_dot_data(out.getvalue())
graph.write_png('titanic.png')

# Predict the held-out rows and report the root-mean-squared error.
predicciones = model.predict(X=X_test)
# The `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root of the MSE
# explicitly yields the same value on every scikit-learn version.
rms = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predicciones))
print('Error:%0.4f'%(rms))
Error:0.4925
# Confusion matrix of the held-out predictions (rows: true labels, columns: predicted).
cnf_matriz = confusion_matrix(y_true=y_test, y_pred=predicciones)
cnf_matriz

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair.
class_names = [0, 1]
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
ax.set_xticks(tick_marks)
ax.set_xticklabels(class_names)
ax.set_yticks(tick_marks)
ax.set_yticklabels(class_names)
sns.heatmap(
    pd.DataFrame(cnf_matriz),
    annot=True,
    cmap='Blues_r',
    fmt='g',
    ax=ax,
)

# Put the x-axis label on top and lift the title (y=1.1) so the two do not collide.
ax.xaxis.set_label_position('top')
fig.tight_layout()
ax.set_title('Matriz de confusion', y=1.1)
ax.set_ylabel('Etiqueta actual')
ax.set_xlabel('Etiqueta de predicción')