import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
%matplotlib inline
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

# Load both Titanic splits; the paths are relative to the working directory.
train_df = pd.read_csv('titanic-train.csv')
test_df = pd.read_csv('titanic-test.csv')

# Preview the first rows of the training data.
train_df.head()
PassengerIdint64
Survivedint64
0
1
0
1
2
1
2
3
1
3
4
1
4
5
0
# Print a summary of the training frame: per-column dtype, non-null count and
# memory usage. In the recorded run, Age, Cabin and Embarked have fewer
# non-null entries than the 891 rows, i.e. they contain missing values.
train_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Bar chart of the number of passengers per value of the 'Sex' column.
# The title previously read 'Distribución de sobrevivientes' ("distribution of
# survivors"), but the 'Survived' column is not part of this plot; the title
# now describes what is actually drawn.
# NOTE(review): if the intent was to plot survivors instead, change the
# plotted column rather than the title.
train_df['Sex'].value_counts().plot(kind='bar', color=['b', 'r'])
plt.title('Distribución de pasajeros por sexo')
plt.show()
from sklearn import preprocessing
# Fit a LabelEncoder on the 'Sex' column and obtain its integer-coded values.
label_encoder = preprocessing.LabelEncoder()
# NOTE(review): the encoded array is stored only in `encoder_sex`; it is never
# assigned back to train_df, and the visible code later one-hot encodes 'Sex'
# with pd.get_dummies instead. Confirm whether this encoding is still needed.
encoder_sex = label_encoder.fit_transform(train_df['Sex'])
# train_df is not modified above, so this preview shows the original columns.
train_df.head()
PassengerIdint64
Survivedint64
0
1
0
1
2
1
2
3
1
3
4
1
4
5
0
# Impute the two predictor columns that contain missing values.
# Age: replace gaps with the median of the observed ages.
age_median = train_df['Age'].median()
train_df['Age'] = train_df['Age'].fillna(age_median)

# Embarked: replace gaps with the fixed port code 'S'.
embarked_fill = 'S'
train_df['Embarked'] = train_df['Embarked'].fillna(embarked_fill)
# Build the predictor frame: remove the identifier, the target and the
# free-text columns that are not used as features.
train_predictors = train_df.drop(
    columns=['PassengerId', 'Survived', 'Name', 'Ticket', 'Cabin']
)

# Low-cardinality text columns (fewer than 10 distinct values) are kept as
# categorical features. The dtype is checked first so nunique() is only
# computed for object columns; the selected columns are the same.
categorical_cols = [
    cname
    for cname in train_predictors.columns
    if train_predictors.dtypes[cname] == 'object'
    and train_predictors[cname].nunique() < 10
]

# Integer and float columns are kept as numerical features.
numerical_cols = [
    cname
    for cname in train_predictors.columns
    if train_predictors.dtypes[cname] in ('int64', 'float64')
]

# Keep only the selected columns, categorical first, then numerical.
my_cols = [*categorical_cols, *numerical_cols]
train_predictors = train_predictors[my_cols]

# One-hot encode the categorical columns; numerical columns pass through.
dummy_encoded_train_predictors = pd.get_dummies(train_predictors)

# Class balance of the passenger class column (displayed only when this is the
# last expression of a notebook cell).
train_df['Pclass'].value_counts()
# Target vector and feature matrix as plain arrays for scikit-learn.
y_target = train_df['Survived'].values
x_features_one = dummy_encoded_train_predictors.values

# Hold out 25% of the rows for validation; the seed keeps the split repeatable.
X_train, X_validation, Y_train, Y_validation = train_test_split(
    x_features_one, y_target, test_size=0.25, random_state=1
)

# Fit on the training split only. Previously the split was computed but
# ignored: the tree was fitted and scored on the full data set, so the
# reported accuracy measured memorisation of the training rows rather than
# generalisation. The tree is seeded as well, because scikit-learn permutes
# features randomly at each split and results can otherwise differ per run.
tree_one = tree.DecisionTreeClassifier(random_state=1)
tree_one = tree_one.fit(X_train, Y_train)

# Score on rows the model has seen (diagnostic for overfitting) and on the
# held-out rows (the figure to report). Scores recorded from earlier runs that
# evaluated on the training rows are not comparable with the value below.
tree_one_train_accuracy = round(tree_one.score(X_train, Y_train), 4)
tree_one_accuracy = round(tree_one.score(X_validation, Y_validation), 4)
print('Training accuracy: %0.4f' % (tree_one_train_accuracy))
print('Accuracy: %0.4f'%(tree_one_accuracy))
Accuracy: 0.9798
from io import StringIO
from IPython.display import Image, display
import pydotplus

# Export the fitted tree to Graphviz DOT text held in an in-memory buffer.
# feature_names labels each split with the encoded column name instead of the
# positional placeholder X[i], which is not interpretable after one-hot
# encoding. The names come from the frame whose .values were used to fit the
# tree, so their order matches the feature matrix columns.
out=StringIO()
tree.export_graphviz(
    tree_one,
    out_file=out,
    feature_names=list(dummy_encoded_train_predictors.columns),
)

# Render the DOT text and write the image next to the notebook.
graph = pydotplus.graph_from_dot_data(out.getvalue())
graph.write_png('titanic.png')