Titanic Survival Prediction with Decision Tree
1. Exploring and pre-processing dataset
# Importing modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import tree
# Loading datasets
# Read the training and test passenger tables from CSV files in the working directory
train = pd.read_csv('titanic-train.csv')
test = pd.read_csv('titanic-test.csv')
# Print a summary of the training frame (columns, dtypes, non-null counts)
train.info()
# Optional plot: bar chart of the number of passengers per sex
# train['Sex'].value_counts().plot(kind = 'bar', color = ['g', 'y'])
# A label encoder assigns integer values to an array of strings.
# For example, the array ['a','b','b','a','c'] would be encoded
# as the integer array [0,1,1,0,2]
from sklearn import preprocessing
# Module-level LabelEncoder instance
encoder = preprocessing.LabelEncoder()
# Usage: encoder.fit_transform(array)
def prepro(data, isTest):
    """Clean a raw Titanic passenger frame so it can be fed to a model.

    Steps, in order: drop the identifier-like columns (PassengerId, Name,
    Ticket, Cabin), fill missing Age and Fare with that frame's own median,
    fill missing Embarked with 'S', and encode Sex as an integer.

    Sex is encoded with a fixed table (female -> 0, male -> 1) instead of
    re-fitting a label encoder on every call. Re-fitting derives the codes
    from whatever labels are present, so a frame containing only 'male'
    would encode male as 0 and disagree with every other frame. Labels that
    are not in the table become NaN.

    Args:
        data: pandas DataFrame holding the raw Titanic columns. It is not
            modified; a new frame is returned.
        isTest: Currently ignored. Kept so existing calls keep working.

    Returns:
        The pre-processed DataFrame. If an expected column is missing, a
        message and the KeyError are printed and the frame is returned in
        the state it had when the error occurred (best effort, so it may be
        only partly processed).
    """
    # Fixed codes keep the encoding identical across train and test frames
    sex_codes = {'female': 0, 'male': 1}
    try:
        # Remove columns that carry no usable signal for the model
        data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
        # Filling empty numerical values with the median of this frame
        data['Age'] = data['Age'].fillna(data['Age'].median())
        data['Fare'] = data['Fare'].fillna(data['Fare'].median())
        # Filling empty embarked with S
        data['Embarked'] = data['Embarked'].fillna('S')
        # Replacing categorical sex with integer values (0 for F and 1 for M)
        data['Sex'] = data['Sex'].map(sex_codes)
    except KeyError as e:
        print('Invalid preprocess')
        print(e)
    # Returning preprocessed data
    return data
# Pre-process both frames; the second argument is currently ignored by prepro
train = prepro(train, True)
train.head()
test = prepro(test, False)
test.head()
# Extracting categorical columns: object-typed columns with fewer than 10 distinct values
categorical_cols = [col for col in train.columns if
train[col].nunique() < 10 and
train[col].dtype == 'object'
]
# Extracting numerical columns: those stored as int64 or float64
numerical_cols = [col for col in train.columns if train[col].dtype in ['int64', 'float64']]
print(categorical_cols)
print(numerical_cols)
# One-hot encode the remaining non-numeric columns of the training frame.
# NOTE(review): `test` is not passed through get_dummies in the visible code;
# confirm that is intended before using it for prediction.
train = pd.get_dummies(train)
2. Constructing ML Model
# Separate the encoded training frame into the feature matrix and the target vector
x = train.drop(['Survived'], axis=1).values
y = train['Survived'].values
# Hold out 25% of the rows for validation; the split is seeded so it is repeatable
x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=0.25, random_state=0)
# Seed the tree as well: an unseeded tree can differ between runs even on an
# identical split, which would make the reported accuracy non-reproducible
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier.fit(x_train, y_train)
3. Accuracy
# Score the fitted tree on the held-out validation split (mean accuracy)
classifier.score(x_validation, y_validation)
4. Visualizing Tree
# NOTE(review): this bare `pip install` line is a notebook/shell command, not
# Python; it will not parse if this file is run as a plain script.
pip install pydotplus
# Generate an image of the tree
from io import StringIO
# NOTE(review): Image and display are imported but not used in the visible code
from IPython.display import Image, display
import pydotplus
# In-memory text buffer that receives the Graphviz DOT description of the tree
out = StringIO()
tree.export_graphviz(classifier, out_file=out)
# Parse the DOT text into a pydotplus graph and write it to a PNG file in the
# working directory (presumably needs the Graphviz binaries installed; confirm)
img = pydotplus.graph_from_dot_data(out.getvalue())
img.write_png('titanic.png')