import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re  # regular expressions, used later to extract titles from passenger names
train_df = pd.read_csv("../data/train.csv")
test_df = pd.read_csv("../data/test.csv")
train_df.shape
cols = train_df.columns
cols
train_df.describe()
train_df.info()
train_df.isnull().sum()
test_df.isnull().sum()
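# Age, Cabin and Embarked are missing in train; Age, Cabin and one Fare value are missing in test.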
ax = plt.axes()
sns.heatmap(train_df.isnull(), cmap='viridis', cbar=False, ax=ax)
ax.set_title('Null values heatmap')
plt.show()
def clean_cabins(df):
    # 1 if the passenger has a cabin number, 0 otherwise
    return np.where(df["Cabin"].isnull(), 0, 1)
train_df["has_cabin"] = clean_cabins(train_df)
test_df["has_cabin"] = clean_cabins(test_df)
def clean_embarked(df):
    # Fill the missing ports with "S", the most common value
    return df["Embarked"].fillna("S")
train_df["Embarked"] = clean_embarked(train_df)
test_df["Embarked"] = clean_embarked(test_df)
def clean_age(df):
    # Fill each missing age with its own random value drawn from [mean - std, mean + std);
    # fillna(np.random.randint(...)) would give every missing row the same value
    mean = df["Age"].mean()
    std = df["Age"].std()
    is_null = df["Age"].isnull()
    random_ages = np.random.randint(int(mean - std), int(mean + std), size=is_null.sum())
    ages = df["Age"].copy()
    ages[is_null] = random_ages
    return ages
train_df["Age"] = clean_age(train_df)
test_df["Age"] = clean_age(test_df)
def clean_fare(df):
    # Fill missing fares the same way, clamping the lower bound at 0
    mean = df["Fare"].mean()
    std = df["Fare"].std()
    low = max(0, int(mean - std))  # mean - std is negative for Fare
    is_null = df["Fare"].isnull()
    random_fares = np.random.randint(low, int(mean + std), size=is_null.sum())
    fares = df["Fare"].copy()
    fares[is_null] = random_fares
    return fares
train_df["Fare"] = clean_fare(train_df)
test_df["Fare"] = clean_fare(test_df)
test_df.isnull().sum()
def create_fam_size(df):
    # Family size = siblings/spouses + parents/children + the passenger themselves
    return df["SibSp"] + df["Parch"] + 1
train_df["family_size"] = create_fam_size(train_df)
test_df["family_size"] = create_fam_size(test_df)
def create_categorical_fare(df):
    # Bin fares into quartiles labelled 0-3
    return pd.qcut(x=df["Fare"], q=4, labels=[0, 1, 2, 3]).astype(int)
train_df["categorical_fare"] = create_categorical_fare(train_df)
test_df["categorical_fare"] = create_categorical_fare(test_df)
def create_categorical_title(df):
    def find_title(name: str) -> str:
        # Look for a word followed by a period, e.g. "Mr." or "Mlle."
        search = re.search(r" ([A-Za-z]+)\.", name)
        if search:
            title = search.group(1)
            if title in ["Miss", "Mlle", "Ms"]:
                return "Miss"
            elif title in ["Mme", "Mrs"]:
                return "Mrs"
            elif title == "Mr":
                return "Mr"
            else:
                return "Rare"
        return ""

    titles = df["Name"].apply(find_title)
    dict_title = {"Miss": 1, "Mrs": 2, "Mr": 3, "Rare": 4}
    # map() leaves unmatched values (empty titles) as NaN, so fill those with 0
    return titles.map(dict_title).fillna(0).astype(int)
train_df["Title"] = create_categorical_title(train_df)
test_df["Title"] = create_categorical_title(test_df)
def create_categorical_sex(df):
    # 1 for male, 0 for female
    return np.where(df["Sex"] == "male", 1, 0)

def create_categorical_embarked(df):
    # Encode the port of embarkation as an integer
    return df["Embarked"].map({"S": 0, "C": 1, "Q": 2})
train_df["categorical_sex"] = create_categorical_sex(train_df)
train_df["Embarked"] = create_categorical_embarked(train_df)
test_df["categorical_sex"] = create_categorical_sex(test_df)
test_df["Embarked"] = create_categorical_embarked(test_df)
train_df.isnull().sum()
def remove_useless_features(df):
    # Columns that are either redundant after feature engineering or unusable as-is
    drop_list = ["PassengerId", "Cabin", "Ticket", "SibSp", "Name", "Sex"]
    return df.drop(drop_list, axis=1)
train_df = remove_useless_features(train_df)
test_df = remove_useless_features(test_df)
train_df.head()
colormap = plt.cm.Blues
plt.figure(figsize=(14,10))
sns.heatmap(train_df.corr(), cmap=colormap, annot=True, linewidths=0.2)
train_df['Survived'].value_counts()
sns.countplot(x='Survived', data=train_df)
plt.title("Titanic Survived")
plt.show()
explode = [0, 0.05]
train_df['Survived'].value_counts().plot.pie(autopct='%1.2f%%', explode=explode)
train_df['Pclass'].value_counts()
train_df.groupby(['Pclass', 'Survived'])['Survived'].count()
sns.catplot(x='Pclass', y='Survived', data=train_df, kind='point')
plt.title("Pclass vs Survived")
plt.show()
sns.catplot(x='categorical_sex', y='Fare', data=train_df, kind='boxen')
plt.title("Fare & Sex")
plt.show()
sns.catplot(x='categorical_sex', y='Age', data=train_df)
plt.title("Sex & age")
plt.show()
sns.catplot(x='categorical_sex', y='Age', data=train_df, kind='box', hue='Pclass')
plt.title("Sex, age & Pclass")
plt.show()
sns.catplot(x='Pclass', y='Age', data=train_df, kind='violin', hue='categorical_sex')
plt.title("Sex, age & Pclass")
plt.show()
sns.relplot(x='Age', y='Fare', data=train_df, row='categorical_sex', col='Pclass')
plt.show()
train_df.head()
# Algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
# K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format
train_df = train_df.reindex(np.random.permutation(train_df.index)) # shuffle the training set
x_train = train_df.drop("Survived", axis=1)
y_train = train_df["Survived"]
sgd = SGDClassifier(max_iter=5, tol=None)
sgd.fit(x_train, y_train)
y_pred = sgd.predict(test_df)
sgd.score(x_train, y_train)
acc_sgd = round(sgd.score(x_train, y_train) * 100, 2)
randomforest = RandomForestClassifier(n_estimators=100)
randomforest.fit(x_train, y_train)
y_pred = randomforest.predict(test_df)
randomforest.score(x_train, y_train)
acc_random_forest = round(randomforest.score(x_train, y_train) * 100, 2)
logreg = LogisticRegression(max_iter=10000)
logreg.fit(x_train, y_train)
y_pred = logreg.predict(test_df)
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(test_df)
acc_knn = round(knn.score(x_train, y_train) * 100, 2)
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)
y_pred = decision_tree.predict(test_df)
acc_decision_tree = round(decision_tree.score(x_train, y_train) * 100, 2)
gaussian = GaussianNB()
gaussian.fit(x_train, y_train)
y_pred = gaussian.predict(test_df)
acc_gaussian = round(gaussian.score(x_train, y_train) * 100, 2)
perceptron = Perceptron(max_iter=10000)
perceptron.fit(x_train, y_train)
y_pred = perceptron.predict(test_df)
acc_perceptron = round(perceptron.score(x_train, y_train) * 100, 2)
linear_svc = LinearSVC(max_iter=10000)
linear_svc.fit(x_train, y_train)
y_pred = linear_svc.predict(test_df)
acc_linear_svc = round(linear_svc.score(x_train, y_train) * 100, 2)
results = pd.DataFrame(
    {
        "Model": [
            "Support Vector Machines",
            "KNN",
            "Logistic Regression",
            "Random Forest",
            "Naive Bayes",
            "Perceptron",
            "Stochastic Gradient Descent",
            "Decision Tree",
        ],
        "Score": [
            acc_linear_svc,
            acc_knn,
            acc_log,
            acc_random_forest,
            acc_gaussian,
            acc_perceptron,
            acc_sgd,
            acc_decision_tree,
        ],
    }
)
result_df = results.sort_values(by="Score", ascending=False)
result_df = result_df.set_index("Score")
result_df.head(9)
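# These scores are computed on the training data, so they overstate real performance
# (the decision tree can nearly memorise the training set); cross-validation below
# gives a more honest estimate.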
scores = cross_val_score(decision_tree, x_train, y_train, cv=10, scoring="accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())
importances = pd.DataFrame(
    {
        "feature": x_train.columns,
        "importance": np.round(randomforest.feature_importances_, 3),
    }
)
importances = importances.sort_values("importance", ascending=False).set_index("feature")
importances.plot.bar()
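# Based on the importance plot, Parch contributes little, so it is dropped before refitting.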
train_df = train_df.drop("Parch", axis=1)
test_df = test_df.drop("Parch", axis=1)
x_train = train_df.drop("Survived", axis=1)
y_train = train_df["Survived"]
random_forest = RandomForestClassifier(n_estimators=100, oob_score=True)
random_forest.fit(x_train.values, y_train.values)
y_prediction = random_forest.predict(test_df.values)
random_forest.score(x_train.values, y_train.values)
acc_random_forest = round(random_forest.score(x_train.values, y_train.values) * 100, 2)
print(acc_random_forest, "%")
print("oob score:", round(random_forest.oob_score_, 4) * 100, "%")
random_forest = RandomForestClassifier(
    criterion="gini",
    min_samples_leaf=1,
    min_samples_split=12,
    n_estimators=1500,
    max_features="sqrt",  # "auto" was an alias for "sqrt" and was removed in scikit-learn 1.3
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
random_forest.fit(x_train.values, y_train.values)
y_prediction = random_forest.predict(test_df.values)
random_forest.score(x_train.values, y_train.values)
print("oob score:", round(random_forest.oob_score_, 4) * 100, "%")