AFAME TECHNOLOGIES: TITANIC SURVIVAL PREDICTION
Importing Libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pandas as pd
Dataset: TITANIC SURVIVAL PREDICTION
data=pd.read_csv("Titanic-Dataset.csv")
data.head(10)
Categorical Columns - [Survived, Pclass, Sex, SibSp, Parch, Embarked]
Numerical Columns - [Age, Fare] (PassengerId is only a row identifier)
Mixed Columns - [Name, Ticket, Cabin]
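As a quick sanity check of this grouping, each column's dtype, distinct-value count, and missing-value count can be inspected directly; the cell below is a minimal added sketch using only pandas.
# Added sketch: dtype, cardinality, and missing values per column
col_summary = pd.DataFrame({
    'dtype': data.dtypes.astype(str),
    'n_unique': data.nunique(),
    'n_missing': data.isna().sum()
})
print(col_summary)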
data.info()
data.shape
data['Survived'].value_counts()
Data Visualization
sns.countplot(data=data, x='Survived')
death_percent = round((data['Survived'].value_counts()[0] / len(data)) * 100)
print("Did not survive:", death_percent, "%")
print((data['Pclass'].value_counts()/len(data))*100)
sns.countplot(data=data, x='Pclass')
print((data['Sex'].value_counts()/len(data))*100)
sns.countplot(data=data, x='Sex')
print(data['SibSp'].value_counts())
sns.countplot(data = data, x='SibSp')
print((data['Parch'].value_counts()/len(data))*100)
sns.countplot(data=data, x='Parch')
print((data['Embarked'].value_counts()/len(data))*100)
sns.countplot(data=data, x='Embarked')
sns.histplot(data['Age'], kde=True)
print(data['Age'].skew())
print(data['Age'].kurt())
sns.boxplot(data=data, x='Age')
sns.countplot(data=data, x='Survived', hue='Pclass')
pd.crosstab(data['Pclass'], data['Survived']).apply(lambda r: round((r/r.sum())*100,1), axis=1)
sns.countplot(data=data, x='Survived', hue='Sex')
pd.crosstab(data['Sex'], data['Survived']).apply(lambda r: round((r/r.sum())*100,1), axis=1)
sns.countplot(data=data, x='Survived', hue='Embarked')
pd.crosstab(data['Embarked'], data['Survived']).apply(lambda r: round((r/r.sum())*100,1), axis=1)
plt.figure(figsize=(15,6))
sns.kdeplot(data[data['Survived']==0]['Age'], label='Did not survive')
sns.kdeplot(data[data['Survived']==1]['Age'], label='Survived')
plt.legend()
plt.figure(figsize=(15,6))
sns.kdeplot(data[data['Survived']==0]['Fare'], label='Did not survive')
sns.kdeplot(data[data['Survived']==1]['Fare'], label='Survived')
plt.legend()
from sklearn import preprocessing as pp
data['Fare'].head()
data_scaler=pp.MinMaxScaler(feature_range=(0,1))
fare_arr=data[['Fare']]
fare_arr
fare_scaled=data_scaler.fit_transform(fare_arr)
fare_scaled
data['fare_scaled']=fare_scaled
data.info()
col_to_drop=['Name','Cabin']
data=data.drop(col_to_drop,axis=1)
data.info()
data=data.dropna()
data.info()
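Note that dropna() removes every passenger with a missing Age or Embarked value. An alternative, sketched below as commented-out code and not applied in this notebook, is to impute those columns instead of dropping rows.
# Alternative sketch (not used here): impute missing values instead of dropping rows
# data['Age'] = data['Age'].fillna(data['Age'].median())
# data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])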
!pip install xgboost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
label_encoder = LabelEncoder()
data['Sex'] = label_encoder.fit_transform(data['Sex'])
print(data['Sex'].unique())
label_encoder = LabelEncoder()
data['Embarked'] = label_encoder.fit_transform(data['Embarked'])
print(data['Embarked'].unique())
data.head()
x=data.drop(['Survived','Ticket','Fare','PassengerId'],axis=1)
y=data['Survived']
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)
X_train.shape,X_test.shape
x.head()
ML Model Training and Evaluation
models = {
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'NB': GaussianNB(),
    'RF': RandomForestClassifier(),
    'DT': DecisionTreeClassifier(),
    'XGBoost': XGBClassifier()
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    sensitivity = tp / (tp + fn)  # recall
    specificity = tn / (tn + fp)
    f1 = (2*tp) / (2*tp + fp + fn)
    precision = tp / (tp + fp)
    results[name] = {
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'F1 Score': f1,
        'Precision': precision,
    }
# Print the results
for name, result in results.items():
    print(f"Model: {name}")
    print(f"Accuracy: {result['Accuracy']}")
    print(f"Precision: {result['Precision']}")
    print(f"Sensitivity: {result['Sensitivity']}")
    print(f"Specificity: {result['Specificity']}")
    print(f"F1 Score: {result['F1 Score']}")
    print()
!pip install lightgbm==4.3.0
!pip install catboost==1.2.5
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
models = {
    'ET': ExtraTreesClassifier(),
    'LightGBM': LGBMClassifier(),
    'Ridge': RidgeClassifier(),
    'LR': LogisticRegression(max_iter=1000),
    'GB': GradientBoostingClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)  # verbose=0 suppresses per-iteration output
}
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    sensitivity = tp / (tp + fn)  # recall
    specificity = tn / (tn + fp)
    f1 = (2*tp) / (2*tp + fp + fn)
    precision = tp / (tp + fp)
    results[name] = {
        'Accuracy': accuracy,
        'Sensitivity': sensitivity,
        'Specificity': specificity,
        'F1 Score': f1,
        'Precision': precision,
    }
for name, result in results.items():
    print(f"Model: {name}")
    print(f"Accuracy: {result['Accuracy']}")
    print(f"Precision: {result['Precision']}")
    print(f"Sensitivity: {result['Sensitivity']}")
    print(f"Specificity: {result['Specificity']}")
    print(f"F1 Score: {result['F1 Score']}")
    print()
Among the evaluated algorithms, GradientBoostingClassifier achieves the highest accuracy on this test split.
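A minimal sketch that tabulates the metrics collected in the results dict and sorts them by accuracy makes that comparison easier to read; this summary cell is an addition rather than part of the original run.
# Added sketch: summarise collected metrics, best accuracy first
summary_df = pd.DataFrame(results).T.sort_values('Accuracy', ascending=False)
print(summary_df)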
# Hypothetical example passenger, in the same column order as x:
# [Pclass, Sex, Age, SibSp, Parch, Embarked, fare_scaled]
new_passenger = pd.DataFrame([[3, 0, 43.0, 1, 0, 1, 0.103644]], columns=x.columns)
model = KNeighborsClassifier()
model.fit(X_train, y_train)
prediction = model.predict(new_passenger)
print("Survived Prediction:", prediction[0])