import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sc
# Load the Titanic train and test splits from the local ./Titanic directory.
df_train = pd.read_csv('./Titanic/train.csv')
df_test = pd.read_csv('./Titanic/test.csv')
# Render floats with a thousands separator and two decimals in pandas output.
pd.options.display.float_format = '{:,.2f}'.format
# First look at the training frame: its dimensions, then per-column dtypes
# and non-null counts.
print(df_train.shape)
df_train.info()
(891, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Preview the first five rows of the training data.
df_train.head(5)
PassengerIdint64
Survivedint64
0
1
0
1
2
1
2
3
1
3
4
1
4
5
0
# Summary statistics for every column (numeric and non-numeric); missing
# entries in the summary table are replaced with '-' for display.
df_train.describe(include='all').fillna('-')
PassengerIdobject
-27.3%
891.018.2%
5 others54.5%
Survivedobject
-27.3%
0.027.3%
4 others45.5%
count
891.0
891.0
unique
-
-
top
-
-
freq
-
-
mean
446.0
0.3838383838383838
std
257.3538420152301
0.4865924542648575
min
1.0
0.0
25%
223.5
0.0
50%
446.0
0.0
75%
668.5
1.0
# Survival counts overall, then split by sex.
df_train.groupby(['Survived'])['Survived'].count()
sns.countplot(x=df_train['Survived'])
df_train.groupby(['Survived', 'Sex'])['Sex'].count()
sns.countplot(data=df_train, x='Sex', hue='Survived')
# Aggregation spec for the 'Age' column, reused by later groupby summaries.
# The aggregations are named by string: passing the builtins min/max or
# np.mean/np.median to groupby aggregate is deprecated in pandas, and calling
# them directly would not skip the missing ages still present at this point.
# The resulting column labels (min, max, mean, median) are unchanged.
edad = {'Age': ['min', 'max', 'mean', 'median']}
df_train.groupby(['Survived', 'Sex']).aggregate(edad)
Age minfloat64
Age maxfloat64
(0, 'female')
2
57
(0, 'male')
1
74
(1, 'female')
0.75
63
(1, 'male')
0.42
80
# Age distribution per survival outcome, split by sex.
sns.boxplot(data=df_train, x='Survived', y='Age', hue='Sex')
# Passengers younger than 18; rows with a missing age never satisfy the
# comparison, so they are excluded.
niños = df_train[df_train['Age'] < 18.0]
# Share of minors among passengers whose age is known. Series.count() counts
# the non-null ages; len() of the notnull() mask would count every row.
num_niños = len(niños) / df_train['Age'].count() * 100
print(f'Porcentage of kids a board {num_niños} \nHow many kids are? {len(niños)}')
# Same age summary as before, restricted to minors.
niños.groupby(['Survived', 'Sex']).aggregate(edad)
Porcentage of kids a board 12.682379349046016
How many kids are? 113
Age minfloat64
Age maxfloat64
(0, 'female')
2
17
(0, 'male')
1
17
(1, 'female')
0.75
17
(1, 'male')
0.42
17
# Passenger count per ticket class.
df_train.groupby('Pclass')['Pclass'].count()
# Passenger count per (Survived, Pclass) pair.
ticket_class = df_train.groupby(['Survived', 'Pclass'])['Survived'].count()
print(ticket_class)
# Percentages are computed with transform('sum'), which returns the group
# totals aligned to the original (Survived, Pclass) index. The previous
# groupby(...).apply(lambda ...) form prepends the group key as an extra index
# level on recent pandas versions.
# Class distribution within each survival outcome (each outcome sums to 100).
print(ticket_class / ticket_class.groupby(level=0).transform('sum') * 100)
# Survival distribution within each class (each class sums to 100).
print(ticket_class / ticket_class.groupby(level=1).transform('sum') * 100)
Survived Pclass
0 1 80
2 97
3 372
1 1 136
2 87
3 119
Name: Survived, dtype: int64
Survived Pclass
0 1 14.57
2 17.67
3 67.76
1 1 39.77
2 25.44
3 34.80
Name: Survived, dtype: float64
Survived Pclass
0 1 37.04
2 52.72
3 75.76
1 1 62.96
2 47.28
3 24.24
Name: Survived, dtype: float64
# Survival counts coloured by ticket class.
sns.countplot(data=df_train, x='Survived', hue='Pclass')
# Mean age per (Survived, Pclass) pair.
df_train.groupby(['Survived', 'Pclass'])['Age'].mean()
# Average of the two per-outcome mean ages for each class (an unweighted mean
# of group means, not the overall class mean).
df_train.groupby(['Survived', 'Pclass'])['Age'].mean().groupby(level=1).apply(lambda x: np.mean(x))
# Age distribution per ticket class.
sns.boxplot(x=df_train['Pclass'], y=df_train['Age'])
# Survival counts by port of embarkation.
df_train.groupby(['Survived', 'Embarked'])['Survived'].count()
sns.countplot(data=df_train, x='Embarked', hue='Survived')
# Counts per (Survived, SibSp) row with one column per Parch value;
# combinations that never occur are shown as '-'.
df_train.groupby(['Survived', 'SibSp', 'Parch'])['Survived'].count().unstack().fillna('-')
0object
-41.7%
374.08.3%
6 others50%
1object
-33.3%
13.08.3%
7 others58.3%
(0, 0)
374.0
13.0
(0, 1)
59.0
23.0
(0, 2)
12.0
1.0
(0, 3)
-
7.0
(0, 4)
-
9.0
(0, 5)
-
-
(0, 8)
-
-
(1, 0)
163.0
25.0
(1, 1)
64.0
34.0
(1, 2)
4.0
6.0
# Parch against SibSp, split by survival outcome.
sns.barplot(data=df_train, x='SibSp', y='Parch', hue='Survived')
# 'Family' is 1 only when a passenger has BOTH Parch > 0 and SibSp > 0,
# otherwise 0.
df_test['Family'] = ((df_test['Parch'] > 0) & (df_test['SibSp'] > 0)).astype(int)
df_train['Family'] = ((df_train['Parch'] > 0) & (df_train['SibSp'] > 0)).astype(int)
df_train['Family'].value_counts()
sns.countplot(data=df_train, x='Family', hue='Survived')
# Three equal-width fare bins, to see how the fares spread out.
pd.cut(df_train['Fare'], bins=3).value_counts()
# Fixed bin edges shared by both frames so train and test get the same bands.
fare_edges = [-1, 171, 342, 513]
df_test['Cost'] = pd.cut(df_test['Fare'], bins=fare_edges)
df_train['Cost'] = pd.cut(df_train['Fare'], bins=fare_edges)
df_train['Cost'].sample(5)
sns.countplot(data=df_train, x='Cost', hue='Survived')
df_train.groupby(['Survived', 'Cost'])['Survived'].count()
# Missing values per column, then the distribution of the known ages and the
# mean age per ticket class.
df_train.isnull().sum()
sns.displot(df_train['Age'].dropna(), kde=True, color='darkblue')
df_train.groupby('Pclass')['Age'].mean()
# Fallback age for each ticket class, used to impute missing 'Age' values.
# NOTE(review): presumably rounded from the per-class mean ages computed just
# above — confirm against that output.
class_age = {
    1: 38,
    2: 29,
    3: 25,
}


def fill_age(columns):
    """Return a passenger's age, imputing it from the ticket class if missing.

    Args:
        columns: Row-like pair whose first item is the age and whose second
            item is the ticket class, e.g. one row of ``df[['Age', 'Pclass']]``
            as passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        The original age when it is present, otherwise ``class_age[pclass]``.

    Raises:
        KeyError: If the age is missing and the class is not in ``class_age``.
        ValueError: If both the age and the class are missing.
    """
    # Read the two values by position explicitly. Integer keys on a
    # label-indexed Series (columns[0]) rely on a deprecated positional
    # fallback in pandas.
    if isinstance(columns, pd.Series):
        age, pclass = columns.iloc[0], columns.iloc[1]
    else:
        age, pclass = columns[0], columns[1]
    if pd.isnull(age):
        # A mixed Age/Pclass row is upcast to float, so the class arrives as
        # e.g. 3.0 and is converted back to an int for the lookup.
        return class_age[int(pclass)]
    return age
# Impute missing ages row by row: fill_age receives each (Age, Pclass) row.
df_test['Age'] = df_test[['Age', 'Pclass']].apply(fill_age, axis=1)
df_train['Age'] = df_train[['Age', 'Pclass']].apply(fill_age, axis=1)
# Re-check the missing values per column after the imputation.
df_train.isnull().sum()
# Drop the 'Cabin' column from both frames. The test frame is rebound in
# place, while the training result is kept under a new name, df_train_clean.
df_test = df_test.drop(columns='Cabin')
df_train_clean = df_train.drop(columns='Cabin')
df_train_clean.sample(5)
PassengerIdint64
Survivedint64
548
549
0
682
683
0
827
828
1
25
26
1
387
388
1
# Column dtypes and non-null counts of the cleaned training frame.
df_train_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 891 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Embarked 889 non-null object
11 Family 891 non-null int64
12 Cost 891 non-null category
dtypes: category(1), float64(2), int64(6), object(4)
memory usage: 84.8+ KB
# One-hot encode the fare band ('Cost'); the dummy columns are stored, in
# order, under these three names in both frames.
cost_dummy_cols = ['T_cheap', 'T_medium', 'T_expensive']
df_test[cost_dummy_cols] = pd.get_dummies(df_test['Cost'])
df_train_clean[cost_dummy_cols] = pd.get_dummies(df_train_clean['Cost'])
# Cast the discrete columns to the pandas 'category' dtype. 'Survived' exists
# only in the training frame, so the test frame uses a shorter list.
train_categoricals = ['Survived', 'Pclass', 'Family', 'Cost', 'Sex']
test_categoricals = ['Pclass', 'Family', 'Cost', 'Sex']
df_train_clean[train_categoricals] = df_train_clean[train_categoricals].astype('category')
df_test[test_categoricals] = df_test[test_categoricals].astype('category')
df_train_clean.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null category
2 Pclass 891 non-null category
3 Name 891 non-null object
4 Sex 891 non-null category
5 Age 891 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Embarked 889 non-null object
11 Family 891 non-null category
12 Cost 891 non-null category
13 T_cheap 891 non-null uint8
14 T_medium 891 non-null uint8
15 T_expensive 891 non-null uint8
dtypes: category(5), float64(2), int64(3), object(3), uint8(3)
memory usage: 63.5+ KB
# Show the one-hot encoding of 'Sex' before storing it.
print(pd.get_dummies(df_train_clean['Sex']))
# Store the two dummy columns as 'Female' and 'Male' in both frames.
# NOTE(review): this assumes get_dummies returns the female column first, as
# in the printed frame — confirm if the category order ever changes.
df_test[['Female', 'Male']] = pd.get_dummies(df_test['Sex'])
df_train_clean[['Female', 'Male']] = pd.get_dummies(df_train_clean['Sex'])
df_train_clean[['Female', 'Male']]
female male
0 0 1
1 1 0
2 1 0
3 1 0
4 0 1
.. ... ...
886 0 1
887 1 0
888 1 0
889 0 1
890 0 1
[891 rows x 2 columns]
Femaleuint8
0 - 1
Maleuint8
0 - 1
0
0
1
1
1
0
2
1
0
3
1
0
4
0
1
5
0
1
6
0
1
7
0
1
8
1
0
9
1
0
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
# Columns fed to both classifiers.
feature_columns = [
    'Age', 'Pclass', 'Female', 'Male', 'Family',
    'T_cheap', 'T_medium', 'T_expensive',
]
x = df_train_clean[feature_columns]
y = df_train_clean['Survived']
# Logistic regression model (fit returns the fitted estimator itself).
logreg = LogisticRegression().fit(x, y)
# Decision tree model.
tree_one = DecisionTreeClassifier().fit(x, y)
# labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
# labels = np.asarray(labels).reshape(2,2)
# Logistic regression: confusion matrix and accuracy, both measured on the
# same data the model was trained on.
y_predict = logreg.predict(x)
cf_matrix = metrics.confusion_matrix(y, y_predict)
print(sns.heatmap(cf_matrix, annot=True, fmt='.2f', cmap='Blues', cbar=False))
print(metrics.accuracy_score(y, y_predict))
AxesSubplot(0.125,0.125;0.775x0.755)
0.8047138047138047
# Decision tree model: confusion matrix and accuracy, both measured on the
# same data the tree was trained on.
y_tree_predict = tree_one.predict(x)
cf_matrix_tree = metrics.confusion_matrix(y, y_tree_predict)
print(sns.heatmap(cf_matrix_tree, annot=True, fmt='.2f', cmap='Greens', cbar=False))
print(metrics.accuracy_score(y, y_tree_predict))
AxesSubplot(0.125,0.125;0.775x0.755)
0.8967452300785634
def conf_mat_acc(modelo, features=None, target=None):
    """Plot a fitted classifier's confusion matrix and print its accuracy.

    Args:
        modelo: Fitted scikit-learn classifier.
        features: Feature matrix to evaluate on. Defaults to the module-level
            ``x`` (the training features).
        target: True labels for ``features``. Defaults to the module-level
            ``y`` (the training labels).
    """
    features = x if features is None else features
    target = y if target is None else target
    # metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and
    # removed in 1.2; ConfusionMatrixDisplay.from_estimator replaces it.
    disp = metrics.ConfusionMatrixDisplay.from_estimator(
        modelo, features, target, cmap=plt.cm.Blues, values_format="d"
    )
    matrix = disp.confusion_matrix
    # Correct predictions lie on the diagonal, so the trace also covers
    # problems with more than two classes.
    accuracy = np.trace(matrix) / np.sum(matrix)
    print('accuracy: ', np.round(accuracy, 3))
    plt.show()


conf_mat_acc(logreg)
/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
accuracy: 0.805
# Decision tree model: predict survival for the test set and write the
# submission file. The feature list is taken from feature_columns so the test
# features cannot drift from the columns the models were trained on.
x_test = df_test[feature_columns]
x_test.to_csv('./Titanic/pruebaUnitaria.csv')
df_test['Survived'] = tree_one.predict(x_test)
data_send = df_test[['PassengerId', 'Survived']]
data_send.to_csv('answer.csv', index=False)
# "Unit test": reload the features just written to disk, predict again with
# the same tree and store only the predicted 'Survived' column.
df_prueba = pd.read_csv('./Titanic/pruebaUnitaria.csv')
x_test = df_prueba[feature_columns]
df_prueba['Survived'] = tree_one.predict(x_test)
data_send = df_prueba['Survived']
data_send.to_csv('./Titanic/pruebaUnitariaRespuesta.csv', index=False)
%pylab inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
Populating the interactive namespace from numpy and matplotlib
# Survivor visualization
def visualizacion(respuesta):
    """Show the success image when ``respuesta`` equals 1, else the fail image.

    Args:
        respuesta: Predicted survival value; 1 selects './Image/success.png',
            anything else selects './Image/fail.png'.
    """
    ruta = './Image/success.png' if respuesta == 1 else './Image/fail.png'
    plt.imshow(mpimg.imread(ruta))
    plt.title("¿Sobreviviste?", {'fontsize':24})
    # Hide the axes so only the picture and its title are shown.
    plt.axis(False)
    plt.show()


# Visualize the first prediction stored by the round-trip check.
resultado = pd.read_csv('./Titanic/pruebaUnitariaRespuesta.csv')
primera_respuesta = resultado['Survived'][0]
visualizacion(primera_respuesta)