import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sc
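# Load the Kaggle Titanic train and test splits and set a compact float display format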
df_train = pd.read_csv('./Titanic/train.csv')
df_test = pd.read_csv('./Titanic/test.csv')
pd.options.display.float_format = '{:,.2f}'.format
print(df_train.shape)
df_train.info()
df_train.head(5)
df_train.describe(include='all').fillna('-')
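# Exploratory analysis: survival counts, overall and broken down by sex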
df_train.groupby(['Survived'])['Survived'].count()
sns.countplot(x=df_train['Survived'])
df_train.groupby(['Survived', 'Sex'])['Sex'].count()
sns.countplot(data=df_train, x='Sex', hue='Survived')
edad = {'Age': ['min', 'max', 'mean', 'median']}
df_train.groupby(['Survived', 'Sex']).aggregate(edad)
sns.boxplot(data=df_train, x='Survived', y='Age', hue='Sex')
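# Passengers under 18: share of the sample and survival by sex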
niños = df_train[df_train['Age'] < 18.0]
num_niños = len(niños) / df_train['Age'].notnull().sum() * 100
print(f'Percentage of kids aboard: {num_niños:.2f}%\nHow many kids are there? {len(niños)}')
niños.groupby(['Survived', 'Sex']).aggregate(edad)
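# Survival by ticket class (Pclass), as counts and percentages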
df_train.groupby('Pclass')['Pclass'].count()
ticket_class = df_train.groupby(['Survived', 'Pclass'])['Survived'].count()
print(ticket_class)
# Class distribution within each survival outcome (level 0 = Survived)
print(ticket_class.groupby(level=0).apply(lambda x: x / x.sum() * 100))
# Survival rate within each ticket class (level 1 = Pclass)
print(ticket_class.groupby(level=1).apply(lambda x: x / x.sum() * 100))
sns.countplot(data=df_train, x='Survived', hue='Pclass')
df_train.groupby(['Survived', 'Pclass'])['Age'].mean()
df_train.groupby(['Survived', 'Pclass'])['Age'].mean().groupby(level=1).mean()
sns.boxplot(x=df_train['Pclass'], y=df_train['Age'])
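# Survival by port of embarkation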
df_train.groupby(['Survived', 'Embarked'])['Survived'].count()
sns.countplot(data=df_train, x='Embarked', hue='Survived')
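# Family features: siblings/spouses (SibSp) and parents/children (Parch) aboard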
df_train.groupby(['Survived', 'SibSp', 'Parch'])['Survived'].count().unstack().fillna('-')
sns.barplot(data=df_train, x='SibSp', y='Parch', hue='Survived')
df_test['Family'] = np.where(((df_test['Parch'] > 0) & (df_test['SibSp'] > 0)), 1, 0)
df_train['Family'] = np.where(((df_train['Parch'] > 0) & (df_train['SibSp'] > 0)), 1, 0)
df_train['Family'].value_counts()
sns.countplot(data=df_train, x='Family', hue='Survived')
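# Bin fares into three cost bands, using the same bin edges for train and test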
pd.cut(df_train['Fare'], bins=3).value_counts()
df_test['Cost'] = pd.cut(df_test['Fare'], bins=[-1, 171, 342, 513])
df_train['Cost'] = pd.cut(df_train['Fare'], bins=[-1, 171, 342, 513])
df_train['Cost'].sample(5)
sns.countplot(data=df_train, x='Cost', hue='Survived')
df_train.groupby(['Survived', 'Cost'])['Survived'].count()
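# Missing values: inspect NaNs, then impute Age with the rounded mean age of each passenger class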
df_train.isnull().sum()
sns.displot(df_train['Age'].dropna(), kde=True, color='darkblue')
df_train.groupby('Pclass')['Age'].mean()
class_age = {
    1: 38,
    2: 29,
    3: 25
}

def fill_age(row):
    # Replace a missing Age with the mean age of the passenger's class
    age = row['Age']
    pclass = int(row['Pclass'])
    return class_age[pclass] if pd.isnull(age) else age
df_test['Age'] = df_test[['Age', 'Pclass']].apply(fill_age, axis=1)
df_train['Age'] = df_train[['Age', 'Pclass']].apply(fill_age, axis=1)
df_train.isnull().sum()
df_test = df_test.drop(columns='Cabin')
df_train_clean = df_train.drop(columns='Cabin')
df_train_clean.sample(5)
df_train_clean.info()
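# One-hot encode the fare band (Cost) and Sex columns, and cast the categorical features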
df_test[['T_cheap', 'T_medium', 'T_expensive']] = pd.get_dummies(df_test['Cost'])
df_train_clean[['T_cheap', 'T_medium', 'T_expensive']] = pd.get_dummies(df_train_clean['Cost'])
df_train_clean[['Survived', 'Pclass', 'Family', 'Cost', 'Sex']] = df_train_clean[['Survived', 'Pclass', 'Family', 'Cost', 'Sex']].astype('category')
df_test[['Pclass', 'Family', 'Cost', 'Sex']] = df_test[['Pclass', 'Family', 'Cost', 'Sex']].astype('category')
df_train_clean.info()
print(pd.get_dummies(df_train_clean['Sex']))
df_test[['Female', 'Male']] = pd.get_dummies(df_test['Sex'])
df_train_clean[['Female', 'Male']] = pd.get_dummies(df_train_clean['Sex'])
df_train_clean[['Female', 'Male']]
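# Model training: fit a logistic regression and a decision tree on the engineered features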
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
feature_columns = ['Age', 'Pclass', 'Female', 'Male', 'Family', 'T_cheap', 'T_medium', 'T_expensive']
x = df_train_clean[feature_columns]
y = df_train_clean['Survived']
# Logistic regression model
logreg = LogisticRegression()
logreg.fit(x, y)
# Decision tree model
tree_one = DecisionTreeClassifier()
tree_one.fit(x, y)
# labels = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
# labels = np.asarray(labels).reshape(2,2)
# Logistic regression: confusion matrix and training accuracy
y_predict = logreg.predict(x)
cf_matrix = metrics.confusion_matrix(y, y_predict)
sns.heatmap(cf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
plt.show()
print(metrics.accuracy_score(y, y_predict))
# Decision tree: confusion matrix and training accuracy
y_tree_predict = tree_one.predict(x)
cf_matrix_tree = metrics.confusion_matrix(y, y_tree_predict)
sns.heatmap(cf_matrix_tree, annot=True, fmt='d', cmap='Greens', cbar=False)
plt.show()
print(metrics.accuracy_score(y, y_tree_predict))
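# Helper: plot a fitted model's confusion matrix and print its training accuracy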
def conf_mat_acc(modelo):
    # metrics.plot_confusion_matrix was removed in scikit-learn 1.2; use ConfusionMatrixDisplay instead
    disp = metrics.ConfusionMatrixDisplay.from_estimator(modelo, x, y, cmap=plt.cm.Blues, values_format='d')
    true_pred = disp.confusion_matrix[0, 0] + disp.confusion_matrix[1, 1]
    total_data = np.sum(disp.confusion_matrix)
    accuracy = true_pred / total_data
    print('accuracy: ', np.round(accuracy, 3))
    plt.show()
conf_mat_acc(logreg)
# Predict on the Kaggle test set with the decision tree model and write the submission file
x_test = df_test[['Age', 'Pclass', 'Female', 'Male', 'Family', 'T_cheap', 'T_medium', 'T_expensive']]
x_test.to_csv('./Titanic/pruebaUnitaria.csv')
df_test['Survived'] = tree_one.predict(x_test)
data_send = df_test[['PassengerId', 'Survived']]
data_send.to_csv('answer.csv', index=False)
# Unit test: re-run predictions on the saved feature file
df_prueba = pd.read_csv('./Titanic/pruebaUnitaria.csv')
x_test = df_prueba[['Age', 'Pclass', 'Female', 'Male', 'Family', 'T_cheap', 'T_medium', 'T_expensive']]
df_prueba['Survived'] = tree_one.predict(x_test)
data_send = df_prueba['Survived']
data_send.to_csv('./Titanic/pruebaUnitariaRespuesta.csv', index=False)
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
# Survivor visualization
def visualizacion(respuesta):
    # Show a success or failure image depending on the predicted outcome
    if respuesta == 1:
        img = mpimg.imread('./Image/success.png')
    else:
        img = mpimg.imread('./Image/fail.png')
    plt.imshow(img)
    plt.title("¿Sobreviviste?", {'fontsize': 24})
    plt.axis(False)
    plt.show()
resultado = pd.read_csv('./Titanic/pruebaUnitariaRespuesta.csv')
visualizacion(resultado['Survived'][0])