import pandas as pd
import openpyxl
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from sklearn.model_selection import train_test_split
# Load the 'DataTitanic' sheet of the Excel workbook into a DataFrame.
df = pd.read_excel("/work/Prueba Técnica DS.xlsx", sheet_name='DataTitanic')
# Which columns does the dataset have?
df.columns
# What is the shape (rows, columns) of the dataset?
df.shape
# Are there null values? info() shows the non-null count per column.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
dtypes: float64(1), int64(3), object(2)
memory usage: 41.9+ KB
# Distribution summary of the numeric variables (count, mean, std, quartiles).
df.describe()
PassengerIdfloat64
Survivedfloat64
count
891.0
891.0
mean
446.0
0.3838383838383838
std
257.3538420152301
0.4865924542648575
min
1.0
0.0
25%
223.5
0.0
50%
446.0
0.0
75%
668.5
1.0
max
891.0
1.0
# Summary of the categorical (object-dtype) variables: count, unique, top, freq.
df.describe(include=['O'])
Nameobject
Sexobject
count
891
891
unique
891
2
top
Andersson, Master. Sigvard Harald Elias
male
freq
1
577
# Passenger counts per target class.
df.groupby(['Survived'])['PassengerId'].count()
# target vs sex: cross-tabulated counts.
df.groupby(['Survived', 'Sex'])['PassengerId'].count()
grouped_sex = df.groupby(['Survived', 'Sex'])['PassengerId'].count()
print(grouped_sex)
grouped_sex.unstack(level=0).plot.bar()
plt.show()
Survived Sex
0 female 81
male 468
1 female 233
male 109
Name: PassengerId, dtype: int64
Procesamiento de datos
Selección de variables
Survived
Sex
Age
Pclass
Tratamos los datos faltantes en Age (detectados con df['Age'].isna()). La variable "Sex" aparece como "object" y queremos "int" o "float" para el algoritmo.
# Cómo se distribuyen los nulos en edad
(df[df['Age'].isna()]
.groupby(['Sex', 'Pclass'])
.count()['PassengerId']
.unstack(level=0))
femaleint64
maleint64
1
9
21
2
2
9
3
42
94
# Impute missing ages with the median of Age. Using the computed median
# (28.0 for this dataset) instead of a hard-coded literal keeps the
# imputation correct if the underlying data ever changes.
age_median = df['Age'].median()
df['Age'] = df['Age'].fillna(age_median)
# Confirm the selected model columns no longer contain nulls.
df[['Survived', 'Sex', 'Age', 'Pclass']].info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Survived 891 non-null int64
1 Sex 891 non-null object
2 Age 891 non-null float64
3 Pclass 891 non-null int64
dtypes: float64(1), int64(2), object(1)
memory usage: 28.0+ KB
# Label-encode Sex so the model receives numeric input (female=1, male=0).
sex_codes = {'female': 1, 'male': 0}
df['Sex'] = df['Sex'].map(sex_codes).astype(int)
# Quick sanity check of the encoded columns.
df[['Survived', 'Sex', 'Age', 'Pclass']].head(3)
Survivedint64
Sexint64
0
0
0
1
1
1
2
1
1
Datos entrenar el modelo
# Keep only the target and the selected model features.
df = df.loc[:, ['Survived', 'Sex', 'Age', 'Pclass']]
df
Survivedint64
0 - 1
Sexint64
0 - 1
0
0
0
1
1
1
2
1
1
3
1
1
4
0
0
5
0
0
6
0
0
7
0
0
8
1
1
9
1
1
Se partirá el set de datos para obtener el conjunto de entrenamiento (75%) y el de prueba (25%). El tamaño se escogió por criterio propio.
# Feature matrix and target vector for model fitting.
features = df[['Sex', 'Age', 'Pclass']]
target = df['Survived']
# 75/25 train/test split; random_state fixed so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    features, target, test_size=0.25, random_state=41)
print(Y_train.shape, X_train.shape)
(668,) (668, 3)
Se probarán dos modelos: regresión logística y árboles de decisión.
# Train a logistic-regression model (fit returns the estimator itself).
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression().fit(X_train, Y_train)
# Train a decision-tree classifier.
from sklearn.tree import DecisionTreeClassifier

# Fix random_state so tie-breaks between equally good splits are
# deterministic and the reported accuracy can be reproduced.
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, Y_train)
Evaluación mediante matriz de confusión
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in
# 1.2 (see the FutureWarning below); ConfusionMatrixDisplay.from_estimator
# is the documented replacement.
from sklearn.metrics import ConfusionMatrixDisplay


def conf_mat_acc(modelo):
    """Plot the confusion matrix of `modelo` on the test set and print accuracy.

    Reads the module-level X_test / Y_test produced by the train/test split.
    """
    disp = ConfusionMatrixDisplay.from_estimator(
        modelo, X_test, Y_test, cmap=plt.cm.Blues, values_format="d")
    # accuracy = (TN + TP) / total predictions
    true_pred = disp.confusion_matrix[0, 0] + disp.confusion_matrix[1, 1]
    total_data = np.sum(disp.confusion_matrix)
    accuracy = true_pred / total_data
    print('accuracy: ', np.round(accuracy, 2))
    plt.show()


conf_mat_acc(logreg)
/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
accuracy: 0.8
# Evaluate the decision tree with the same confusion-matrix routine.
conf_mat_acc(decision_tree)
/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
accuracy: 0.84
Vemos que el árbol de decisión presenta una mejor precisión.
A continuación se aplicara el modelo a los datos del crucero IDICO.
# Load the 'DataCruceroIDICO' sheet (IDICO cruise passengers) from the workbook.
df_idico = pd.read_excel("/work/Prueba Técnica DS.xlsx", sheet_name='DataCruceroIDICO')
df_idico.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pasajero 2500 non-null int64
1 Estrato 2500 non-null int64
2 Edad 2500 non-null int64
3 Género 2500 non-null int64
4 Ciudad Origen 2500 non-null int64
5 Estudios 2500 non-null int64
6 Discapacidad 2500 non-null int64
dtypes: int64(7)
memory usage: 136.8 KB
df_idico.columns
# Select the model features. .copy() makes an independent DataFrame so the
# later column assignments do not raise SettingWithCopyWarning (the warning
# pasted below was caused by assigning into this slice).
df_idico_fit = df_idico[['Estrato', 'Edad', 'Género']].copy()
# Visual check that none of the selected columns have missing values.
msno.bar(df_idico_fit)
Para este caso se tienen 5 categorías en la variable estrato. Lo que se hará es definir que los 1 y 2 pertenecen a la clase 3; los 3 y 4, a la 2; y los 5, a la 1.
def clases(x):
    """Map a socioeconomic stratum (1-5) to a Titanic-style class (1-3).

    Strata 1-2 -> class 3, strata 3-4 -> class 2, stratum 5 -> class 1.
    (The source paste had lost this function's indentation; restored here.)
    """
    if x in (1, 2):
        return 3
    if x in (3, 4):
        return 2
    return 1
# Derive Pclass from Estrato using the stratum-to-class mapping above.
df_idico_fit['Pclass'] = df_idico_fit['Estrato'].apply(clases)
/tmp/ipykernel_596/3664849910.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
df_idico_fit['Pclass'] = df_idico_fit['Estrato'].apply(clases)
# Inspect the frame with the new Pclass column.
df_idico_fit
Estratoint64
1 - 5
Ageint64
1 - 95
0
3
31
1
3
44
2
4
50
3
4
18
4
3
69
5
3
4
6
4
75
7
1
52
8
3
84
9
2
3
# Rename the columns to match the training schema and order the features
# exactly as the model was trained on them.
renamed = df_idico_fit.rename(columns={'Edad': 'Age', 'Género': 'Sex'})
df_idico_fit = renamed[['Sex', 'Age', 'Pclass']]
df_idico_fit
Sexint64
0 - 1
Ageint64
1 - 95
0
0
31
1
0
44
2
0
50
3
1
18
4
0
69
5
1
4
6
1
75
7
1
52
8
0
84
9
1
3
# Score the IDICO passengers with the trained tree and attach the predictions.
predicted_survival = decision_tree.predict(df_idico_fit)
df_idico_fit['Survived'] = predicted_survival
df_idico_fit
Sexint64
0 - 1
Ageint64
1 - 95
0
0
31
1
0
44
2
0
50
3
1
18
4
0
69
5
1
4
6
1
75
7
1
52
8
0
84
9
1
3
# Distribution of predicted outcomes.
survival_counts = df_idico_fit['Survived'].value_counts()
# BUG FIX: a survival probability is survivors / total passengers
# (799 / 2500), not survivors / non-survivors (799 / 1701, which is the
# odds). Computing it from value_counts also removes the hard-coded counts.
# (Variable name kept as in the original script, typo included.)
Porbabilidad_sobrevivir = survival_counts.get(1, 0) / survival_counts.sum()
print(Porbabilidad_sobrevivir)
0.4697236919459142