# Importamos las librerias
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
## Load the credit history dataset
historial_credito = pd.read_csv('data/credit_record.csv')
## Load the credit application records dataset
registro_aplicacion = pd.read_csv('data/application_record.csv')
# Notebook-style preview: the bare expression displays the first rows of the application records
registro_aplicacion.head()
   ID (int64)  CODE_GENDER (object)
0  5008804     M
1  5008805     M
2  5008806     M
3  5008808     F
4  5008809     F
# Notebook-style preview: the bare expression displays the first rows of the credit history
historial_credito.head()
   ID (int64)  MONTHS_BALANCE (int64)
0  5001711     0
1  5001711     -1
2  5001711     -2
3  5001711     -3
4  5001712     0
## Dataset info: dimensions and per-column summary of the credit history
# shape is printed as a (rows, columns) tuple
print(historial_credito.shape)
# info() prints column names, non-null counts, dtypes and memory usage
historial_credito.info()
(1048575, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 1048575 non-null int64
1 MONTHS_BALANCE 1048575 non-null int64
2 STATUS 1048575 non-null object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB
# Dimensions and per-column summary of the application records;
# the non-null counts printed by info() reveal which columns have missing values
print(registro_aplicacion.shape)
registro_aplicacion.info()
(438557, 18)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 438557 non-null int64
1 CODE_GENDER 438557 non-null object
2 FLAG_OWN_CAR 438557 non-null object
3 FLAG_OWN_REALTY 438557 non-null object
4 CNT_CHILDREN 438557 non-null int64
5 AMT_INCOME_TOTAL 438557 non-null float64
6 NAME_INCOME_TYPE 438557 non-null object
7 NAME_EDUCATION_TYPE 438557 non-null object
8 NAME_FAMILY_STATUS 438557 non-null object
9 NAME_HOUSING_TYPE 438557 non-null object
10 DAYS_BIRTH 438557 non-null int64
11 DAYS_EMPLOYED 438557 non-null int64
12 FLAG_MOBIL 438557 non-null int64
13 FLAG_WORK_PHONE 438557 non-null int64
14 FLAG_PHONE 438557 non-null int64
15 FLAG_EMAIL 438557 non-null int64
16 OCCUPATION_TYPE 304354 non-null object
17 CNT_FAM_MEMBERS 438557 non-null float64
dtypes: float64(2), int64(8), object(8)
memory usage: 60.2+ MB
## Number of null values and their percentage for each column
# Count nulls once per column; the percentage is derived from these counts so the
# full-frame isnull() scan is not repeated.
conteo_nulos = registro_aplicacion.isnull().sum()
# Scale to percent BEFORE rounding: rounding the fraction first and multiplying
# afterwards can reintroduce floating-point noise (e.g. 0.07 * 100 == 7.000000000000001).
# Rounding to 2 decimals of a percentage keeps the same precision as 4 decimals of a fraction.
porcentaje_nulos = (conteo_nulos / len(registro_aplicacion) * 100).round(2)
# Both Series are indexed by column name; after reset_index() they share the 'index'
# key column and an unnamed value column (0), which the suffixes disambiguate.
pd.merge(
    conteo_nulos.reset_index(),
    porcentaje_nulos.reset_index(),
    how='inner',
    on='index',
    suffixes=("_Valores Nulos", "_Porcentaje de valores nulos"),
)
indexobject
ID5.6%
CODE_GENDER5.6%
16 others88.9%
0_Valores Nulosint64
0 - 134203