# Data Preparation Exercise - Testing Dataset
import pandas as pd
# Load the raw mortgage CSV from the current working directory into `df`;
# every later step in this script modifies this DataFrame in place.
df = pd.read_csv(filepath_or_buffer='mortgage-testing-data.csv')
# Bare expression: shows the dtypes pandas inferred for each column when run
# interactively (e.g. as a notebook cell); it has no effect in a plain script.
df.dtypes
import numpy as np
# Replace `conforming_loan_limit` with two 0/1 indicator columns.
# A row whose value is neither 'C' nor 'NC' (including a missing value) gets 0
# in both indicators.
# NOTE(review): 'C' / 'NC' presumably stand for conforming / non-conforming -
# confirm against the dataset's data dictionary.
loan_limit = df.pop('conforming_loan_limit')  # drops the column, keeps its values
for indicator_name, code in (
    ('conforming_loan_limit_is_c', 'C'),
    ('conforming_loan_limit_is_nc', 'NC'),
):
    # 1 where the original value equals `code`, 0 everywhere else.
    df[indicator_name] = np.where(loan_limit == code, 1, 0)
# Bare expression: displays the frame when run interactively.
df
# Replace `derived_sex` with two 0/1 indicator columns.
# Any value other than 'Male' or 'Female' (including a missing value) yields 0
# in both indicators.
sex_label = df.pop('derived_sex')  # drops the column, keeps its values
for indicator_name, label in (
    ('derived_sex_is_male', 'Male'),
    ('derived_sex_is_female', 'Female'),
):
    # 1 where the original value equals `label`, 0 everywhere else.
    df[indicator_name] = np.where(sex_label == label, 1, 0)
# Bare expression: displays the frame when run interactively.
df
# Collapse `action_taken` into a single 0/1 column: 1 when the code equals 1,
# 0 for every other value (including a missing value).
# NOTE(review): code 1 is presumably the "accepted" outcome - verify against
# the dataset's data dictionary.
action_code = df.pop('action_taken')  # drops the column, keeps its values
is_accepted = action_code == 1
df['application_is_accepted'] = np.where(is_accepted, 1, 0)
# Bare expression: displays the frame when run interactively.
df
# Lookup table: each debt-to-income bracket label is replaced by one
# representative number so the column can be stored as float64.
dict_to_replace_debt_income = {
    '<20%': 10,
    '20%-<30%': 25,
    '30%-<36%': 33,
    '50%-60%': 55,
    '>60%': 70,
}

raw_ratio = df['debt_to_income_ratio']
# Labels found in the table become their number; everything else becomes NaN.
bucket_values = raw_ratio.map(dict_to_replace_debt_income)
# Put back the original value wherever no bracket matched, then cast to float.
# Values outside the table must therefore already be float-parseable (missing
# values stay NaN); any other non-numeric label makes `astype` raise.
df['debt_to_income_ratio'] = bucket_values.fillna(raw_ratio).astype('float64')

# Bare expressions: display the frame and its dtypes when run interactively.
df
df.dtypes
# Write the prepared frame to the current working directory.
# `to_csv` is called with its defaults, so the row index is also written as
# an unnamed first column of the output file.
df.to_csv(path_or_buf='cleaned-mortgage-testing-data.csv')