import pandas as pd
# Load the practice dataset, preview its leading rows, then print a summary of
# every column (name, non-null count, dtype).
df = pd.read_csv(filepath_or_buffer='practice-project-dataset-1.csv')
df.head(5)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 99 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 activity_year 15120 non-null int64
1 lei 15120 non-null object
2 derived_msa_md 15120 non-null int64
3 state_code 14929 non-null object
4 county_code 14775 non-null float64
5 census_tract 14753 non-null float64
6 conforming_loan_limit 15055 non-null object
7 derived_loan_product_type 15120 non-null object
8 derived_dwelling_category 15120 non-null object
9 derived_ethnicity 15120 non-null object
10 derived_race 15120 non-null object
11 derived_sex 15120 non-null object
12 action_taken 15120 non-null int64
13 purchaser_type 15120 non-null int64
14 preapproval 15120 non-null int64
15 loan_type 15120 non-null int64
16 loan_purpose 15120 non-null int64
17 lien_status 15120 non-null int64
18 reverse_mortgage 15120 non-null int64
19 open_end_line_of_credit 15120 non-null int64
20 business_or_commercial_purpose 15120 non-null int64
21 loan_amount 15120 non-null int64
22 loan_to_value_ratio 10203 non-null object
23 interest_rate 10061 non-null object
24 rate_spread 7549 non-null object
25 hoepa_status 15120 non-null int64
26 total_loan_costs 7482 non-null object
27 total_points_and_fees 489 non-null object
28 origination_charges 7576 non-null object
29 discount_points 2817 non-null object
30 lender_credits 3098 non-null object
31 loan_term 14906 non-null object
32 prepayment_penalty_term 1196 non-null object
33 intro_rate_period 3137 non-null object
34 negative_amortization 15120 non-null int64
35 interest_only_payment 15120 non-null int64
36 balloon_payment 15120 non-null int64
37 other_nonamortizing_features 15120 non-null int64
38 property_value 12424 non-null object
39 construction_method 15120 non-null int64
40 occupancy_type 15120 non-null int64
41 manufactured_home_secured_property_type 15120 non-null int64
42 manufactured_home_land_property_interest 15120 non-null int64
43 total_units 15120 non-null object
44 multifamily_affordable_units 425 non-null object
45 income 13446 non-null float64
46 debt_to_income_ratio 10304 non-null object
47 applicant_credit_score_type 15120 non-null int64
48 co_applicant_credit_score_type 15120 non-null int64
49 applicant_ethnicity_1 15116 non-null float64
50 applicant_ethnicity_2 609 non-null float64
51 applicant_ethnicity_3 5 non-null float64
52 applicant_ethnicity_4 1 non-null float64
53 applicant_ethnicity_5 0 non-null float64
54 co_applicant_ethnicity_1 15120 non-null int64
55 co_applicant_ethnicity_2 208 non-null float64
56 co_applicant_ethnicity_3 4 non-null float64
57 co_applicant_ethnicity_4 0 non-null float64
58 co_applicant_ethnicity_5 0 non-null float64
59 applicant_ethnicity_observed 15120 non-null int64
60 co_applicant_ethnicity_observed 15120 non-null int64
61 applicant_race_1 15116 non-null float64
62 applicant_race_2 497 non-null float64
63 applicant_race_3 42 non-null float64
64 applicant_race_4 9 non-null float64
65 applicant_race_5 6 non-null float64
66 co_applicant_race_1 15120 non-null int64
67 co_applicant_race_2 239 non-null float64
68 co_applicant_race_3 14 non-null float64
69 co_applicant_race_4 4 non-null float64
70 co_applicant_race_5 3 non-null float64
71 applicant_race_observed 15120 non-null int64
72 co_applicant_race_observed 15120 non-null int64
73 applicant_sex 15120 non-null int64
74 co_applicant_sex 15120 non-null int64
75 applicant_sex_observed 15120 non-null int64
76 co_applicant_sex_observed 15120 non-null int64
77 applicant_age 15120 non-null object
78 co_applicant_age 15120 non-null object
79 applicant_age_above_62 13631 non-null object
80 co_applicant_age_above_62 6082 non-null object
81 submission_of_application 15120 non-null int64
82 initially_payable_to_institution 15120 non-null int64
83 aus_1 15120 non-null int64
84 aus_2 653 non-null float64
85 aus_3 258 non-null float64
86 aus_4 131 non-null float64
87 aus_5 108 non-null float64
88 denial_reason_1 15120 non-null int64
89 denial_reason_2 598 non-null float64
90 denial_reason_3 95 non-null float64
91 denial_reason_4 5 non-null float64
92 tract_population 15120 non-null int64
93 tract_minority_population_percent 15120 non-null float64
94 ffiec_msa_md_median_family_income 15120 non-null int64
95 tract_to_msa_income_percentage 15120 non-null int64
96 tract_owner_occupied_units 15120 non-null int64
97 tract_one_to_four_family_homes 15120 non-null int64
98 tract_median_age_of_housing_units 15120 non-null int64
dtypes: float64(29), int64(43), object(27)
memory usage: 11.4+ MB
# Narrow the frame to the seven columns this analysis uses.  The explicit
# .copy() makes the result an independent DataFrame, so the column
# assignments performed later in this script modify it directly instead of
# triggering pandas' SettingWithCopyWarning on a slice of the original frame.
df = df[[
    'interest_rate',
    'property_value',
    'state_code',
    'tract_minority_population_percent',
    'derived_race',
    'derived_sex',
    'applicant_age',
]].copy()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 interest_rate 10061 non-null object
1 property_value 12424 non-null object
2 state_code 14929 non-null object
3 tract_minority_population_percent 15120 non-null float64
4 derived_race 15120 non-null object
5 derived_sex 15120 non-null object
6 applicant_age 15120 non-null object
dtypes: float64(1), object(6)
memory usage: 827.0+ KB
import numpy as np
# These two columns were loaded with object dtype.  Replace the text 'Exempt'
# with NaN so the remaining values can be cast to float, then re-check dtypes.
for numeric_column in ('interest_rate', 'property_value'):
    without_exempt = df[numeric_column].replace('Exempt', np.nan)
    df[numeric_column] = without_exempt.astype(float)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 interest_rate 9660 non-null float64
1 property_value 12024 non-null float64
2 state_code 14929 non-null object
3 tract_minority_population_percent 15120 non-null float64
4 derived_race 15120 non-null object
5 derived_sex 15120 non-null object
6 applicant_age 15120 non-null object
dtypes: float64(3), object(4)
memory usage: 827.0+ KB
# Look at how often each applicant_age value occurs.
df['applicant_age'].value_counts()
# Store the three label columns as pandas categoricals, then re-check dtypes.
for categorical_column in ('derived_race', 'derived_sex', 'applicant_age'):
    df[categorical_column] = df[categorical_column].astype('category')
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 interest_rate 9660 non-null float64
1 property_value 12024 non-null float64
2 state_code 14929 non-null object
3 tract_minority_population_percent 15120 non-null float64
4 derived_race 15120 non-null category
5 derived_sex 15120 non-null category
6 applicant_age 15120 non-null category
dtypes: category(3), float64(3), object(1)
memory usage: 517.8+ KB
# Keep only rows whose property value is below 500,000 (rows with a missing
# value compare as False and are dropped), then split that subset by the
# tract's minority population percentage: above 75 versus below 25.
lower_prices = df.loc[df['property_value'].lt(500000)]
minority_percent = lower_prices['tract_minority_population_percent']
high_minority = lower_prices.loc[minority_percent.gt(75)]
low_minority = lower_prices.loc[minority_percent.lt(25)]
import matplotlib.pyplot as plt
# Overlay normalized histograms of property value for the two groups so their
# distributions can be compared even though the group sizes differ.
plt.hist(
    [high_minority['property_value'], low_minority['property_value']],
    bins=20,
    density=True,
)
plt.legend(['High % minority', 'Low % minority'])
plt.title('Sample of 2018 Home Mortgage Applications')
plt.xlabel('Property Value')
# density=True scales each series so that the area under its bars is 1
# (count divided by total count and bin width), so the y-axis shows a
# probability density rather than a proportion of applications.
plt.ylabel('Density')
plt.show()
# Mean property value of each group, as a numeric companion to the plot.
high_minority['property_value'].mean(), low_minority['property_value'].mean()
from scipy import stats
# Two-sample t-test on the mean property values of the two groups.
# equal_var=False selects Welch's variant, which does not assume the groups
# share a common variance.  alpha is the significance level for the decision.
alpha = 0.05
high_values = high_minority['property_value']
low_values = low_minority['property_value']
statistic, pvalue = stats.ttest_ind(high_values, low_values, equal_var=False)
pvalue < alpha  # reject H_0?