import pandas as pd

# Read the raw dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv('practice-project-dataset-1.csv')

# Quick inspection of the freshly loaded frame.  info() prints its summary
# itself, whereas the head() result is only echoed in an interactive session.
df.head()
df.info()

# Keep only the seven columns of interest and drop everything else.
selected_columns = [
    'interest_rate',
    'property_value',
    'state_code',
    'tract_minority_population_percent',
    'derived_race',
    'derived_sex',
    'applicant_age',
]
df = df[selected_columns]
df.info()
import numpy as np

# Turn any 'Exempt' placeholder into NaN so the column can be cast to float.
for numeric_column in ('interest_rate', 'property_value'):
    without_exempt = df[numeric_column].replace('Exempt', np.nan)
    df[numeric_column] = without_exempt.astype(float)
df.info()

# Look at the distinct age values before they are converted to categories
# (the result is only echoed in an interactive session).
df['applicant_age'].value_counts()

# Store the label-like columns as pandas categoricals.
for label_column in ('derived_race', 'derived_sex', 'applicant_age'):
    df[label_column] = df[label_column].astype('category')
df.info()
import matplotlib.pyplot as plt

# Thresholds used to build the two comparison groups, named so they are easy
# to find and adjust.
MAX_PROPERTY_VALUE = 500000
HIGH_MINORITY_PERCENT = 75
LOW_MINORITY_PERCENT = 25

# Keep rows whose property value is below the cap.  Rows with a missing (NaN)
# property value fail the comparison and are therefore dropped as well.
lower_prices = df[df['property_value'] < MAX_PROPERTY_VALUE]

# Split into the two groups; rows between the two thresholds belong to
# neither group.
high_minority = lower_prices[
    lower_prices['tract_minority_population_percent'] > HIGH_MINORITY_PERCENT
]
low_minority = lower_prices[
    lower_prices['tract_minority_population_percent'] < LOW_MINORITY_PERCENT
]

# Draw both distributions on the same axes with a shared set of 20 bins.
# NOTE(review): density=True scales each histogram so its area is 1, so the
# bar heights are probability densities rather than per-bin proportions --
# confirm that the 'Proportion' axis label is what is intended.
plt.hist(
    [high_minority['property_value'], low_minority['property_value']],
    bins=20,
    density=True,
)
plt.legend(['High % minority', 'Low % minority'])
plt.title('Sample of 2018 Home Mortgage Applications')
plt.xlabel('Property Value')
plt.ylabel('Proportion')
plt.show()

# Report the group means explicitly: a bare expression is only echoed in an
# interactive session and is silently discarded when run as a script.
high_minority_mean = high_minority['property_value'].mean()
low_minority_mean = low_minority['property_value'].mean()
print('Mean property value (high % minority):', high_minority_mean)
print('Mean property value (low % minority):', low_minority_mean)
from scipy import stats

# Significance level against which the p-value is compared.
alpha = 0.05

# Two-sample t-test on the property values of the two groups.
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups have equal variance.
statistic, pvalue = stats.ttest_ind(
    high_minority['property_value'],
    low_minority['property_value'],
    equal_var=False,
)

# Keep the decision in a name and print it: a bare `pvalue < alpha` expression
# is only echoed in an interactive session and is lost when run as a script.
reject_null = pvalue < alpha  # reject H_0?
print('t statistic:', statistic)
print('p-value:', pvalue)
print('Reject H_0 at alpha =', alpha, ':', reject_null)