import pandas as pd
# Load the raw dataset, then preview its first rows and its column summary
csv_path = 'practice-project-dataset-1.csv'
df = pd.read_csv( csv_path )
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Columns: 101 entries, Unnamed: 0 to tract_median_age_of_housing_units
dtypes: float64(31), int64(43), object(27)
memory usage: 11.7+ MB
# Keep only the columns used in this analysis. The explicit .copy() makes the
# result an independent DataFrame, so the column assignments later in this file
# modify it directly instead of triggering pandas' SettingWithCopyWarning.
analysis_columns = [
    'interest_rate',
    'property_value',
    'state_code',
    'tract_minority_population_percent',
    'derived_race',
    'derived_sex',
    'applicant_age',
]
df = df[analysis_columns].copy()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 interest_rate 10061 non-null object
1 property_value 12424 non-null object
2 state_code 14929 non-null object
3 tract_minority_population_percent 15120 non-null float64
4 derived_race 15120 non-null object
5 derived_sex 15120 non-null object
6 applicant_age 15120 non-null object
dtypes: float64(1), object(6)
memory usage: 827.0+ KB
import numpy as np
# Both columns hold the text 'Exempt' alongside numeric values; swap it for NaN
# so that each column can then be converted to float
for column in ( 'interest_rate', 'property_value' ):
    df[column] = df[column].replace( 'Exempt', np.nan )
    df[column] = df[column].astype( float )
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 interest_rate 9660 non-null float64
1 property_value 12024 non-null float64
2 state_code 14929 non-null object
3 tract_minority_population_percent 15120 non-null float64
4 derived_race 15120 non-null object
5 derived_sex 15120 non-null object
6 applicant_age 15120 non-null object
dtypes: float64(3), object(4)
memory usage: 827.0+ KB
# Tally how often each applicant_age value occurs
df['applicant_age'].value_counts()
# Store the three demographic text columns as pandas categoricals
for column in ( 'derived_race', 'derived_sex', 'applicant_age' ):
    df[column] = df[column].astype( 'category' )
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15120 entries, 0 to 15119
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 interest_rate 9660 non-null float64
1 property_value 12024 non-null float64
2 state_code 14929 non-null object
3 tract_minority_population_percent 15120 non-null float64
4 derived_race 15120 non-null category
5 derived_sex 15120 non-null category
6 applicant_age 15120 non-null category
dtypes: category(3), float64(3), object(1)
memory usage: 517.9+ KB
# Build the two comparison groups: first restrict "df" to rows whose property
# value is below 500000, then split those rows by the tract's minority
# population percentage (above 75 versus below 25)
under_500k = df['property_value'] < 500000
lower_prices = df[under_500k]
minority_pct = lower_prices['tract_minority_population_percent']
high_minority = lower_prices[minority_pct > 75]
low_minority = lower_prices[minority_pct < 25]
# high_minority and low_minority are the filtered DataFrames used for graphing,
# to compare property values in high- and low-minority areas
# Plot the property values of both derived DataFrames as side-by-side histograms
import matplotlib.pyplot as plt
plt.hist( [ high_minority['property_value'], low_minority['property_value'] ],
          bins=20, density=True )
plt.legend( [ 'High % minority', 'Low % minority' ] )
plt.title( 'Sample of 2018 Home Mortgage Applications' )
plt.xlabel( 'Property Value' )
# density=True rescales each histogram so that its area is 1; the bar heights
# are therefore probability densities, not proportions of applications
plt.ylabel( 'Density' )
plt.show()
# The histogram groups property values into 20 bins and normalizes each group,
# so areas with high and low percentages of minority residents can be compared
# even though the two groups may contain different numbers of applications
# Compute the mean property value of each group
high_minority_mean = high_minority['property_value'].mean()
low_minority_mean = low_minority['property_value'].mean()
high_minority_mean, low_minority_mean
# The two means give a rough first comparison of property values between the
# high- and low-minority groups, before any formal statistical test
# Test the null hypothesis at the 5% significance level
from scipy import stats
alpha = 0.05
high_values = high_minority['property_value']
low_values = low_minority['property_value']
# equal_var=False selects Welch's t-test, which does not assume that the two
# groups share the same variance
statistic, pvalue = stats.ttest_ind( high_values, low_values, equal_var=False )
pvalue < alpha # reject H_0?
# The property values of high_minority and low_minority are compared with a
# two-sample t-test of H_0: the mean property value is the same regardless of
# minority population percentage