import pandas as pd
# Load the training data from the CSV at this fixed path.
housing = pd.read_csv('/work/data/train.csv')
# Print each column's dtype and non-null count to spot columns with missing values.
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Id 1460 non-null int64
1 MSSubClass 1460 non-null int64
2 MSZoning 1460 non-null object
3 LotFrontage 1201 non-null float64
4 LotArea 1460 non-null int64
5 Street 1460 non-null object
6 Alley 91 non-null object
7 LotShape 1460 non-null object
8 LandContour 1460 non-null object
9 Utilities 1460 non-null object
10 LotConfig 1460 non-null object
11 LandSlope 1460 non-null object
12 Neighborhood 1460 non-null object
13 Condition1 1460 non-null object
14 Condition2 1460 non-null object
15 BldgType 1460 non-null object
16 HouseStyle 1460 non-null object
17 OverallQual 1460 non-null int64
18 OverallCond 1460 non-null int64
19 YearBuilt 1460 non-null int64
20 YearRemodAdd 1460 non-null int64
21 RoofStyle 1460 non-null object
22 RoofMatl 1460 non-null object
23 Exterior1st 1460 non-null object
24 Exterior2nd 1460 non-null object
25 MasVnrType 1452 non-null object
26 MasVnrArea 1452 non-null float64
27 ExterQual 1460 non-null object
28 ExterCond 1460 non-null object
29 Foundation 1460 non-null object
30 BsmtQual 1423 non-null object
31 BsmtCond 1423 non-null object
32 BsmtExposure 1422 non-null object
33 BsmtFinType1 1423 non-null object
34 BsmtFinSF1 1460 non-null int64
35 BsmtFinType2 1422 non-null object
36 BsmtFinSF2 1460 non-null int64
37 BsmtUnfSF 1460 non-null int64
38 TotalBsmtSF 1460 non-null int64
39 Heating 1460 non-null object
40 HeatingQC 1460 non-null object
41 CentralAir 1460 non-null object
42 Electrical 1459 non-null object
43 1stFlrSF 1460 non-null int64
44 2ndFlrSF 1460 non-null int64
45 LowQualFinSF 1460 non-null int64
46 GrLivArea 1460 non-null int64
47 BsmtFullBath 1460 non-null int64
48 BsmtHalfBath 1460 non-null int64
49 FullBath 1460 non-null int64
50 HalfBath 1460 non-null int64
51 BedroomAbvGr 1460 non-null int64
52 KitchenAbvGr 1460 non-null int64
53 KitchenQual 1460 non-null object
54 TotRmsAbvGrd 1460 non-null int64
55 Functional 1460 non-null object
56 Fireplaces 1460 non-null int64
57 FireplaceQu 770 non-null object
58 GarageType 1379 non-null object
59 GarageYrBlt 1379 non-null float64
60 GarageFinish 1379 non-null object
61 GarageCars 1460 non-null int64
62 GarageArea 1460 non-null int64
63 GarageQual 1379 non-null object
64 GarageCond 1379 non-null object
65 PavedDrive 1460 non-null object
66 WoodDeckSF 1460 non-null int64
67 OpenPorchSF 1460 non-null int64
68 EnclosedPorch 1460 non-null int64
69 3SsnPorch 1460 non-null int64
70 ScreenPorch 1460 non-null int64
71 PoolArea 1460 non-null int64
72 PoolQC 7 non-null object
73 Fence 281 non-null object
74 MiscFeature 54 non-null object
75 MiscVal 1460 non-null int64
76 MoSold 1460 non-null int64
77 YrSold 1460 non-null int64
78 SaleType 1460 non-null object
79 SaleCondition 1460 non-null object
80 SalePrice 1460 non-null int64
dtypes: float64(3), int64(35), object(43)
memory usage: 924.0+ KB
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
housing.describe()
Idfloat64
MSSubClassfloat64
count
1460
1460
mean
730.5
56.89726027
std
421.6100094
42.30057099
min
1
20
25%
365.75
20
50%
730.5
50
75%
1095.25
70
max
1460
190
# Preview the first five rows.
housing.head()
Idint64
MSSubClassint64
0
1
60
1
2
20
2
3
60
3
4
70
4
5
60
# Replace missing FireplaceQu values with the literal category 'NA' so those
# rows are not removed by the dropna() below.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the attribute: the in-place form operates on an
# intermediate Series, which pandas 2.x warns about and which leaves `housing`
# unchanged when Copy-on-Write is enabled.
housing['FireplaceQu'] = housing['FireplaceQu'].fillna('NA')
housing.FireplaceQu.value_counts()
# Drop the four columns that are almost entirely empty
# (91, 7, 281 and 54 non-null values out of 1460 rows).
housing = housing.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
# Drop every remaining row that still contains a missing value.
housing = housing.dropna(axis=0)
def select_cols_corr(df_corr, target_col, min_corr, max_corr):
    """Return the rows of one correlation column that fall inside an open range.

    Args:
        df_corr: Correlation matrix as a DataFrame (e.g. from ``DataFrame.corr``).
        target_col: Name of the column of ``df_corr`` to inspect.
        min_corr: Exclusive lower bound for the correlation value.
        max_corr: Exclusive upper bound for the correlation value.

    Returns:
        A two-column DataFrame: the former index of ``df_corr`` (named
        ``index`` by ``reset_index`` when the index is unnamed) and the
        values of ``target_col``, limited to rows with
        ``min_corr < value < max_corr``.
    """
    # Move the feature names out of the index so they are returned as a column.
    corr_table = df_corr[target_col].reset_index()
    corr_values = corr_table.iloc[:, 1]
    # Both bounds are strict; NaN correlations compare False and are excluded.
    in_range = (corr_values > min_corr) & (corr_values < max_corr)
    return corr_table.loc[in_range, :]
# List the features whose correlation with SalePrice lies strictly between
# 0.4 and 0.95 (the upper bound also filters out SalePrice itself).
# Only numeric columns are correlated: `housing` still contains object
# (string) columns, and DataFrame.corr() raises on them in pandas >= 2.0
# instead of silently skipping them as older versions did.
select_cols_corr(housing.select_dtypes(include='number').corr(), 'SalePrice', min_corr=.4, max_corr=.95)
indexobject
OverallQual8.3%
YearBuilt8.3%
10 others83.3%
SalePricefloat64
0.4730731635797986 - 0.7763293256883151
4
OverallQual
0.7763293257
6
YearBuilt
0.4730731636
7
YearRemodAdd
0.5197465658
8
MasVnrArea
0.4810886371
12
TotalBsmtSF
0.5749262195
13
1stFlrSF
0.5609026401
16
GrLivArea
0.6622890383
19
FullBath
0.5086736733
23
TotRmsAbvGrd
0.5472253411
25
GarageYrBlt
0.4880180727
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical columns into a dense DataFrame whose column
# names come from the fitted encoder.
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and later removed, so passing either one ties the code to a
# version range, while .toarray() works on all of them.
encoder = OneHotEncoder()
categorical_var = ['Electrical','CentralAir', 'HeatingQC', 'Heating','BsmtFinType2', 'BsmtFinType1', 'BsmtExposure','BsmtCond','BsmtQual','Foundation','ExterCond','ExterQual','MasVnrType', 'Exterior2nd', 'Exterior1st', 'RoofMatl','RoofStyle', 'HouseStyle','BldgType', 'Condition2', 'Condition1', 'Neighborhood', 'LandSlope', 'LotConfig', 'Utilities', 'LandContour', 'LotShape', 'Street','MSZoning','KitchenQual','Functional','FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual','GarageCond','PavedDrive', 'SaleType','SaleCondition']
train_X_encoded = pd.DataFrame(
    encoder.fit_transform(housing[categorical_var]).toarray(),
    columns=encoder.get_feature_names_out(),
)
train_X_encoded.head()
Electrical_FuseAfloat64
Electrical_FuseFfloat64
0
0
0
1
0
0
2
0
0
3
0
0
4
0
0
# Append the one-hot columns to the original features, aligning by row position.
# `housing` still carries its original row labels (with gaps left by the
# earlier dropna()), whereas `train_X_encoded` was built from a bare array and
# is numbered 0..n-1. concat(axis=1) aligns on labels, so without resetting
# both indexes the rows are paired with the wrong encodings and the unmatched
# labels are padded with NaN.
housing = pd.concat(
    [housing.reset_index(drop=True), train_X_encoded.reset_index(drop=True)],
    axis=1,
)
housing.head()
Idfloat64
MSSubClassfloat64
0
nan
nan
1
2
20
2
3
60
3
4
70
4
5
60
Sem as variáveis categóricas
# Baseline experiment: ten numeric predictors, no one-hot encoded columns.
baseline_cols = [
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'FullBath',
    'Fireplaces',
    'MasVnrArea',
    'TotalBsmtSF',
    '1stFlrSF',
    'GrLivArea',
    'TotRmsAbvGrd',
]
x_treino = housing[baseline_cols]
# Target variable for the regression.
y_treino = housing['SalePrice']
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Random forest with a fixed seed so repeated runs give the same result.
rs_model = RandomForestRegressor(random_state=1)
# 10-fold cross-validation on the numeric-only feature set.
scores_rs_model = cross_val_score(estimator=rs_model, X=x_treino, y=y_treino, cv=10)
# Report the mean score across the folds.
print(scores_rs_model.mean())
0.7856668365404486
Com as variáveis categóricas
# Second experiment: the same ten numeric predictors plus every one-hot column.
variaveis = ['OverallQual','YearBuilt','YearRemodAdd','FullBath','Fireplaces',
             'MasVnrArea','TotalBsmtSF','1stFlrSF','GrLivArea','TotRmsAbvGrd'] + list(train_X_encoded.columns)
variaveis
# Remove any row that contains a missing value before fitting.
housing.dropna(axis=0, inplace=True)
x_treino2 = housing[variaveis]
y_treino2 = housing['SalePrice']
# Sanity check: number of missing values per selected feature.
x_treino2.isnull().sum()
# Same estimator settings as the numeric-only run.
rs_model2 = RandomForestRegressor(random_state=1)
# Use 10 folds, matching the numeric-only run (this was cv=5): with different
# fold counts the two mean scores are computed over different train/test
# splits and are not directly comparable.
scores_rs_model2 = cross_val_score(rs_model2, x_treino2, y_treino2, cv=10)
print(np.mean(scores_rs_model2))
0.7509831598656218