Use a Random Forest to Predict House Prices
import pandas as pd
# Load the training set from disk.
train_file_path = '/work/train.csv'
train_file = pd.read_csv(train_file_path)
# Collect the names of the columns that contain no null values at all:
# notnull().all() yields one boolean per column, which is then used as a
# mask over the column index.
col_complete_value = train_file.columns[train_file.notnull().all()].tolist()
print(col_complete_value)
print(len(col_complete_value))
['Id', 'MSSubClass', 'MSZoning', 'LotArea', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'GarageCars', 'GarageArea', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice']
62
# Limit the training data to the columns that have complete values.
train_data = train_file[col_complete_value]
# Target: the value the model is trained to predict.
y = train_data['SalePrice']
# Features: every complete column except 'Id' and the target 'SalePrice'.
# The list is derived from col_complete_value instead of being typed out by
# hand, so it keeps the same column order and cannot drift out of sync with
# the set of null-free columns (a hand-typed name that later gains a null
# would raise a KeyError here).
x = train_data[[col for col in col_complete_value
                if col not in ('Id', 'SalePrice')]]
# Print a per-column summary (non-null counts and dtypes) of the features.
x.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 60 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MSSubClass 1460 non-null int64
1 MSZoning 1460 non-null object
2 LotArea 1460 non-null int64
3 Street 1460 non-null object
4 LotShape 1460 non-null object
5 LandContour 1460 non-null object
6 Utilities 1460 non-null object
7 LotConfig 1460 non-null object
8 LandSlope 1460 non-null object
9 Neighborhood 1460 non-null object
10 Condition1 1460 non-null object
11 Condition2 1460 non-null object
12 BldgType 1460 non-null object
13 HouseStyle 1460 non-null object
14 OverallQual 1460 non-null int64
15 OverallCond 1460 non-null int64
16 YearBuilt 1460 non-null int64
17 YearRemodAdd 1460 non-null int64
18 RoofStyle 1460 non-null object
19 RoofMatl 1460 non-null object
20 Exterior1st 1460 non-null object
21 Exterior2nd 1460 non-null object
22 ExterQual 1460 non-null object
23 ExterCond 1460 non-null object
24 Foundation 1460 non-null object
25 BsmtFinSF1 1460 non-null int64
26 BsmtFinSF2 1460 non-null int64
27 BsmtUnfSF 1460 non-null int64
28 TotalBsmtSF 1460 non-null int64
29 Heating 1460 non-null object
30 HeatingQC 1460 non-null object
31 CentralAir 1460 non-null object
32 1stFlrSF 1460 non-null int64
33 2ndFlrSF 1460 non-null int64
34 LowQualFinSF 1460 non-null int64
35 GrLivArea 1460 non-null int64
36 BsmtFullBath 1460 non-null int64
37 BsmtHalfBath 1460 non-null int64
38 FullBath 1460 non-null int64
39 HalfBath 1460 non-null int64
40 BedroomAbvGr 1460 non-null int64
41 KitchenAbvGr 1460 non-null int64
42 KitchenQual 1460 non-null object
43 TotRmsAbvGrd 1460 non-null int64
44 Functional 1460 non-null object
45 Fireplaces 1460 non-null int64
46 GarageCars 1460 non-null int64
47 GarageArea 1460 non-null int64
48 PavedDrive 1460 non-null object
49 WoodDeckSF 1460 non-null int64
50 OpenPorchSF 1460 non-null int64
51 EnclosedPorch 1460 non-null int64
52 3SsnPorch 1460 non-null int64
53 ScreenPorch 1460 non-null int64
54 PoolArea 1460 non-null int64
55 MiscVal 1460 non-null int64
56 MoSold 1460 non-null int64
57 YrSold 1460 non-null int64
58 SaleType 1460 non-null object
59 SaleCondition 1460 non-null object
dtypes: int64(33), object(27)
memory usage: 684.5+ KB
from sklearn.ensemble import RandomForestRegressor
# Subset of the int64 columns used as model inputs.
feature_int = [
    'MSSubClass',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
]
x_int = train_data.loc[:, feature_int]
# random_state is pinned so repeated runs build the same forest.
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(X=x_int, y=y)
# Load the test set from disk.
test_file_path = '/work/test.csv'
test_file = pd.read_csv(test_file_path)
# Select the same feature columns, in the same order, that the model was
# fitted on.
val_x = test_file.loc[:, feature_int]
predictions = forest_model.predict(X=val_x)
# Bare expression: shows the predicted values when run as a notebook cell.
predictions