# Use a Random Forest to Predict House Prices
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# ---------------------------------------------------------------------------
# Load the training data.
# ---------------------------------------------------------------------------
train_file_path = '/work/train.csv'
train_file = pd.read_csv(train_file_path)

# Names of the columns that contain no missing values in the training set.
# notna().all() checks every column in a single vectorized pass and yields a
# boolean mask aligned with train_file.columns.
col_complete_value = train_file.columns[train_file.notna().all()].tolist()
print(col_complete_value)
print(len(col_complete_value))

# Limit the training data to the columns with complete values.
train_data = train_file[col_complete_value]

# Prediction target.
y = train_data['SalePrice']

# Wide feature selection. In the code shown here it is only inspected through
# info(); the model further down is fitted on the smaller x_int subset.
x = train_data[[
    'MSSubClass', 'MSZoning', 'LotArea', 'Street',
    'LotShape', 'LandContour', 'Utilities', 'LotConfig',
    'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond',
    'Foundation', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
    'Functional', 'Fireplaces', 'GarageCars', 'GarageArea',
    'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
    '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
    'MoSold', 'YrSold', 'SaleType', 'SaleCondition',
]]
x.info()

# Features actually used to fit the model.
# NOTE(review): the name suggests these are the integer-typed columns;
# confirm against the x.info() output above.
feature_int = [
    'MSSubClass',
    'LotArea',
    'OverallQual',
    'OverallCond',
    'YearBuilt',
    'YearRemodAdd',
]
x_int = train_data[feature_int]

# ---------------------------------------------------------------------------
# Fit the model. random_state is fixed so repeated runs build the same forest.
# ---------------------------------------------------------------------------
forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(x_int, y)

# ---------------------------------------------------------------------------
# Predict on the test file using the same feature columns.
# NOTE(review): feature_int was chosen from columns that are complete in the
# training file; nothing here checks the test file for missing values.
# ---------------------------------------------------------------------------
test_file_path = '/work/test.csv'
test_file = pd.read_csv(test_file_path)
val_x = test_file[feature_int]
predictions = forest_model.predict(val_x)

# Bare expression: shows the result when run as the last line of a notebook
# cell; it has no effect when the file is run as a plain script.
predictions