import pandas as pd
# Load the training data and take a first look at it. The bare info()/
# describe()/head() calls only display something in an interactive session;
# in a plain script their return values are discarded.
housing = pd.read_csv('/work/data/train.csv')
housing.info()
housing.describe()
housing.head()

# Replace missing FireplaceQu values with the literal category 'NA'.
# The result is assigned back to the column on purpose: calling
# fillna(inplace=True) on `housing.FireplaceQu` operates on an intermediate
# Series and is not guaranteed to update `housing` (it is a no-op under
# pandas copy-on-write).
housing['FireplaceQu'] = housing['FireplaceQu'].fillna('NA')
housing['FireplaceQu'].value_counts()

# Remove four columns entirely (presumably because they are mostly empty --
# TODO confirm), then drop every row that still has a missing value.
# NOTE: after this step the index of `housing` is no longer contiguous.
housing = housing.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)
housing = housing.dropna(axis=0)
def select_cols_corr(df_corr, target_col, min_corr, max_corr):
    """Return the entries of one correlation column that lie in an open interval.

    Args:
        df_corr: DataFrame of correlations (e.g. the result of
            ``DataFrame.corr()``), indexed by feature name.
        target_col: Name of the column in ``df_corr`` to filter on.
        min_corr: Exclusive lower bound for the correlation value.
        max_corr: Exclusive upper bound for the correlation value.

    Returns:
        A two-column DataFrame: the former index labels of ``df_corr`` in the
        first column and the values of ``target_col`` in the second, limited
        to rows where ``min_corr < value < max_corr``.
    """
    # reset_index() turns the Series into a frame whose first column holds
    # the old index labels and whose second column holds the correlations.
    target_corr = df_corr[target_col].reset_index()
    corr_values = target_corr.iloc[:, 1]
    in_range = (corr_values < max_corr) & (corr_values > min_corr)
    return target_corr.loc[in_range, :]
# List the features whose correlation with SalePrice is between 0.4 and 0.95
# (both bounds exclusive). numeric_only=True is required because `housing`
# still contains string columns, and pandas >= 2.0 raises on them instead of
# silently skipping them.
select_cols_corr(housing.corr(numeric_only=True), 'SalePrice', min_corr=.4, max_corr=.95)
from sklearn.preprocessing import OneHotEncoder
# Build a one-hot encoder that returns a dense array. The keyword was renamed
# from `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was
# later removed, so try the new spelling first and fall back for old versions.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Categorical columns to be one-hot encoded.
categorical_var = [
    'Electrical', 'CentralAir', 'HeatingQC', 'Heating', 'BsmtFinType2',
    'BsmtFinType1', 'BsmtExposure', 'BsmtCond', 'BsmtQual', 'Foundation',
    'ExterCond', 'ExterQual', 'MasVnrType', 'Exterior2nd', 'Exterior1st',
    'RoofMatl', 'RoofStyle', 'HouseStyle', 'BldgType', 'Condition2',
    'Condition1', 'Neighborhood', 'LandSlope', 'LotConfig', 'Utilities',
    'LandContour', 'LotShape', 'Street', 'MSZoning', 'KitchenQual',
    'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
    'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition',
]

# Reuse housing's own index for the encoded frame. Rows were dropped from
# `housing` earlier, so its index has gaps; a default 0..n-1 index here would
# make the concat below align encoded rows with the wrong houses and create
# extra rows full of NaN.
train_X_encoded = pd.DataFrame(
    encoder.fit_transform(housing[categorical_var]),
    columns=encoder.get_feature_names_out(),
    index=housing.index,
)
train_X_encoded.head()

# Append the one-hot columns to the original frame, row for row.
housing = pd.concat([housing, train_X_encoded], axis=1)
housing.head()
# Without the categorical variables
# Training matrix for the first model: ten hand-picked columns of `housing`.
x_treino = housing.loc[:, [
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'FullBath',
    'Fireplaces',
    'MasVnrArea',
    'TotalBsmtSF',
    '1stFlrSF',
    'GrLivArea',
    'TotRmsAbvGrd',
]]
# Regression target.
y_treino = housing.loc[:, 'SalePrice']
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# First model: a random forest with a fixed seed, evaluated on
# x_treino / y_treino with 10-fold cross-validation (estimator's default scorer).
rs_model = RandomForestRegressor(random_state=1)
scores_rs_model = cross_val_score(
    estimator=rs_model,
    X=x_treino,
    y=y_treino,
    cv=10,
)
# Report the average of the per-fold scores.
print(scores_rs_model.mean())
# With the categorical variables
# Feature list for the second model: the same ten columns used by the first
# model followed by every column of the one-hot encoded frame.
variaveis = [
    'OverallQual',
    'YearBuilt',
    'YearRemodAdd',
    'FullBath',
    'Fireplaces',
    'MasVnrArea',
    'TotalBsmtSF',
    '1stFlrSF',
    'GrLivArea',
    'TotRmsAbvGrd',
    *train_X_encoded.columns,
]
variaveis

# Discard, in place, every row of `housing` that contains a missing value
# before building the training matrices.
housing.dropna(axis=0, inplace=True)
x_treino2 = housing.loc[:, variaveis]
y_treino2 = housing.loc[:, 'SalePrice']
# Per-column count of missing values (shown only in an interactive session).
x_treino2.isna().sum()

# Second model: same estimator settings as the first one.
# NOTE(review): this run uses cv=5 while the first model uses cv=10, so the
# two mean scores are not computed on the same splits -- confirm this is
# intended before comparing them.
rs_model2 = RandomForestRegressor(random_state=1)
scores_rs_model2 = cross_val_score(
    estimator=rs_model2,
    X=x_treino2,
    y=y_treino2,
    cv=5,
)
print(scores_rs_model2.mean())