#!pip install kaggle
#!cp -i /work/kaggle.json /root/.kaggle/kaggle.json
#!kaggle competitions download -c house-prices-advanced-regression-techniques
#!unzip house-prices-advanced-regression-techniques.zip -d /work/dados/
import pandas as pd
# Load the competition training and test sets from the local data folder.
data = pd.read_csv("/work/dados/train.csv")
test_kaggle = pd.read_csv("/work/dados/test.csv")

# Quick look at the training data: first rows, row count, summary statistics.
data.head()
print(data.shape[0])
data.describe()
# Engenharia de Atributos (Feature Engineering)
# Tratamento dos NaNs (Handling missing values)
# Count missing values per column in the training set and in the Kaggle test
# set. Previously the training counts were overwritten by the test counts
# before the drop decision below, so that decision only looked at the test set.
# Both counts are kept side by side here.
null_counts = (
    pd.DataFrame({"train": data.isna().sum(), "test": test_kaggle.isna().sum()})
    .fillna(0)  # a column present in only one frame counts as 0 in the other
    .astype(int)
)
null_counts = null_counts[(null_counts > 0).any(axis=1)]
null_counts

# Worst-case missing count per column, and the list of affected columns.
null_columns_data = null_counts.max(axis=1)
null_columns = null_columns_data.index.tolist()
null_columns_data

# Remove attributes with many blank entries (NaN count >= 100 in either set).
threshold = 100
sparse_columns = null_columns_data[null_columns_data >= threshold].index.tolist()
for column in sparse_columns:
    print(column, null_columns_data[column])
# Drop from both frames so they keep the same features; errors="ignore" covers
# a column that exists in only one of them.
data.drop(columns=sparse_columns, inplace=True, errors="ignore")
test_kaggle.drop(columns=sparse_columns, inplace=True, errors="ignore")
# Split the remaining columns by dtype: object columns are handled as
# categorical, everything else as numeric.
all_cat_cols = list(data.select_dtypes(include=['object']).columns)
all_num_cols = list(data.select_dtypes(exclude=['object']).columns)

# Of those, record the ones that appear in null_columns.
null_cat_cols = [name for name in all_cat_cols if name in null_columns]
nul_num_cols = [name for name in all_num_cols if name in null_columns]
data.head()
# Impute every categorical column with its most frequent training value.
# The same training mode is reused for the Kaggle test set so both frames are
# filled with the same value.
for col in all_cat_cols:
    mode = data[col].mode()[0]
    # Assign the result back instead of calling fillna(inplace=True) on the
    # column selection: that chained in-place form only modifies a temporary
    # object under pandas Copy-on-Write and emits a FutureWarning on pandas 2.x.
    data[col] = data[col].fillna(mode)
    test_kaggle[col] = test_kaggle[col].fillna(mode)
def fillna_num_col(cols, type, train=None, test=None):
    """Impute missing values in numeric columns with a training-set statistic.

    For each column the statistic is computed on the training frame and used
    to fill missing values in the training frame and, when the column exists
    there, in the test frame. Both frames are modified in place.

    Args:
        cols: Iterable of column names to impute.
        type: Statistic to use: "mode", "median" or "mean". (The name shadows
            the builtin; it is kept because callers pass it by keyword.)
        train: Frame the statistic is computed from. Defaults to the
            module-level ``data``.
        test: Second frame filled with the same values. Defaults to the
            module-level ``test_kaggle``.

    Raises:
        ValueError: If ``type`` is not one of the supported statistics.
    """
    train = data if train is None else train
    test = test_kaggle if test is None else test

    statistics = {
        "mode": lambda series: series.mode()[0],
        "median": lambda series: series.median(),
        "mean": lambda series: series.mean(),
    }
    if type not in statistics:
        raise ValueError(
            f"unknown imputation type {type!r}; expected one of {sorted(statistics)}"
        )
    compute = statistics[type]

    for col in cols:
        fill_value = compute(train[col])
        # Assign back rather than fillna(inplace=True) on a column selection,
        # which does not update the frame under pandas Copy-on-Write.
        train[col] = train[col].fillna(fill_value)
        # Columns that only exist in the training frame (such as the target)
        # are skipped for the test frame, whatever the statistic.
        if col in test.columns:
            test[col] = test[col].fillna(fill_value)
# Imputation of the remaining missing numeric values (columns with NaN < 100).
#fillna_num_col(cols=["BsmtFullBath", "BsmtHalfBath"], type="mode")
#fillna_num_col(cols=["GarageCars"], type="median")
fillna_num_col(cols=all_num_cols, type="mean")

# Refresh the categorical column list and look at the frame sizes.
all_cat_cols = list(data.select_dtypes(include=['object']).columns)
all_cat_cols
data.shape
test_kaggle.shape

# Re-check for leftover missing values, first in the training frame ...
train_has_nan = data.isna().any()
null_columns = list(data.columns[train_has_nan])
null_columns_data = data[null_columns].isnull().sum()
null_columns_data

# ... then in the Kaggle test frame (this overwrites the two names above).
test_has_nan = test_kaggle.isna().any()
null_columns = list(test_kaggle.columns[test_has_nan])
null_columns_data = test_kaggle[null_columns].isnull().sum()
null_columns_data
# Mapeamento dos Atributos Categóricos Ordinais (Mapping of ordinal categorical attributes)
from sklearn.preprocessing import LabelEncoder

label = LabelEncoder()
# Fit the encoder once on the GarageType values of both frames and reuse it for
# each of them. Re-fitting on the test frame (as before) only yields the same
# integer codes when both frames happen to contain exactly the same categories.
label.fit(pd.concat([data["GarageType"], test_kaggle["GarageType"]]))
data["GarageType"] = label.transform(data["GarageType"])
test_kaggle["GarageType"] = label.transform(test_kaggle["GarageType"])
# Category -> integer code as assigned by the encoder (codes start at 0).
dict(zip(label.classes_, range(len(label.classes_))))
# Integer codes for the ordinal garage/basement attributes. Each scale is
# defined once and shared by the columns that use it; the string "nan" (a
# stringified missing value) always gets the lowest code.
quality_scale = {"Ex": 6, "Gd": 5, "TA": 4, "Fa": 3, "Po": 2, "nan": 1}
garage_finish_scale = {"Fin": 4, "RFn": 3, "Unf": 2, "nan": 1}
basement_finish_scale = {
    "GLQ": 7,
    "ALQ": 6,
    "BLQ": 5,
    "Rec": 4,
    "LwQ": 3,
    "Unf": 2,
    "nan": 1,
}
basement_exposure_scale = {"Gd": 5, "Av": 4, "Mn": 3, "No": 2, "nan": 1}

ordinal_scales = {
    "GarageCond": quality_scale,
    "GarageFinish": garage_finish_scale,
    "GarageQual": quality_scale,
    "BsmtFinType2": basement_finish_scale,
    "BsmtExposure": basement_exposure_scale,
    "BsmtFinType1": basement_finish_scale,
    "BsmtCond": quality_scale,
    "BsmtQual": quality_scale,
}

# Debug output: show the full GarageCond column before it is encoded.
data["GarageCond"] = data["GarageCond"].astype("str")
test_kaggle["GarageCond"] = test_kaggle["GarageCond"].astype("str")
pd.set_option('display.max_rows', data["GarageCond"].shape[0]+1)
print(data["GarageCond"])

for column, mapping in ordinal_scales.items():
    for frame in (data, test_kaggle):
        # Cast to str in BOTH frames so a missing value is looked up as "nan";
        # previously only the training frame was cast for most columns.
        # An unknown level raises KeyError on purpose instead of becoming NaN.
        frame[column] = frame[column].astype("str").apply(lambda level: mapping[level])
print(data.shape, test_kaggle.shape)
data

# Move the target to the last column, then separate features from target.
price = data["SalePrice"]
data = pd.concat([data.drop(columns="SalePrice"), price], axis=1)
#deploy_data = data.copy(deep=True)
X = data.drop(columns="SalePrice")
y = price.tolist()

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train
# Análise das Matrizes de Correlação (Correlation matrix analysis)
import matplotlib.pyplot as plt
import seaborn as sns


def _plot_correlation_heatmap(frame):
    """Draw the pairwise correlation matrix of *frame* as a diverging heatmap."""
    plt.figure(figsize=(16, 10))
    axes = sns.heatmap(
        frame.corr(),
        vmin=-1, vmax=1, center=0,
        cmap=sns.diverging_palette(20, 220, n=200),
        square=True,
    )
    axes.set_xticklabels(
        axes.get_xticklabels(), rotation=45, horizontalalignment='right'
    )
    plt.show()


# Snapshots of the garage and basement attributes, taken before any of the
# columns below are dropped.
garage_columns = [
    "GarageType", "GarageCond", "GarageYrBlt", "GarageFinish",
    "GarageQual", "GarageCars", "GarageArea",
]
basement_columns = [
    "BsmtFinType2", "BsmtExposure", "BsmtFinType1", "BsmtCond", "BsmtQual",
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
]
df_garage = X_train[garage_columns]
df_basement = X_train[basement_columns]

_plot_correlation_heatmap(df_basement)

# Basement types and square feet are highly correlated, so the basement
# square-feet columns (1 and 2) are removed.
# Unfinished square feet has a high (although inverse) correlation with the
# removed term, so it is removed as well, since it is already represented by
# the basement type, e.g. a basement with a poor type score will probably have
# a high unfinished square feet.
for frame in (X_train, X_test, test_kaggle):
    frame.drop(columns=["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF"], inplace=True)

_plot_correlation_heatmap(df_garage)

# The garage year is not that relevant and is covered by the finish;
# qual is redundant with respect to the finish;
# GarageCars is completely equivalent to GarageArea.
for frame in (X_train, X_test, test_kaggle):
    frame.drop(columns=["GarageYrBlt", "GarageQual", "GarageCars"], inplace=True)

X_test.shape
X_train
y_train
# Training features plus the target in one frame, for correlation analysis.
train_data = X_train.copy(deep=True)
train_data['SalePrice'] = pd.Series(y_train).values
X_train.columns
X_train
#train_data = pd.concat([X_train, pd.DataFrame(y_train)], axis=1)
# give the column the title SalePrice
train_data
import numpy as np
# taken from: https://www.kaggle.com/pmarcelino/comprehensive-data-exploration-with-python
# SalePrice correlation matrix
plt.figure(figsize=(10,10))
k = 10 #number of variables for heatmap
# train_data still contains string (object) columns at this point; pandas >= 2.0
# raises on them unless the correlation is restricted to numeric columns.
corrmat = train_data.corr(numeric_only=True)
# The k columns most correlated with SalePrice (SalePrice itself included).
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(train_data[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()
X_train.columns
# Total basement size is highly correlated with the size of the first floor
# (after all, basements are usually an extension of the first floor, only downward).
X_train.drop(columns = ["TotalBsmtSF"], inplace=True)
X_test.drop(columns = ["TotalBsmtSF"], inplace=True)
test_kaggle.drop(columns = ["TotalBsmtSF"], inplace=True)
# Continuação dos Mapeamentos nos Atributos Categóricos Ordinais (Ordinal categorical mappings, continued)
# MSSubClass is converted to strings in every frame so it can be label-encoded
# further down. Each frame converts its OWN column: the previous code assigned
# X_train's column to X_test, and because pandas aligns on the index, X_test
# did not receive its own values (NaN wherever the row label was absent).
for frame in (X_train, X_test, test_kaggle):
    frame["MSSubClass"] = frame["MSSubClass"].astype("str")

# Integer codes for the remaining ordinal attributes, one scale per column.
exterior_quality_scale = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1}
#df_test["Utilities"].fillna("AllPub")
ordinal_maps = {
    "LotShape": {"Reg": 4, "IR1": 3, "IR2": 2, "IR3": 1},
    "Utilities": {"AllPub": 4, "NoSewr": 3, "NoSeWa": 2, "ELO": 1, np.nan: 0},
    "LandSlope": {"Gtl": 3, "Mod": 2, "Sev": 1},
    "ExterQual": exterior_quality_scale,
    "ExterCond": exterior_quality_scale,
    "KitchenQual": exterior_quality_scale,
    "Functional": {
        "Typ": 8, "Min1": 7, "Min2": 6, "Mod": 5,
        "Maj1": 4, "Maj2": 3, "Sev": 2, "Sal": 1,
    },
    "PavedDrive": {"Y": 3, "P": 2, "N": 1},
}

for column, mapping in ordinal_maps.items():
    for frame in (X_train, X_test, test_kaggle):
        # An unknown level raises KeyError on purpose instead of becoming NaN.
        frame[column] = frame[column].apply(lambda level: mapping[level])
# Mapeamento dos Atributos Categóricos Nominais (Mapping of nominal categorical attributes)
# Nominal attributes that are turned into integer codes.
nominal_columns = [
    "MSSubClass", "MSZoning", "Street", "LandContour", "LotConfig",
    "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "HeatingQC", "Foundation", "Heating", "CentralAir", "Electrical",
    "SaleType", "SaleCondition",
]
# NOTE(review): HeatingQC uses the same level names as the quality scales
# (Ex/Gd/TA/Fa/Po) but is encoded alphabetically here - confirm this is intended.

encoded_frames = (X_train, X_test, test_kaggle)
for column in nominal_columns:
    # Fit ONE encoder per column on the values of all three frames, then apply
    # it to each of them. Calling fit_transform separately per frame (as
    # before) assigns codes from each frame's own category set, so the same
    # integer could stand for different categories in different frames.
    # astype(str) keeps the values sortable even if a frame contains NaN.
    all_values = pd.concat([frame[column] for frame in encoded_frames]).astype(str)
    label.fit(all_values)
    for frame in encoded_frames:
        frame[column] = label.transform(frame[column].astype(str))

X_train.info()
#data = data.reset_index()
# Codificação One-Hot (One-hot encoding)
X_train.columns
# The row identifier is not a feature.
for frame in (X_train, X_test, test_kaggle):
    frame.drop(columns=["Id"], inplace=True)
X_train.head()

# Tag every row with the frame it came from (0 = train split, 1 = validation
# split, 2 = Kaggle test) and stack the frames, so that get_dummies creates the
# same columns for all of them.
X_train['isTest'] = 0
X_test['isTest'] = 1
train_and_test = pd.concat([X_train, X_test], ignore_index=True)
train_and_test
print(train_and_test.shape, test_kaggle.shape)
test_kaggle['isTest'] = 2
train_and_test = pd.concat([train_and_test, test_kaggle], ignore_index=True)

# These attributes have no notion of order, so they are converted to one-hot.
one_hot_columns = [
    "MSSubClass", "MSZoning", "Street", "LandContour", "LotConfig",
    "Neighborhood", "Condition1", "Condition2", "BldgType",
    "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
    "MasVnrType", "Foundation", "Heating", "Electrical",
    "GarageType", "SaleType", "SaleCondition",
]
train_and_test = pd.get_dummies(train_and_test, columns=one_hot_columns)

# Split the stacked frame back by origin tag and discard the tag.
origin = train_and_test['isTest']
X_train = train_and_test[origin == 0].drop(columns=["isTest"])
X_test = train_and_test[origin == 1].drop(columns=["isTest"])
test_kaggle = train_and_test[origin == 2].drop(columns=["isTest"])
# train_and_test keeps the train + validation rows only.
train_and_test = train_and_test[origin != 2].drop(columns=["isTest"])
print(train_and_test.shape, test_kaggle.shape)
# Remove attributes whose value is zero in more than 90% of the training rows.
total = X_train.shape[0]
threshold = 0.9
drop_list = [
    name
    for name in X_train.columns
    if (X_train[name] == 0).sum() / total > threshold
]
# The decision is made on X_train only and applied to every frame.
for frame in (X_train, X_test, test_kaggle, train_and_test):
    frame.drop(columns=drop_list, inplace=True)
X_test.head()
X_train.head()
# Store the GarageCond codes as unsigned 8-bit integers in the three frames.
for frame in (X_train, X_test, test_kaggle):
    frame["GarageCond"] = frame["GarageCond"].astype("uint8")
# Experimentos com Modelos (Model experiments)
# Inspect the training feature matrix before modelling (info() is printed twice,
# with the frame itself displayed in between).
X_train.info()
X_train
X_train.info()
# Modelos Candidatos (Candidate models)
# Notebook shell magic, kept as a comment like the kaggle setup lines at the top
# of the file: a bare "!pip ..." line is not valid Python outside a notebook.
# Install xgboost manually before running this script.
#!pip install xgboost
X_train.columns
X_test.columns
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn import linear_model
# Candidate regressors to compare.
MLA = [
    DecisionTreeRegressor(),
    LinearRegression(),
    XGBRegressor(),
    RandomForestRegressor(),
    MLPRegressor(random_state=1, max_iter=500),
    linear_model.SGDRegressor(
        max_iter=1000, penalty="elasticnet", loss='huber', tol=1e-3, average=True
    ),
]

# Fit every candidate on the training split and score it on the hold-out split.
# One record per model is collected and the table is built in a single step,
# instead of growing an empty DataFrame cell by cell with a manual row counter.
comparison_rows = []
for model in MLA:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    comparison_rows.append({
        'MLA Name': model.__class__.__name__,
        'R2 Score': r2_score(y_test, y_pred),
        # abs() keeps the predictions passed to the log-based metric non-negative.
        'MSL': mean_squared_log_error(y_test, abs(y_pred)),
        'MSE': mean_squared_error(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
    })

MLA_compare = pd.DataFrame(
    comparison_rows, columns=['MLA Name', 'R2 Score', 'MSL', 'MSE', 'MAE']
)
MLA_compare
# Grid search kept inside a string literal so that it is not executed on every run.
# The literal below is a bare expression statement: it is evaluated and discarded.
'''
from sklearn.model_selection import GridSearchCV
xgb1 = XGBRegressor()
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
'objective':['reg:linear'],
'learning_rate': [.03, 0.05, .07], #so called `eta` value
'max_depth': [5, 6, 7],
'min_child_weight': [4],
'silent': [1],
'subsample': [0.7],
'colsample_bytree': [0.7],
'n_estimators': [500]}
xgb_grid = GridSearchCV(xgb1,
parameters,
cv = 2,
n_jobs = 5,
verbose=True)
xgb_grid.fit(X_train, y_train)
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)
'''
# Presumably the best_score_ and best_params_ printed by an earlier run of the
# grid search above - TODO confirm.
#0.8616263070651717
#{'colsample_bytree': 0.7, 'learning_rate': 0.03, 'max_depth': 5, 'min_child_weight': 4, 'n_estimators': 500, 'nthread': 4, 'objective': 'reg:linear', 'silent': 1, 'subsample': 0.7}
# Hyper-parameters for the tuned model, defined once and shared by the
# evaluation fit and the production fit below.
# 'reg:squarederror' replaces the deprecated objective name 'reg:linear',
# verbosity=0 replaces the deprecated silent=1, and n_jobs replaces nthread.
xgb_params = dict(
    colsample_bytree=0.7,
    learning_rate=0.03,
    max_depth=5,
    min_child_weight=4,
    n_estimators=500,
    n_jobs=4,
    objective='reg:squarederror',
    verbosity=0,
    subsample=0.7,
)

# Evaluate the tuned model on the hold-out split.
grid_result = XGBRegressor(**xgb_params)
grid_result.fit(X_train, y_train)
y_pred = grid_result.predict(X_test)
print(r2_score(y_test, y_pred))
print(mean_squared_error(y_test, y_pred))
print(mean_absolute_error(y_test, y_pred))
# abs() keeps the predictions passed to the log-based metric non-negative.
print(mean_squared_log_error(y_test, abs(y_pred)))

# Refit on all labelled rows. train_and_test holds the train rows followed by
# the validation rows, which is the order of y_train + y_test (both are lists).
y_all = y_train + y_test
X_train = train_and_test
y_train = y_all
train_and_test.shape
len(y_train)

### Production
optimal_model = XGBRegressor(**xgb_params)
optimal_model.fit(X_train, y_train)
y_pred = optimal_model.predict(test_kaggle)

# Spot-check the predictions for the first and the last Kaggle test row.
test_kaggle.head(1)
test_pred = optimal_model.predict(test_kaggle.head(1))
test_pred
test_kaggle.tail(1)
test_pred = optimal_model.predict(test_kaggle.tail(1))
test_pred
# The Id column was dropped from test_kaggle earlier, so the raw test file is
# read again to recover the ids for the submission.
test_csv = pd.read_csv("/work/dados/test.csv")
submission_ids = test_csv["Id"].to_numpy()
# Predictions are converted to integers before being written out.
submission_prices = y_pred.astype(int)
submission = pd.DataFrame({'Id': submission_ids, 'SalePrice': submission_prices})
submission.to_csv('my_submission.csv', index=False)