# Library imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Kaggle "House Prices: Advanced Regression Techniques" training data.
df = pd.read_csv('/work/input/house-prices-advanced-regression-techniques/train.csv', sep=',')
df.head()
# Helper to inspect which columns contain missing values.
def check_nulls(df):
    """Return a DataFrame describing the columns of *df* that contain nulls.

    Result columns: 'coluna' (column name), 'nans' (count of null entries)
    and 'frac_nans' (fraction of rows that are null), restricted to columns
    with at least one null and sorted by 'nans' descending.
    """
    # Compute the null counts once, vectorized — the original recomputed
    # df[column].isnull().sum() twice per column inside a Python loop.
    nans = df.isnull().sum()
    res = pd.DataFrame({
        'coluna': nans.index,
        'nans': nans.values,
        'frac_nans': nans.values / df.shape[0],
    })
    return res[res.nans > 0].sort_values('nans', ascending=False)
check_nulls(df)
# Drop LotFrontage: it has many nulls, and its information is partially
# contained in the lot area.
df_fill_na = df.drop(['LotFrontage'], axis=1)
# Drop rows with no garage information, since they are only a few samples.
df_fill_na = df_fill_na[~df_fill_na["GarageType"].isnull()]
# Fill the remaining nulls in text columns with the literal "NA" value
# documented by the dataset (meaning "feature absent").
for name in df_fill_na.select_dtypes("object"):
    df_fill_na[name] = df_fill_na[name].fillna("NA")
df_fill_na.head()
check_nulls(df_fill_na)
# Fill missing MasVnrArea with 0 (assuming the area simply does not exist).
df_fill_na["MasVnrArea"] = df_fill_na["MasVnrArea"].fillna(0)
check_nulls(df_fill_na)
# Drop near-constant columns (value counts observed on the training data):
# Street => {'Pave': 1454, 'Grvl': 6}
# Utilities => {'AllPub': 1459, 'NoSeWa': 1}
# Condition2 => 98% normal {'Norm': 1445, 'Feedr': 6, 'PosN': 2, 'RRNn': 2, 'Artery': 2, 'PosA': 1, 'RRAn': 1, 'RRAe': 1}
df_fill_na = df_fill_na.drop(["Street"], axis=1)
df_fill_na = df_fill_na.drop(["Utilities"], axis=1)
df_fill_na = df_fill_na.drop(["Condition2"], axis=1)
# Replace nominal features with integer values
replace_dict ={
"quality": {
"Ex": 5,
"Gd": 4,
"TA": 3,
"Fa": 2,
"Po": 1,
"NA": 0,
},
"lot_shape": {
"Reg": 4,
"IR1": 3,
"IR2": 2,
"IR3": 1,
},
"land_slope": {
"Gtl": 2,
"Gt2": 1,
"Gt3": 0,
},
"boolean": {
'Y': 1,
'N': 0
},
"bsmt_exposure": {
'Gd': 4,
'Av': 3,
'Mn': 2,
'No': 1,
'NA': 0,
},
"bsmt_fin_type": {
'GLQ': 6,
'ALQ': 5,
'BLQ': 4,
'Rec': 3,
'LwQ': 2,
'Unf': 1,
'NA': 0
},
"functional": {
'Typ': 7,
'Min1': 6,
'Min2': 5,
'Mod': 4,
'Maj1': 3,
'Maj2': 2,
'Sev': 1,
'Sal': 0
},
"garage_finish": {
'Fin': 3,
'RFn': 2,
'Unf': 1,
'NA': 0,
},
"fence": {
'GdPrv': 4,
'MnPrv': 3,
'GdWo': 2,
'MnWw': 1,
'NA': 0
},
"paved_drive": {
'Y': 2,
'P': 1,
'N': 0
},
"quality_no_na":{
"Ex": 5,
"Gd": 4,
"TA": 3,
"Fa": 2,
"Po": 1
},
"lotShape":{
"Reg":3,
"IR1":2,
"IR2":1,
"IR3":0
},
"utilities":{
"AllPub":3,
"NoSewr":2,
"NoSeWa":1,
"ELO":0
}
}
# Instala e importa o modulo de category encoder para coficar em strings binárias
!pip install category_encoders
import category_encoders as ce
Requirement already satisfied: category_encoders in /root/venv/lib/python3.7/site-packages (2.3.0)
Requirement already satisfied: scipy>=1.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from category_encoders) (1.7.1)
Requirement already satisfied: patsy>=0.5.1 in /root/venv/lib/python3.7/site-packages (from category_encoders) (0.5.2)
Requirement already satisfied: pandas>=0.21.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from category_encoders) (1.2.5)
Requirement already satisfied: scikit-learn>=0.20.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from category_encoders) (1.0)
Requirement already satisfied: statsmodels>=0.9.0 in /root/venv/lib/python3.7/site-packages (from category_encoders) (0.13.0)
Requirement already satisfied: numpy>=1.14.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from category_encoders) (1.19.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.21.1->category_encoders) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.21.1->category_encoders) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.1->category_encoders) (1.16.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn>=0.20.0->category_encoders) (3.0.0)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn>=0.20.0->category_encoders) (1.1.0)
WARNING: You are using pip version 21.2.4; however, version 21.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
replaced_df = df_fill_na.copy()
# Replace ordinal categorical codes with integers. NOTE(review): each mapping
# is applied DataFrame-wide, not per-column, so shared codes (e.g. 'Gd', 'Y')
# are rewritten wherever they occur, with earlier mappings taking precedence.
for label in replace_dict:
    replace = replace_dict[label]
    replaced_df = replaced_df.replace(replace)
# Binary-encode the remaining textual columns: each category becomes a binary
# string and each bit becomes its own column.
category_columns = replaced_df.select_dtypes("object").columns
encoder = ce.BinaryEncoder(cols=category_columns)
binarizado_df = encoder.fit_transform(replaced_df)
binarizado_df
# Alternative encoding: one-hot the columns that are still textual.
category_columns = replaced_df.select_dtypes("object").columns
df_one_hot = pd.get_dummies(replaced_df, columns=category_columns)
df_one_hot
check_nulls(df_one_hot)
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Separate the feature matrix from the SalePrice target, for both encodings.
X = df_one_hot.drop("SalePrice", axis=1)
X_bin = binarizado_df.drop("SalePrice", axis=1)
y = df_one_hot["SalePrice"]
y_bin = binarizado_df["SalePrice"]
y.head()
# Split into train and test sets (same seed so both encodings get the same rows).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
X_bin_train, X_bin_test, y_bin_train, y_bin_test = train_test_split(X_bin, y_bin, test_size=0.3, random_state=1) # 70% training and 30% test
import numpy as np
# Initialize one regression tree per encoding.
tree_model = DecisionTreeRegressor(random_state=42)
tree_model_bin = DecisionTreeRegressor(random_state=42)
# Fit on the training data.
tree_model.fit(X_train, y_train)
tree_model_bin.fit(X_bin_train, y_bin_train)
# Predict on the held-out test sets.
y_pred = tree_model.predict(X_test)
y_bin_pred = tree_model_bin.predict(X_bin_test)
# Report the metrics for each encoding.
# NOTE(review): squared=False makes mean_squared_error return the RMSE,
# so the `mse` variables below actually hold RMSE values.
mse = mean_squared_error(y_test, y_pred, squared=False)
mape = np.mean(np.abs((y_test - y_pred)/y_test))*100
print("One hot encoding: ",mape, mse)
mse = mean_squared_error(y_bin_test, y_bin_pred, squared=False)
mape = np.mean(np.abs((y_bin_test - y_bin_pred)/y_bin_test))*100
print("Binarization: ",mape, mse)
One hot encoding: 15.490235313287323 47083.11587342369
Binarization: 14.792065991758266 44178.29122425594
from sklearn.ensemble import RandomForestRegressor
# Random forest models (default squared-error criterion), one per encoding.
rfce_model = RandomForestRegressor(random_state=42, n_estimators = 100)
rfce_model_bin = RandomForestRegressor(random_state=42, n_estimators = 100)
# Fit on the training data.
rfce_model.fit(X_train, y_train)
rfce_model_bin.fit(X_bin_train, y_bin_train)
# Predict on the held-out test sets.
rfce_y_pred = rfce_model.predict(X_test)
rfce_y_bin_pred = rfce_model_bin.predict(X_bin_test)
# Report RMSE (squared=False) and MAPE for each encoding.
rfce_mse = mean_squared_error(y_test, rfce_y_pred, squared=False)
rfce_mape = np.mean(np.abs((y_test - rfce_y_pred)/y_test))*100
print("One hot encoding: ",rfce_mape, rfce_mse)
rfce_mse = mean_squared_error(y_bin_test, rfce_y_bin_pred, squared=False)
rfce_mape = np.mean(np.abs((y_bin_test - rfce_y_bin_pred)/y_bin_test))*100
print("Binarization: ",rfce_mape, rfce_mse)
One hot encoding: 11.166492577553734 39215.549445777855
Binarization: 11.225056560953462 40289.71189130177
# Random forest variant using the MAE ('absolute_error') split criterion.
rfcg_model = RandomForestRegressor(criterion = 'absolute_error', random_state=42, n_estimators = 100)
rfcg_model_bin = RandomForestRegressor(criterion = 'absolute_error', random_state=42, n_estimators = 100)
# Fit on the training data.
rfcg_model.fit(X_train, y_train)
rfcg_model_bin.fit(X_bin_train, y_bin_train)
# Predict on the held-out test sets.
rfcg_y_pred = rfcg_model.predict(X_test)
rfcg_y_bin_pred = rfcg_model_bin.predict(X_bin_test)
# Report RMSE (squared=False) and MAPE for each encoding.
rfcg_mse = mean_squared_error(y_test, rfcg_y_pred, squared=False)
rfcg_mape = np.mean(np.abs((y_test - rfcg_y_pred)/y_test))*100
print("One hot encoding: ",rfcg_mape, rfcg_mse)
rfcg_mse = mean_squared_error(y_bin_test, rfcg_y_bin_pred, squared=False)
rfcg_mape = np.mean(np.abs((y_bin_test - rfcg_y_bin_pred)/y_bin_test))*100
print("Binarization: ",rfcg_mape, rfcg_mse)
One hot encoding: 10.931671147043067 38851.00049711633
Binarization: 11.05995509464024 38307.25008646592
try:
import xgboost as xgb
except:
!pip install xgboost
import xgboost as xgb
# Gradient-boosted tree regressors, one per encoding.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg_bin = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 100)
# Fit on the training data.
xg_reg.fit(X_train, y_train)
xg_reg_bin.fit(X_bin_train, y_bin_train)
# Predict on the held-out test sets.
xg_y_pred = xg_reg.predict(X_test)
xg_y_bin_pred = xg_reg_bin.predict(X_bin_test)
# Report RMSE (squared=False) and MAPE for each encoding.
xg_mse = mean_squared_error(y_test, xg_y_pred, squared=False)
xg_mape = np.mean(np.abs((y_test - xg_y_pred)/y_test))*100
print("One hot encoding: ",xg_mape, xg_mse)
xg_mse = mean_squared_error(y_bin_test, xg_y_bin_pred, squared=False)
xg_mape = np.mean(np.abs((y_bin_test - xg_y_bin_pred)/y_bin_test))*100
print("Binarization: ",xg_mape, xg_mse)
One hot encoding: 9.992614291323196 38688.17174858294
Binarization: 10.176256650571306 38085.50223698712
# PyTorch imports for the neural-network experiment.
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
from sklearn.preprocessing import StandardScaler
# Fix the RNG seed for reproducibility.
torch.manual_seed(42)
# Adapts (features, targets) data to the PyTorch Dataset protocol.
class Dataset(torch.utils.data.Dataset):
    """Wrap feature/target arrays as tensors, standard-scaling the features."""

    def __init__(self, X, y):
        already_tensors = torch.is_tensor(X) or torch.is_tensor(y)
        if not already_tensors:
            # Standardize features (zero mean, unit variance).
            # NOTE(review): the scaler is fit per-Dataset, so train and test
            # sets are scaled independently — confirm this is intended.
            X = StandardScaler().fit_transform(X)
            self.X = torch.from_numpy(X)
            self.y = torch.from_numpy(np.array(y))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]
validation_size = int(0.1 * len(X_train))  # validation = 10% of the training data
training_size = len(X_train) - validation_size
# BUG FIX: the original called random_split separately on X_train and y_train
# (so the two splits used different random index sets and X/y were not
# aligned) and then built the Datasets from `train_X.dataset` etc., which is
# the *full* underlying data — meaning train and validation both contained
# every sample and no data was actually held out. Split a single combined
# Dataset instead, so features and targets stay paired and the two subsets
# are disjoint.
full_train_dataset = Dataset(X_train, y_train)
train_dataset, validation_dataset = random_split(full_train_dataset, [training_size, validation_size])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=1)
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=10, shuffle=False, num_workers=1)
test_dataset = Dataset(X_test, y_test)
# Number of input features for the MLP (original assigned this twice).
input_layer = len(X_train.columns)
input_layer
class MLP(nn.Module):
    """Fully-connected regression network: input -> 64 -> 32 -> 1."""

    def __init__(self):
        super().__init__()
        # Two ReLU hidden layers feeding a single linear output unit
        # (the predicted sale price). `input_layer` is the module-level
        # feature count computed from the training matrix.
        stack = [
            nn.Linear(input_layer, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Forward pass: map a batch of feature rows to predictions."""
        return self.layers(x)
# Instantiate the MLP.
mlp = MLP()
# Define the loss function and optimizer.
loss_function = nn.L1Loss() # MAE loss
optimizer = torch.optim.SGD(mlp.parameters(), lr=1e-3)
import copy
# Per-epoch loss history, consumed by plot_losses below.
history = {
    'train_loss': list(),
    'validation_loss': list(),
}
# Track the best (lowest validation loss) weights seen during training.
min_loss = float("inf")
best_model = copy.deepcopy(mlp.state_dict())
# Run the training loop: 40 epochs of SGD on MAE loss, checkpointing the
# weights with the lowest validation loss.
for epoch in range(0, 40):
    print(f'Starting epoch {epoch+1}')
    # Reset the running loss accumulator.
    current_loss = 0.0
    mlp.train()
    # Iterate over the training batches.
    for inputs, targets in train_loader:
        # Cast inputs/targets to float and make targets a column vector
        # to match the (batch, 1) model output.
        inputs, targets = inputs.float(), targets.float()
        targets = targets.reshape((targets.shape[0], 1))
        # Zero the gradients.
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            # Forward pass.
            outputs = mlp(inputs)
            # Compute the loss.
            loss = loss_function(outputs, targets)
            # Back-propagate.
            loss.backward()
            # Optimizer step.
            optimizer.step()
            current_loss += loss.item()
    # Mean training loss over the epoch's batches.
    train_loss = current_loss / len(train_loader)
    current_loss = 0.0
    mlp.eval()
    # Iterate over the validation batches (no optimizer step).
    for inputs, targets in validation_loader:
        # Cast inputs/targets to float.
        inputs, targets = inputs.float(), targets.float()
        targets = targets.reshape((targets.shape[0], 1))
        outputs = mlp(inputs)
        # Compute the validation loss.
        loss = loss_function(outputs, targets)
        current_loss += loss.item()
    # Mean validation loss over the epoch's batches.
    validation_loss = current_loss / len(validation_loader)
    current_loss = 0.0
    # Checkpoint the best weights by validation loss.
    if validation_loss < min_loss:
        best_model = copy.deepcopy(mlp.state_dict())
        min_loss = validation_loss
    print(f"Training loss: {train_loss}")
    print(f"Validation loss: {validation_loss}\n")
    history['train_loss'].append(train_loss)
    history['validation_loss'].append(validation_loss)
# After training, restore the best checkpoint found.
mlp.load_state_dict(best_model)
# Process is complete — report the best validation loss observed.
print('Training process has finished.')
# FIX: corrected typo in the message ("Mininum" -> "Minimum").
print(f"Minimum loss found: {min_loss}")
Starting epoch 1
Training loss: 184072.85083762885
Validation loss: 183908.947003866
Starting epoch 2
Training loss: 183932.04719716494
Validation loss: 183908.75144974227
Starting epoch 3
Training loss: 184278.7474226804
Validation loss: 183908.52126288658
Starting epoch 4
Training loss: 183937.25080541236
Validation loss: 183908.22905927835
Starting epoch 5
Training loss: 184592.06990979382
Validation loss: 183907.8394007732
Starting epoch 6
Training loss: 184238.5980992268
Validation loss: 183907.28044458764
Starting epoch 7
Training loss: 184012.56797680413
Validation loss: 183906.4238079897
Starting epoch 8
Training loss: 184028.37290592783
Validation loss: 183904.99484536084
Starting epoch 9
Training loss: 183780.5401095361
Validation loss: 183902.31556056702
Starting epoch 10
Training loss: 184086.54824420103
Validation loss: 183896.31539948453
Starting epoch 11
Training loss: 183940.36146907217
Validation loss: 183877.90850515463
Starting epoch 12
Training loss: 184027.04655283506
Validation loss: 183772.5219072165
Starting epoch 13
Training loss: 183337.7866462629
Validation loss: 179115.7213273196
Starting epoch 14
Training loss: 81555.39171230671
Validation loss: 41723.375
Starting epoch 15
Training loss: 35227.62751691366
Validation loss: 32200.297247503222
Starting epoch 16
Training loss: 32650.937942976805
Validation loss: 28364.669911807345
Starting epoch 17
Training loss: 28810.398950950388
Validation loss: 34710.0316929768
Starting epoch 18
Training loss: 28869.805019732605
Validation loss: 22334.62491945876
Starting epoch 19
Training loss: 25539.558412532217
Validation loss: 28953.932083601805
Starting epoch 20
Training loss: 26160.280112355027
Validation loss: 28120.993113724227
Starting epoch 21
Training loss: 25733.084689110823
Validation loss: 27653.4255597616
Starting epoch 22
Training loss: 25828.215206185567
Validation loss: 45389.884685083765
Starting epoch 23
Training loss: 25573.944517155283
Validation loss: 37475.09139416881
Starting epoch 24
Training loss: 25044.7549935567
Validation loss: 37597.79900934279
Starting epoch 25
Training loss: 25960.73656974871
Validation loss: 30481.29253584085
Starting epoch 26
Training loss: 26564.654236469072
Validation loss: 16736.90443278834
Starting epoch 27
Training loss: 26336.26456789626
Validation loss: 16749.60273034794
Starting epoch 28
Training loss: 27204.405202963917
Validation loss: 34736.911233489045
Starting epoch 29
Training loss: 24179.01186976482
Validation loss: 42493.502698131444
Starting epoch 30
Training loss: 24941.272833440722
Validation loss: 23359.01500080541
Starting epoch 31
Training loss: 26637.50974548969
Validation loss: 33649.80400289949
Starting epoch 32
Training loss: 25680.09298485825
Validation loss: 30054.359838112112
Starting epoch 33
Training loss: 24704.07311130799
Validation loss: 36723.532236630155
Starting epoch 34
Training loss: 24549.72046150129
Validation loss: 19785.4258919942
Starting epoch 35
Training loss: 26113.322149846972
Validation loss: 37095.911163015466
Starting epoch 36
Training loss: 25189.418542606316
Validation loss: 25444.017578125
Starting epoch 37
Training loss: 26082.08270578286
Validation loss: 18806.809952883377
Starting epoch 38
Training loss: 24832.176707474227
Validation loss: 31265.759866301545
Starting epoch 39
Training loss: 24648.283439714884
Validation loss: 21279.72417646585
Starting epoch 40
Training loss: 27756.952178640466
Validation loss: 37726.55330823131
Training process has finished.
Mininum loss found: 16736.90443278834
def plot_losses(history):
    """Plot the per-epoch training (red) and validation (blue) loss curves."""
    train_curve = history["train_loss"]
    valid_curve = history["validation_loss"]
    plt.figure(figsize=(20, 8), dpi=80)
    # Draw both curves; colors match the legend order below.
    for curve, fmt in ((train_curve, "-r"), (valid_curve, "-b")):
        plt.plot(curve, fmt)
    # One tick per epoch.
    plt.xticks(range(len(train_curve)))
    plt.legend(["Training loss", "Validation loss"])
    plt.xlabel("Epochs")
    plt.ylabel("Losses")
plot_losses(history)
# Evaluate the MLP on the held-out test set using the same MAPE/RMSE metrics
# as the tree models above.
predicted = mlp(test_dataset.X.float())
y_pred = predicted.cpu().detach().numpy()[:, 0]
y_true = test_dataset.y.tolist()
mape = np.mean(np.abs((y_true - y_pred)/y_true))*100
rmse = mean_squared_error(y_true, y_pred, squared=False)
print(mape, rmse)
15.703894699555335 51518.22043040924
# Load the Kaggle test set and apply the same preprocessing as for training.
df_test = pd.read_csv('/work/input/house-prices-advanced-regression-techniques/test.csv', sep=',')
df_test.head()
df_test_fill_na = df_test.drop(['LotFrontage'], axis=1)
# (Unlike training, rows without garage info are kept so every Id receives a prediction.)
# df_test_fill_na = df_test_fill_na[~df_test_fill_na["GarageType"].isnull()]
for name in df_test_fill_na.select_dtypes("object"):
    df_test_fill_na[name] = df_test_fill_na[name].fillna("NA")
df_test_fill_na["MasVnrArea"] = df_test_fill_na["MasVnrArea"].fillna(0)
df_test_fill_na = df_test_fill_na.drop(["Street"], axis=1)
df_test_fill_na = df_test_fill_na.drop(["Utilities"], axis=1)
df_test_fill_na = df_test_fill_na.drop(["Condition2"], axis=1)
replaced_test_df = df_test_fill_na.copy()
# Apply the same ordinal replacements as on the training data.
for label in replace_dict:
    replace = replace_dict[label]
    replaced_test_df = replaced_test_df.replace(replace)
category_columns = replaced_test_df.select_dtypes("object").columns
df_test_one_hot = pd.get_dummies(replaced_test_df, columns=category_columns)
df_test_one_hot.head()
# Compare the one-hot column sets between train and test: categories present
# in only one of the two sets produce mismatched dummy columns.
columns_in_test = set(df_test_one_hot.columns.tolist()) - set(X.columns.tolist())
columns_in_train = set(X.columns.tolist()) - set(df_test_one_hot.columns.tolist())
print("Columns missing in test dataset: ")
print(columns_in_train)
print("Columns missing in train dataset: ")
print(columns_in_test)
print("Difference: ", len(columns_in_train) - len(columns_in_test))
Columns missing in test dataset:
{'Electrical_Mix', 'Heating_Floor', 'RoofMatl_Metal', 'Exterior1st_ImStucc', 'Electrical_0', 'RoofMatl_ClyTile', 'Exterior2nd_Other', 'Heating_OthW', 'Exterior1st_Stone', 'RoofMatl_Membran', 'MiscFeature_TenC', 'RoofMatl_Roll', 'HouseStyle_2.5Fin'}
Columns missing in train dataset:
{'SaleType_0', 'Exterior1st_AsphShn', 'Exterior1st_0', 'GarageType_0', 'MSZoning_0', 'Exterior2nd_0'}
Difference: 7
# Align the test matrix with the training matrix: drop one-hot columns that
# only appear in the test set...
df_test_drop = df_test_one_hot.copy()
for column in columns_in_test:
    df_test_drop = df_test_drop.drop(column, axis=1)
# ...and add, as all-zero, the columns that only appear in training.
df_test_add = df_test_drop.copy()
for column in columns_in_train:
    df_test_add[column] = 0
# Reorder columns to match the training column order.
train_columns = X.columns.tolist()
test_df = df_test_add[train_columns]
test_df.head()
# Retrain XGBoost on the full training data and generate the submission.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg.fit(X, y)
xg_y_pred = xg_reg.predict(test_df)
df_test["SalePrice"] = xg_y_pred
result = df_test[["Id", "SalePrice"]]
result
result.to_csv("result.csv", index=False)