# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/work/input/house-prices-advanced-regression-techniques/train.csv', sep=',')
df.head()
# Helper function to check for missing values
def check_nulls(df):
    rows = []
    for column in df.columns:
        row = {'coluna': column,
               'nans': df[column].isnull().sum(),
               'frac_nans': df[column].isnull().sum() / df.shape[0]}
        rows.append(row)
    res = pd.DataFrame(rows)
    return res[res.nans > 0].sort_values('nans', ascending=False)
check_nulls(df)
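# For reference, the same summary can be computed without an explicit loop
# (an equivalent vectorized sketch, same output columns):
def check_nulls_vectorized(df):
    nans = df.isnull().sum()
    res = pd.DataFrame({'coluna': nans.index, 'nans': nans.values,
                        'frac_nans': nans.values / len(df)})
    return res[res.nans > 0].sort_values('nans', ascending=False)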
# Drop LotFrontage: it has many null values, and its information is partially
# captured by the lot area
df_fill_na = df.drop(['LotFrontage'], axis=1)
# Drop rows with no garage information, since they are only a few samples
df_fill_na = df_fill_na[~df_fill_na["GarageType"].isnull()]
# Fill the remaining nulls with the value "NA" described in the dataset documentation
for name in df_fill_na.select_dtypes("object"):
    df_fill_na[name] = df_fill_na[name].fillna("NA")
df_fill_na.head()
check_nulls(df_fill_na)
# Fill the missing areas with 0 (assuming the area simply does not exist)
df_fill_na["MasVnrArea"] = df_fill_na["MasVnrArea"].fillna(0)
check_nulls(df_fill_na)
# These features are nearly constant, so they carry almost no information:
# Street => {'Pave': 1454, 'Grvl': 6}
# Utilities => {'AllPub': 1459, 'NoSeWa': 1}
# Condition2 => 98% 'Norm' {'Norm': 1445, 'Feedr': 6, 'PosN': 2, 'RRNn': 2, 'Artery': 2, 'PosA': 1, 'RRAn': 1, 'RRAe': 1}
df_fill_na = df_fill_na.drop(["Street"], axis=1)
df_fill_na = df_fill_na.drop(["Utilities"], axis=1)
df_fill_na = df_fill_na.drop(["Condition2"], axis=1)
# Replace ordinal categorical features with integer values
replace_dict = {
    "quality": {
        "Ex": 5,
        "Gd": 4,
        "TA": 3,
        "Fa": 2,
        "Po": 1,
        "NA": 0,
    },
    "lot_shape": {
        "Reg": 4,
        "IR1": 3,
        "IR2": 2,
        "IR3": 1,
    },
    # LandSlope takes the values Gtl/Mod/Sev; the original 'Gt2'/'Gt3' keys do
    # not occur in the data. Note that 'Mod' and 'Sev' also appear in the
    # 'functional' mapping below (see the per-column sketch after the replace loop).
    "land_slope": {
        "Gtl": 2,
        "Mod": 1,
        "Sev": 0,
    },
    "boolean": {
        'Y': 1,
        'N': 0
    },
    "bsmt_exposure": {
        'Gd': 4,
        'Av': 3,
        'Mn': 2,
        'No': 1,
        'NA': 0,
    },
    "bsmt_fin_type": {
        'GLQ': 6,
        'ALQ': 5,
        'BLQ': 4,
        'Rec': 3,
        'LwQ': 2,
        'Unf': 1,
        'NA': 0
    },
    "functional": {
        'Typ': 7,
        'Min1': 6,
        'Min2': 5,
        'Mod': 4,
        'Maj1': 3,
        'Maj2': 2,
        'Sev': 1,
        'Sal': 0
    },
    "garage_finish": {
        'Fin': 3,
        'RFn': 2,
        'Unf': 1,
        'NA': 0,
    },
    "fence": {
        'GdPrv': 4,
        'MnPrv': 3,
        'GdWo': 2,
        'MnWw': 1,
        'NA': 0
    },
    "paved_drive": {
        'Y': 2,
        'P': 1,
        'N': 0
    },
    # The three mappings below are effectively unused: 'quality_no_na' and
    # 'lotShape' are shadowed by 'quality' and 'lot_shape' above (applied first),
    # and the Utilities column was already dropped.
    "quality_no_na": {
        "Ex": 5,
        "Gd": 4,
        "TA": 3,
        "Fa": 2,
        "Po": 1
    },
    "lotShape": {
        "Reg": 3,
        "IR1": 2,
        "IR2": 1,
        "IR3": 0
    },
    "utilities": {
        "AllPub": 3,
        "NoSewr": 2,
        "NoSeWa": 1,
        "ELO": 0
    }
}
# Install and import category_encoders to encode categorical features as binary strings
!pip install category_encoders
import category_encoders as ce
replaced_df = df_fill_na.copy()
# Replace the ordinal categorical attributes with integers.
# Caveat: DataFrame.replace with a flat dict applies to every column, so a label
# shared by two features takes whichever mapping runs first; see the per-column
# sketch below.
for label in replace_dict:
    replace = replace_dict[label]
    replaced_df = replaced_df.replace(replace)
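# A collision-free alternative (a minimal sketch on a hypothetical safe_df,
# not used downstream): map each feature's column explicitly, so labels shared
# across features ('Mod'/'Sev' in both LandSlope and Functional) cannot clash.
safe_df = df_fill_na.copy()
safe_df["LandSlope"] = safe_df["LandSlope"].map(replace_dict["land_slope"])
safe_df["Functional"] = safe_df["Functional"].map(replace_dict["functional"])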
# Encode the remaining categoricals as binary strings => each bit of the
# category's index becomes its own column
category_columns = replaced_df.select_dtypes("object").columns
encoder = ce.BinaryEncoder(cols=category_columns)
binarizado_df = encoder.fit_transform(replaced_df)
binarizado_df
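# Tiny illustration of what BinaryEncoder does, on a hypothetical toy frame:
# each category's ordinal index is written out in bits, so the column count
# grows logarithmically with the number of categories instead of linearly.
toy = pd.DataFrame({"color": ["red", "green", "blue", "red"]})
print(ce.BinaryEncoder(cols=["color"]).fit_transform(toy))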
# Apply one-hot encoding to the columns that are still textual
category_columns = replaced_df.select_dtypes("object").columns
df_one_hot = pd.get_dummies(replaced_df, columns=category_columns)
df_one_hot
check_nulls(df_one_hot)
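# The two encodings trade width for density; compare the resulting shapes:
print("binary:", binarizado_df.shape, "one-hot:", df_one_hot.shape)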
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Separate the features from the target (SalePrice)
X = df_one_hot.drop("SalePrice", axis=1)
X_bin = binarizado_df.drop("SalePrice", axis=1)
y = df_one_hot["SalePrice"]
y_bin = binarizado_df["SalePrice"]
y.head()
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
X_bin_train, X_bin_test, y_bin_train, y_bin_test = train_test_split(X_bin, y_bin, test_size=0.3, random_state=1) # 70% training and 30% test
import numpy as np
# Initialize the decision tree regression models
tree_model = DecisionTreeRegressor(random_state=42)
tree_model_bin = DecisionTreeRegressor(random_state=42)
# Train the models using the fit method
tree_model.fit(X_train, y_train)
tree_model_bin.fit(X_bin_train, y_bin_train)
# Make predictions on the test set
y_pred = tree_model.predict(X_test)
y_bin_pred = tree_model_bin.predict(X_bin_test)
# Report the results (note: squared=False makes mean_squared_error return the RMSE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print("One-hot encoding: MAPE =", mape, "RMSE =", rmse)
rmse = mean_squared_error(y_bin_test, y_bin_pred, squared=False)
mape = np.mean(np.abs((y_bin_test - y_bin_pred) / y_bin_test)) * 100
print("Binarization: MAPE =", mape, "RMSE =", rmse)
from sklearn.ensemble import RandomForestRegressor
# Initialize the random forest models (default squared-error criterion)
rfce_model = RandomForestRegressor(random_state=42, n_estimators = 100)
rfce_model_bin = RandomForestRegressor(random_state=42, n_estimators = 100)
# Train the models using the fit method
rfce_model.fit(X_train, y_train)
rfce_model_bin.fit(X_bin_train, y_bin_train)
# Make predictions on the test set
rfce_y_pred = rfce_model.predict(X_test)
rfce_y_bin_pred = rfce_model_bin.predict(X_bin_test)
# Report the results
rfce_rmse = mean_squared_error(y_test, rfce_y_pred, squared=False)
rfce_mape = np.mean(np.abs((y_test - rfce_y_pred) / y_test)) * 100
print("One-hot encoding: MAPE =", rfce_mape, "RMSE =", rfce_rmse)
rfce_rmse = mean_squared_error(y_bin_test, rfce_y_bin_pred, squared=False)
rfce_mape = np.mean(np.abs((y_bin_test - rfce_y_bin_pred) / y_bin_test)) * 100
print("Binarization: MAPE =", rfce_mape, "RMSE =", rfce_rmse)
# Initialize random forest models with the MAE (absolute_error) criterion
rfcg_model = RandomForestRegressor(criterion = 'absolute_error', random_state=42, n_estimators = 100)
rfcg_model_bin = RandomForestRegressor(criterion = 'absolute_error', random_state=42, n_estimators = 100)
# Train the models using the fit method
rfcg_model.fit(X_train, y_train)
rfcg_model_bin.fit(X_bin_train, y_bin_train)
# Make predictions on the test set
rfcg_y_pred = rfcg_model.predict(X_test)
rfcg_y_bin_pred = rfcg_model_bin.predict(X_bin_test)
# Report the results
rfcg_rmse = mean_squared_error(y_test, rfcg_y_pred, squared=False)
rfcg_mape = np.mean(np.abs((y_test - rfcg_y_pred) / y_test)) * 100
print("One-hot encoding: MAPE =", rfcg_mape, "RMSE =", rfcg_rmse)
rfcg_rmse = mean_squared_error(y_bin_test, rfcg_y_bin_pred, squared=False)
rfcg_mape = np.mean(np.abs((y_bin_test - rfcg_y_bin_pred) / y_bin_test)) * 100
print("Binarization: MAPE =", rfcg_mape, "RMSE =", rfcg_rmse)
try:
    import xgboost as xgb
except ImportError:
    !pip install xgboost
    import xgboost as xgb
# colsample_bytree = fraction of features sampled per tree; alpha = L1 regularization
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                          learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
xg_reg_bin = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3,
                              learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
# Train the models using the fit method
xg_reg.fit(X_train, y_train)
xg_reg_bin.fit(X_bin_train, y_bin_train)
# Make predictions on the test set
xg_y_pred = xg_reg.predict(X_test)
xg_y_bin_pred = xg_reg_bin.predict(X_bin_test)
# Report the results
xg_rmse = mean_squared_error(y_test, xg_y_pred, squared=False)
xg_mape = np.mean(np.abs((y_test - xg_y_pred) / y_test)) * 100
print("One-hot encoding: MAPE =", xg_mape, "RMSE =", xg_rmse)
xg_rmse = mean_squared_error(y_bin_test, xg_y_bin_pred, squared=False)
xg_mape = np.mean(np.abs((y_bin_test - xg_y_bin_pred) / y_bin_test)) * 100
print("Binarization: MAPE =", xg_mape, "RMSE =", xg_rmse)
# Import the libraries
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
from sklearn.preprocessing import StandardScaler
torch.manual_seed(42)
# Wrap the data in a format suitable for neural networks
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        if not torch.is_tensor(X) and not torch.is_tensor(y):
            # Scale the features. Note: each Dataset instance fits its own
            # StandardScaler (a simplification; fitting one scaler on the
            # training data and reusing it would be more rigorous).
            X = StandardScaler().fit_transform(X)
            self.X = torch.from_numpy(X)
            self.y = torch.from_numpy(np.array(y))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]
validation_size = int(0.1 * len(X_train))  # validation takes 10% of the training data
training_size = len(X_train) - validation_size
# Wrap the training data in a Dataset first and split the Dataset, so that
# features and targets stay paired (splitting X_train and y_train independently
# would shuffle them differently, and passing .dataset to Dataset() would
# silently undo the split).
full_train_dataset = Dataset(X_train, y_train)
train_subset, validation_subset = random_split(full_train_dataset,
                                               [training_size, validation_size])
train_loader = torch.utils.data.DataLoader(train_subset, batch_size=10, shuffle=True, num_workers=1)
validation_loader = torch.utils.data.DataLoader(validation_subset, batch_size=10, shuffle=False, num_workers=1)
test_dataset = Dataset(X_test, y_test)
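# Quick sanity check on one training batch: features shaped [batch, n_features],
# targets shaped [batch]
xb, yb = next(iter(train_loader))
print(xb.shape, yb.shape)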
input_layer = len(X_train.columns)
input_layer
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_layer, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )

    def forward(self, x):
        '''Forward pass'''
        return self.layers(x)
# Initialize the MLP
mlp = MLP()
# Define the loss function and the optimizer
loss_function = nn.L1Loss()  # MAE loss
optimizer = torch.optim.SGD(mlp.parameters(), lr=1e-3)
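# With raw SalePrice targets the L1 loss values are large; an adaptive optimizer
# is a common alternative worth trying here (an assumption, not part of the
# original experiment):
# optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)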
import copy
history = {
    'train_loss': list(),
    'validation_loss': list(),
}
min_loss = float("inf")
best_model = copy.deepcopy(mlp.state_dict())
# Run the training loop
for epoch in range(0, 40):
    print(f'Starting epoch {epoch+1}')
    # Reset the running loss
    current_loss = 0.0
    mlp.train()
    # Iterate over the training data
    for inputs, targets in train_loader:
        # Convert the inputs to float
        inputs, targets = inputs.float(), targets.float()
        targets = targets.reshape((targets.shape[0], 1))
        # Zero the gradients
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            # Get the model predictions
            outputs = mlp(inputs)
            # Compute the loss
            loss = loss_function(outputs, targets)
            # Back-propagate
            loss.backward()
            # Take an optimization step
            optimizer.step()
            current_loss += loss.item()
    # Average training loss for the epoch
    train_loss = current_loss / len(train_loader)
    current_loss = 0.0
    mlp.eval()
    # Iterate over the validation data (no gradients needed)
    with torch.no_grad():
        for inputs, targets in validation_loader:
            # Convert the inputs to float
            inputs, targets = inputs.float(), targets.float()
            targets = targets.reshape((targets.shape[0], 1))
            outputs = mlp(inputs)
            # Compute the loss
            loss = loss_function(outputs, targets)
            current_loss += loss.item()
    # Average validation loss for the epoch
    validation_loss = current_loss / len(validation_loader)
    current_loss = 0.0
    # Checkpoint the best model seen so far
    if validation_loss < min_loss:
        best_model = copy.deepcopy(mlp.state_dict())
        min_loss = validation_loss
    print(f"Training loss: {train_loss}")
    print(f"Validation loss: {validation_loss}\n")
    history['train_loss'].append(train_loss)
    history['validation_loss'].append(validation_loss)
# Restore the best checkpoint
mlp.load_state_dict(best_model)
# Process is complete.
print('Training process has finished.')
print(f"Mininum loss found: {min_loss}")
def plot_losses(history):
    train_loss = history["train_loss"]
    validation_loss = history["validation_loss"]
    epochs = range(len(train_loss))
    plt.figure(figsize=(20, 8), dpi=80)
    plt.plot(train_loss, "-r")       # red
    plt.plot(validation_loss, "-b")  # blue
    plt.xticks(epochs)
    plt.legend(["Training loss", "Validation loss"])
    plt.xlabel("Epochs")
    plt.ylabel("Losses")
plot_losses(history)
# Evaluate the best checkpoint on the held-out test set
predicted = mlp(test_dataset.X.float())
y_pred = predicted.cpu().detach().numpy()[:, 0]
y_true = test_dataset.y.numpy()
mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
rmse = mean_squared_error(y_true, y_pred, squared=False)
print("MLP: MAPE =", mape, "RMSE =", rmse)
df_test = pd.read_csv('/work/input/house-prices-advanced-regression-techniques/test.csv', sep=',')
df_test.head()
df_test_fill_na = df_test.drop(['LotFrontage'], axis=1)
# Unlike the training set, no test rows are dropped: every Id needs a prediction
# df_test_fill_na = df_test_fill_na[~df_test_fill_na["GarageType"].isnull()]
for name in df_test_fill_na.select_dtypes("object"):
    df_test_fill_na[name] = df_test_fill_na[name].fillna("NA")
df_test_fill_na["MasVnrArea"] = df_test_fill_na["MasVnrArea"].fillna(0)
df_test_fill_na = df_test_fill_na.drop(["Street"], axis=1)
df_test_fill_na = df_test_fill_na.drop(["Utilities"], axis=1)
df_test_fill_na = df_test_fill_na.drop(["Condition2"], axis=1)
replaced_test_df = df_test_fill_na.copy()
for label in replace_dict:
    replace = replace_dict[label]
    replaced_test_df = replaced_test_df.replace(replace)
category_columns = replaced_test_df.select_dtypes("object").columns
df_test_one_hot = pd.get_dummies(replaced_test_df, columns=category_columns)
df_test_one_hot.head()
columns_in_test = set(df_test_one_hot.columns.tolist()) - set(X.columns.tolist())
columns_in_train = set(X.columns.tolist()) - set(df_test_one_hot.columns.tolist())
print("Columns missing in test dataset: ")
print(columns_in_train)
print("Columns missing in train dataset: ")
print(columns_in_test)
print("Difference: ", len(columns_in_train) - len(columns_in_test))
# Drop the one-hot columns that only exist in the test set
df_test_drop = df_test_one_hot.copy()
for column in columns_in_test:
    df_test_drop = df_test_drop.drop(column, axis=1)
# Add the train-only one-hot columns to the test set, filled with 0
df_test_add = df_test_drop.copy()
for column in columns_in_train:
    df_test_add[column] = 0
# Reorder the test columns to match the training layout
train_columns = X.columns.tolist()
test_df = df_test_add[train_columns]
test_df.head()
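# The drop/add/reorder steps above can be collapsed into a single reindex call
# (an equivalent sketch; test-only columns are dropped, train-only columns are
# filled with 0):
test_df_alt = df_test_one_hot.reindex(columns=train_columns, fill_value=0)
assert list(test_df_alt.columns) == train_columns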
# Retrain XGBoost on the full training data before predicting on the test set
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
xg_reg.fit(X, y)
xg_y_pred = xg_reg.predict(test_df)
df_test["SalePrice"] = xg_y_pred
result = df_test[["Id", "SalePrice"]]
result
# Save the predictions in the Kaggle submission format
result.to_csv("result.csv", index=False)