# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('/work/input/house-prices-advanced-regression-techniques/train.csv', sep=',')
df.head()
# Helper function to report the missing values per column
def check_nulls(df):
    rows = []
    for column in df.columns:
        row = {
            'column': column,
            'nans': df[column].isnull().sum(),
            'frac_nans': df[column].isnull().sum() / df.shape[0],
        }
        rows.append(row)
    res = pd.DataFrame(rows)
    return res[res.nans > 0].sort_values('nans', ascending=False)
check_nulls(df)
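# The same report can also be produced with a vectorized one-liner (shown only
# for illustration; check_nulls remains the helper used below):
nulls = df.isnull().sum()
nulls[nulls > 0].sort_values(ascending=False)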
# Drop LotFrontage: it has many null values, and its information is partially
# contained in the lot area
df_fill_na = df.drop(['LotFrontage'], axis=1)
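# A quick, illustrative check of the claim above: LotFrontage correlates with
# LotArea, so part of its signal survives the drop.
df[["LotFrontage", "LotArea"]].corr()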
# Drop rows with no garage information, since they are only a few samples
df_fill_na = df_fill_na[~df_fill_na["GarageType"].isnull()]
# Fill the remaining categorical nulls with the literal "NA" used in the data description
for name in df_fill_na.select_dtypes("object"):
    df_fill_na[name] = df_fill_na[name].fillna("NA")
df_fill_na.head()
check_nulls(df_fill_na)
# Fill the missing areas with 0 (assuming such an area simply does not exist)
df_fill_na["MasVnrArea"] = df_fill_na["MasVnrArea"].fillna(0)
check_nulls(df_fill_na)
# Drop near-constant columns (value counts from the training data):
# Street => {'Pave': 1454, 'Grvl': 6}
# Utilities => {'AllPub': 1459, 'NoSeWa': 1}
# Condition2 => ~99% 'Norm' {'Norm': 1445, 'Feedr': 6, 'PosN': 2, 'RRNn': 2, 'Artery': 2, 'PosA': 1, 'RRAn': 1, 'RRAe': 1}
df_fill_na = df_fill_na.drop(["Street"], axis=1)
df_fill_na = df_fill_na.drop(["Utilities"], axis=1)
df_fill_na = df_fill_na.drop(["Condition2"], axis=1)
# Replace ordinal categorical features with integer codes.
# The final dict is keyed by column name, so DataFrame.replace stays
# column-scoped and labels shared between scales (e.g. 'Gd', 'Y', 'Mod',
# 'Sev', 'Unf') cannot leak into the wrong feature.
quality_scale = {"Ex": 5, "Gd": 4, "TA": 3, "Fa": 2, "Po": 1, "NA": 0}
scales = {
    "quality": quality_scale,
    "lot_shape": {"Reg": 4, "IR1": 3, "IR2": 2, "IR3": 1},
    "land_slope": {"Gtl": 2, "Mod": 1, "Sev": 0},
    "boolean": {"Y": 1, "N": 0},
    "bsmt_exposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "NA": 0},
    "bsmt_fin_type": {"GLQ": 6, "ALQ": 5, "BLQ": 4, "Rec": 3, "LwQ": 2, "Unf": 1, "NA": 0},
    "functional": {"Typ": 7, "Min1": 6, "Min2": 5, "Mod": 4, "Maj1": 3, "Maj2": 2, "Sev": 1, "Sal": 0},
    "garage_finish": {"Fin": 3, "RFn": 2, "Unf": 1, "NA": 0},
    "fence": {"GdPrv": 4, "MnPrv": 3, "GdWo": 2, "MnWw": 1, "NA": 0},
    "paved_drive": {"Y": 2, "P": 1, "N": 0},
}
columns_by_scale = {
    "quality": ["ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "HeatingQC",
                "KitchenQual", "FireplaceQu", "GarageQual", "GarageCond", "PoolQC"],
    "lot_shape": ["LotShape"],
    "land_slope": ["LandSlope"],
    "boolean": ["CentralAir"],
    "bsmt_exposure": ["BsmtExposure"],
    "bsmt_fin_type": ["BsmtFinType1", "BsmtFinType2"],
    "functional": ["Functional"],
    "garage_finish": ["GarageFinish"],
    "fence": ["Fence"],
    "paved_drive": ["PavedDrive"],
}
replace_dict = {col: scales[scale]
                for scale, cols in columns_by_scale.items() for col in cols}
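# Illustrative sanity check (not part of the original pipeline): list any values
# in the mapped columns that the scales above do not cover, so typos surface early.
for col, mapping in replace_dict.items():
    uncovered = set(df_fill_na[col].unique()) - set(mapping)
    if uncovered:
        print(col, "->", uncovered)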
# Install and import the category_encoders module to encode categories as binary strings
!pip install category_encoders
import category_encoders as ce
replaced_df = df_fill_na.copy()
# Replace the ordinal categorical values with their integer codes (column-scoped)
replaced_df = replaced_df.replace(replace_dict)
# Encode the remaining categorical columns as binary strings: each category gets
# an integer code whose bits are spread across new columns
category_columns = replaced_df.select_dtypes("object").columns
encoder = ce.BinaryEncoder(cols=category_columns)
binarizado_df = encoder.fit_transform(replaced_df)
binarizado_df
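# Toy example (illustration only) of what BinaryEncoder does: each category is
# assigned an ordinal code whose binary digits are split across columns.
toy = pd.DataFrame({"color": ["red", "green", "blue", "green"]})
ce.BinaryEncoder(cols=["color"]).fit_transform(toy)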
# Apply one-hot encoding to the columns that are still textual
category_columns = replaced_df.select_dtypes("object").columns
df_one_hot = pd.get_dummies(replaced_df, columns=category_columns)
df_one_hot
check_nulls(df_one_hot)
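# Toy example (illustration only) of one-hot encoding: pd.get_dummies creates one
# indicator column per category.
pd.get_dummies(pd.DataFrame({"color": ["red", "green", "blue", "green"]}), columns=["color"])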
from sklearn.tree import DecisionTreeRegressor 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# Separate the features from the target values
X = df_one_hot.drop("SalePrice", axis=1)
X_bin = binarizado_df.drop("SalePrice", axis=1)
y = df_one_hot["SalePrice"]
y_bin = binarizado_df["SalePrice"]
y.head()
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test
X_bin_train, X_bin_test, y_bin_train, y_bin_test = train_test_split(X_bin, y_bin, test_size=0.3, random_state=1) # 70% training and 30% test
import numpy as np
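# Optional helper (a sketch, not part of the original notebook): every evaluation
# block below computes RMSE and MAPE; this factors the pattern out for reference.
def report_metrics(label, y_true, y_pred):
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    print(f"{label}: MAPE={mape:.2f}%, RMSE={rmse:.2f}")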
# Initialize the regression tree models (one per encoding)
tree_model = DecisionTreeRegressor(random_state=42)
tree_model_bin = DecisionTreeRegressor(random_state=42)
# Train the models with the fit method
tree_model.fit(X_train, y_train)
tree_model_bin.fit(X_bin_train, y_bin_train)
# Make predictions on the test set
y_pred = tree_model.predict(X_test)
y_bin_pred = tree_model_bin.predict(X_bin_test)
# Report the results (RMSE via squared=False, plus MAPE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
print("One-hot encoding:", mape, rmse)
rmse = mean_squared_error(y_bin_test, y_bin_pred, squared=False)
mape = np.mean(np.abs((y_bin_test - y_bin_pred) / y_bin_test)) * 100
print("Binarization:", mape, rmse)
from sklearn.ensemble import RandomForestRegressor
# Initialize the random forest models
rfce_model = RandomForestRegressor(random_state=42, n_estimators = 100)
rfce_model_bin = RandomForestRegressor(random_state=42, n_estimators = 100)
# Train the models with the fit method
rfce_model.fit(X_train, y_train)
rfce_model_bin.fit(X_bin_train, y_bin_train)
# Make predictions on the test set
rfce_y_pred = rfce_model.predict(X_test)
rfce_y_bin_pred = rfce_model_bin.predict(X_bin_test)
# Report RMSE and MAPE for each encoding
rfce_rmse = mean_squared_error(y_test, rfce_y_pred, squared=False)
rfce_mape = np.mean(np.abs((y_test - rfce_y_pred) / y_test)) * 100
print("One-hot encoding:", rfce_mape, rfce_rmse)
rfce_rmse = mean_squared_error(y_bin_test, rfce_y_bin_pred, squared=False)
rfce_mape = np.mean(np.abs((y_bin_test - rfce_y_bin_pred) / y_bin_test)) * 100
print("Binarization:", rfce_mape, rfce_rmse)
# Initialize random forests with the absolute-error (MAE) split criterion
rfcg_model = RandomForestRegressor(criterion = 'absolute_error', random_state=42, n_estimators = 100)
rfcg_model_bin = RandomForestRegressor(criterion = 'absolute_error', random_state=42, n_estimators = 100)
# Train the models with the fit method
rfcg_model.fit(X_train, y_train)
rfcg_model_bin.fit(X_bin_train, y_bin_train)
# Make predictions on the test set
rfcg_y_pred = rfcg_model.predict(X_test)
rfcg_y_bin_pred = rfcg_model_bin.predict(X_bin_test)
# Report RMSE and MAPE for each encoding
rfcg_rmse = mean_squared_error(y_test, rfcg_y_pred, squared=False)
rfcg_mape = np.mean(np.abs((y_test - rfcg_y_pred) / y_test)) * 100
print("One-hot encoding:", rfcg_mape, rfcg_rmse)
rfcg_rmse = mean_squared_error(y_bin_test, rfcg_y_bin_pred, squared=False)
rfcg_mape = np.mean(np.abs((y_bin_test - rfcg_y_bin_pred) / y_bin_test)) * 100
print("Binarization:", rfcg_mape, rfcg_rmse)
try:
    import xgboost as xgb
except ImportError:
    !pip install xgboost
    import xgboost as xgb
# XGBoost regressors (alpha is the L1 regularization term on leaf weights)
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
xg_reg_bin = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
# Train the models with the fit method
xg_reg.fit(X_train, y_train)
xg_reg_bin.fit(X_bin_train, y_bin_train)
# Make predictions on the test set
xg_y_pred = xg_reg.predict(X_test)
xg_y_bin_pred = xg_reg_bin.predict(X_bin_test)
# Report RMSE and MAPE for each encoding
xg_rmse = mean_squared_error(y_test, xg_y_pred, squared=False)
xg_mape = np.mean(np.abs((y_test - xg_y_pred) / y_test)) * 100
print("One-hot encoding:", xg_mape, xg_rmse)
xg_rmse = mean_squared_error(y_bin_test, xg_y_bin_pred, squared=False)
xg_mape = np.mean(np.abs((y_bin_test - xg_y_bin_pred) / y_bin_test)) * 100
print("Binarization:", xg_mape, xg_rmse)
# Import the libraries
import torch
from torch import nn
from torch.utils.data import DataLoader, random_split
from sklearn.preprocessing import StandardScaler
torch.manual_seed(42)
# Wraps the data in a format suitable for neural networks
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y):
        # Scale the features (note: the scaler is fitted per split here; fitting
        # it once on the training data and reusing it would be the stricter choice)
        X = StandardScaler().fit_transform(X)
        self.X = torch.from_numpy(X)
        self.y = torch.from_numpy(np.array(y))
    def __len__(self):
        return len(self.X)
    def __getitem__(self, i):
        return self.X[i], self.y[i]
# The validation set takes 10% of the training data. Split shared indices once,
# so the features and targets stay aligned in each subset.
validation_size = int(0.1 * len(X_train))
training_size = len(X_train) - validation_size
train_idx, validation_idx = random_split(range(len(X_train)),
                                         [training_size, validation_size])
train_dataset = Dataset(X_train.iloc[list(train_idx)], y_train.iloc[list(train_idx)])
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=1)
validation_dataset = Dataset(X_train.iloc[list(validation_idx)], y_train.iloc[list(validation_idx)])
validation_loader = torch.utils.data.DataLoader(validation_dataset, batch_size=10, shuffle=False, num_workers=1)
test_dataset = Dataset(X_test, y_test)
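# Quick sanity check (illustrative): sizes of the three splits.
print(len(train_dataset), len(validation_dataset), len(test_dataset))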
# The input layer width equals the number of features
input_layer = len(X_train.columns)
input_layer
class MLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_layer, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
        )
    def forward(self, x):
        '''
        Forward pass
        '''
        return self.layers(x)
# Initialize the MLP
mlp = MLP()

# Define the loss function and the optimizer
loss_function = nn.L1Loss()  # MAE loss
optimizer = torch.optim.SGD(mlp.parameters(), lr=1e-3)
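# Illustrative sanity check (not in the original): total number of trainable parameters.
print(sum(p.numel() for p in mlp.parameters() if p.requires_grad))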
import copy
history = {
    'train_loss': list(),
    'validation_loss': list(), 
}
min_loss = float("inf")
best_model = copy.deepcopy(mlp.state_dict())
# Run the training loop
for epoch in range(40):
    print(f'Starting epoch {epoch+1}')
    
    # Reset the running loss
    current_loss = 0.0
    
    mlp.train()
    # Iterate over the training batches
    for inputs, targets in train_loader: 
        # Convert the inputs to float
        inputs, targets = inputs.float(), targets.float()
        targets = targets.reshape((targets.shape[0], 1))
        
        # Zero the gradients
        optimizer.zero_grad()

        # Get the model predictions
        outputs = mlp(inputs)

        # Compute the loss
        loss = loss_function(outputs, targets)

        # Backpropagate the gradients
        loss.backward()

        # Take an optimization step
        optimizer.step()
        
        current_loss += loss.item()
    
    # Compute the epoch's mean training loss
    train_loss = current_loss / len(train_loader)
    current_loss = 0.0
    mlp.eval()
    # Iterate over the validation batches (no gradients needed during evaluation)
    with torch.no_grad():
        for inputs, targets in validation_loader:
            # Convert the inputs to float
            inputs, targets = inputs.float(), targets.float()
            targets = targets.reshape((targets.shape[0], 1))

            outputs = mlp(inputs)
            # Compute the loss
            loss = loss_function(outputs, targets)
            current_loss += loss.item()
    
    # Compute the epoch's mean validation loss
    validation_loss = current_loss / len(validation_loader)
    current_loss = 0.0
    # Keep a copy of the weights whenever the validation loss improves
    if validation_loss < min_loss:
        best_model = copy.deepcopy(mlp.state_dict())
        min_loss = validation_loss
    print(f"Training loss: {train_loss}")
    print(f"Validation loss: {validation_loss}\n")
    history['train_loss'].append(train_loss)
    history['validation_loss'].append(validation_loss)
# Restore the best weights found during training
mlp.load_state_dict(best_model)
print('Training process has finished.')
print(f"Minimum loss found: {min_loss}")
def plot_losses(history):
    train_loss = history["train_loss"]
    validation_loss = history["validation_loss"]
    epochs = range(len(train_loss))
    plt.figure(figsize=(20, 8), dpi=80)
    
    plt.plot(train_loss, "-r")  # r = Red
    plt.plot(validation_loss, "-b")  # b = Blue
    plt.xticks(epochs)
    
    plt.legend(["Training loss", "Validation loss"])
    plt.xlabel("Epochs")
    plt.ylabel("Losses")
plot_losses(history)
# Evaluate the MLP on the (independently scaled) test set
predicted = mlp(test_dataset.X.float())
y_pred = predicted.cpu().detach().numpy()[:, 0]
y_true = test_dataset.y.numpy()
mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
rmse = mean_squared_error(y_true, y_pred, squared=False)
print(mape, rmse)
df_test = pd.read_csv('/work/input/house-prices-advanced-regression-techniques/test.csv', sep=',')
df_test.head()
# Apply the same preprocessing to the competition test set
df_test_fill_na = df_test.drop(['LotFrontage'], axis=1)
# Rows cannot be dropped here, since every Id needs a prediction:
# df_test_fill_na = df_test_fill_na[~df_test_fill_na["GarageType"].isnull()]
for name in df_test_fill_na.select_dtypes("object"):
    df_test_fill_na[name] = df_test_fill_na[name].fillna("NA")
df_test_fill_na["MasVnrArea"] = df_test_fill_na["MasVnrArea"].fillna(0)
df_test_fill_na = df_test_fill_na.drop(["Street"], axis=1)
df_test_fill_na = df_test_fill_na.drop(["Utilities"], axis=1)
df_test_fill_na = df_test_fill_na.drop(["Condition2"], axis=1)
replaced_test_df = df_test_fill_na.copy()
replaced_test_df = replaced_test_df.replace(replace_dict)
category_columns = replaced_test_df.select_dtypes("object").columns
df_test_one_hot = pd.get_dummies(replaced_test_df, columns=category_columns)
df_test_one_hot.head()
columns_in_test = set(df_test_one_hot.columns.tolist()) - set(X.columns.tolist())
columns_in_train = set(X.columns.tolist()) - set(df_test_one_hot.columns.tolist())
print("Columns missing in test dataset: ")
print(columns_in_train)
print("Columns missing in train dataset: ")
print(columns_in_test)
print("Difference: ", len(columns_in_train) - len(columns_in_test))
df_test_drop = df_test_one_hot.copy()
for column in columns_in_test:
    df_test_drop = df_test_drop.drop(column, axis=1)
df_test_add = df_test_drop.copy()
for column in columns_in_train:
    df_test_add[column] = 0
# Reorder the test columns to match the training matrix
train_columns = X.columns.tolist()
test_df = df_test_add[train_columns]
test_df.head()
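# Equivalent one-step alternative (illustrative): reindex drops the test-only
# columns and fills the train-only columns with 0 in a single call.
test_df_alt = df_test_one_hot.reindex(columns=train_columns, fill_value=0)
test_df_alt.shape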
# Retrain XGBoost on the full training data and predict the competition test set
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', colsample_bytree=0.3, learning_rate=0.1, max_depth=5, alpha=10, n_estimators=100)
xg_reg.fit(X, y)
xg_y_pred = xg_reg.predict(test_df)
df_test["SalePrice"] = xg_y_pred
result = df_test[["Id", "SalePrice"]]
result
# Write the submission file
result.to_csv("result.csv", index=False)