Ames Housing Pricing

Authors: Leonardo Lecci, Xitlali Magana, Nathalia Cardona, Riccardo Perez

# import libraries import matplotlib.pyplot as plt # data visualization doc: https://matplotlib.org/2.0.2/api/pyplot_api.html import pandas as pd # data science essentials doc: https://pandas.pydata.org/docs/ import seaborn as sns # enhanced data visualization doc: https://seaborn.pydata.org/ import numpy as np # numpy library for math functions and arrays doc: https://numpy.org/doc/ from sklearn.model_selection import train_test_split # train-test split doc: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html from sklearn.metrics import make_scorer, mean_squared_error # metrics doc: https://scikit-learn.org/stable/modules/model_evaluation.html from sklearn.preprocessing import StandardScaler # standard scaler doc: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html from tpot import TPOTRegressor # doc: http://epistasislab.github.io/tpot/ import datetime # datetime doc: https://docs.python.org/3/library/datetime.html

# Load the training and test sets, using the 'Id' column as the row index.
train = pd.read_csv('train.csv', index_col='Id')
test = pd.read_csv('test.csv', index_col='Id')

# Tag every row with its origin so the combined frame can be split apart
# again once preprocessing is done.
train['is_train'] = True
test['is_train'] = False

# Stack train on top of test so both go through identical preprocessing.
all_data = pd.concat([train, test], axis=0)

########################### numeric features ##############################

# columns with a numeric dtype
numeric_features = all_data.select_dtypes(include=np.number).columns

# impute every numeric column that has gaps with that column's median
# NOTE(review): the median is computed over train and test rows together, and
# 'SalePrice' is imputed like any other numeric column (presumably it is
# missing for the test rows) - confirm this is intended.
for column in numeric_features:
    if all_data[column].isna().any():
        column_median = all_data[column].median()
        all_data[column] = all_data[column].fillna(column_median)

########################### categorical features ##############################

# everything that is neither numeric nor the train/test flag is categorical
categorical_features = set(all_data.columns) - set(numeric_features) - {'is_train'}


def fill_with_mode(df, columns):
    """Fill missing values in the given columns with each column's mode.

    Args:
        df: DataFrame to clean. It is modified in place.
        columns: iterable of column names to impute.

    Returns:
        The same DataFrame object, so ``df = fill_with_mode(df, cols)`` works.
    """
    for col in columns:
        modes = df[col].mode()
        if modes.empty:
            # column holds nothing but missing values: there is no mode to use
            continue
        # Series.mode() returns its values sorted, so [0] breaks ties
        # deterministically
        df[col] = df[col].fillna(modes[0])
    return df


# impute these categorical columns with their mode (train and test rows alike)
all_data = fill_with_mode(
    all_data,
    ['Electrical', 'Fence', 'MasVnrType', 'Functional',
     'MSZoning', 'KitchenQual', 'Utilities', 'Exterior1st', 'Exterior2nd',
     'SaleType'],
)

########################### unique features ##############################
# These special cases must run BEFORE the blanket "NA" fill below: once a
# column has been filled with "NA" its isna() mask can no longer match.

# basement exposure: rows missing BsmtExposure whose TotalBsmtSF equals 936
# get the most common exposure value instead of "NA"
all_data.loc[
    all_data["BsmtExposure"].isna() & (all_data["TotalBsmtSF"] == 936),
    "BsmtExposure",
] = all_data["BsmtExposure"].mode()[0]

# second basement finish type: rows missing BsmtFinType2 that nevertheless
# report a non-zero BsmtFinSF2 get the most common type other than 'Unf'
all_data_filtered = all_data.loc[all_data["BsmtFinType2"] != 'Unf']
all_data.loc[
    all_data["BsmtFinType2"].isna() & (all_data["BsmtFinSF2"] != 0),
    "BsmtFinType2",
] = all_data_filtered["BsmtFinType2"].mode()[0]

# clean other categorical features using "NA" where it's an option
# NA is not a null value, it is an actual input - checked the data description
# (GarageYrBlt is kept in the list as in the original; if it is numeric it was
# already median-filled above and this is a no-op for it.)
na_is_a_category = [
    "FireplaceQu", "GarageCond", "GarageYrBlt", "GarageType", "GarageFinish",
    "PoolQC", "BsmtQual", "GarageQual", "Alley", "BsmtFinType1",
    "BsmtFinType2", "BsmtCond", "MiscFeature", "BsmtExposure",
]
# assign the result back instead of chained ``fillna(inplace=True)``, which
# acts on a temporary under pandas Copy-on-Write and leaves all_data untouched
all_data[na_is_a_category] = all_data[na_is_a_category].fillna("NA")

# ---- total finished square footage ----
# above-ground living area plus the finished part of the basement
finished_sf = all_data['GrLivArea'] + all_data['TotalBsmtSF'] - all_data['BsmtUnfSF']
# the garage area only counts when the garage is not marked unfinished ('Unf')
garage_counts = all_data['GarageFinish'] != 'Unf'
all_data['tot_SF'] = finished_sf + all_data['GarageArea'].where(garage_counts, 0)

# ---- ratio of bathrooms per bedroom (above ground only) ----
bedrooms = all_data['BedroomAbvGr']
baths_above_ground = all_data['FullBath'] + all_data['HalfBath']
# mask zero-bedroom rows before dividing so they end up as 0 rather than inf
all_data['BathPerBedroom'] = (
    baths_above_ground / bedrooms.where(bedrooms > 0)
).fillna(0)

# ---- total porch area ----
all_data['total_porch'] = (
    all_data['OpenPorchSF']
    + all_data['EnclosedPorch']
    + all_data['3SsnPorch']
    + all_data['ScreenPorch']
)

# ---- 0/1 flags: does the house have the feature at all? ----
all_data['has_garage'] = (all_data['GarageArea'] > 0).astype(int)
all_data['has_masvnr'] = (all_data['MasVnrArea'] > 0).astype(int)
all_data['has_pool'] = (all_data['PoolArea'] > 0).astype(int)
all_data['has_porch'] = (all_data['total_porch'] > 0).astype(int)

# ---- total number of bathrooms (above ground and basement) ----
all_data['TotBath'] = (
    all_data['FullBath']
    + all_data['HalfBath']
    + all_data['BsmtFullBath']
    + all_data['BsmtHalfBath']
)

# ---- frequency of fireplaces compared to rooms above ground ----
all_data['FireplaceFreq'] = all_data['Fireplaces'] / all_data['TotRmsAbvGrd']

# ---- square footage of the outdoor amenities ----
all_data['OutdoorAmenitiesArea'] = (
    all_data['WoodDeckSF']
    + all_data['OpenPorchSF']
    + all_data['EnclosedPorch']
    + all_data['3SsnPorch']
    + all_data['ScreenPorch']
    + all_data['PoolArea']
)

# ---- ages ----
# NOTE: both ages are measured from the year the script is run, so their
# values shift by a constant when the notebook is re-run in a later year.
current_year = datetime.datetime.now().year
# age of the house
all_data['house_age'] = current_year - all_data['YearBuilt']
# time since the last remodel
all_data['remodel_age'] = current_year - all_data['YearRemodAdd']

####################### LOG TRANSFORMATIONS ############################

# Skewed columns to be replaced by their logarithm (year columns are
# deliberately left out).
skewed_columns = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallCond', 'MasVnrArea',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
    '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
    'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal',
    # engineered features
    'OutdoorAmenitiesArea', 'FireplaceFreq', 'tot_SF',
]

# Add a 'log_<name>' column for every listed column that exists; the small
# offset keeps zero-valued entries finite.
columns_to_log = [name for name in skewed_columns if name in all_data.columns]
all_data = all_data.assign(
    **{'log_' + name: np.log(all_data[name] + 0.001) for name in columns_to_log}
)

########################### DATA DROP ##############################

# Remove the raw skewed columns; only their logged counterparts are kept.
all_data = all_data.drop(columns=skewed_columns)

# one hot encoding for categorical features
# categorical_features is a set, whose iteration order for strings can change
# from one interpreter run to the next; sorting fixes the order of the
# generated columns so the feature matrix is reproducible.
ordered_categoricals = sorted(categorical_features)

dummy_frames = []
for cat_col in ordered_categoricals:
    print(cat_col)
    # indicator columns are named '<column><category>' (no separator)
    dummy_frames.append(
        pd.get_dummies(all_data[cat_col], prefix=cat_col, prefix_sep='')
    )

# swap the raw categorical columns for their indicator columns in one
# concatenation instead of re-copying the whole frame once per column
all_data = pd.concat(
    [all_data.drop(columns=ordered_categoricals)] + dummy_frames,
    axis=1,
)

# Split the combined frame back into its two sources using the origin flag.
came_from_train = all_data['is_train'].eq(True)
came_from_test = all_data['is_train'].eq(False)
train = all_data[came_from_train]
test = all_data[came_from_test]

# The origin flag has served its purpose; remove it from the training frame.
train = train.drop(columns='is_train')
# Preview the first five rows.
train.head(5)

# The origin flag has served its purpose; remove it from the test frame.
test = test.drop(columns='is_train')
# Preview the first five rows.
test.head(5)

# Predictors: every column of the training frame other than the target
# ('Id' is excluded as well, should it appear among the columns).
non_feature_columns = ('SalePrice', 'Id')
x_labels = [name for name in train.columns if name not in non_feature_columns]

# Target variable.
y_labels = ['SalePrice']

# Hold out a quarter of the training rows for validation; the fixed seed (219)
# makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train[x_labels],
    train[y_labels],
    test_size=0.25,
    random_state=219,
)

def tpot_rmse(x_train, y_train, x_test, y_test, generations=100, population_size=100):
    """Fit a TPOT regressor and score it on a hold-out set.

    Args:
        x_train: training features.
        y_train: training target values.
        x_test: hold-out features.
        y_test: hold-out target values.
        generations: passed to ``TPOTRegressor(generations=...)``.
        population_size: passed to ``TPOTRegressor(population_size=...)``.

    Returns:
        A tuple ``(tpot, rmse)``: the fitted ``TPOTRegressor`` and the root
        mean squared error of its predictions on ``x_test`` against
        ``y_test``, in the same units as the targets passed in.
    """
    # TPOT optimises the built-in negated-RMSE scorer during its search
    tpot = TPOTRegressor(
        generations=generations,
        population_size=population_size,
        scoring="neg_root_mean_squared_error",
        verbosity=2,
        random_state=42,
        n_jobs=-1,
    )

    # Fit the TPOT Regressor to the training data
    tpot.fit(x_train, y_train)

    # Calculate predictions on the hold-out set
    y_pred = tpot.predict(x_test)

    # mean_squared_error returns the MSE; take the root so the returned value
    # really is the RMSE its name promises
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    return tpot, rmse


# the model is trained and scored on log(SalePrice)
tpot, rmse = tpot_rmse(
    x_train,
    np.log(np.ravel(y_train)),
    x_test,
    np.log(np.ravel(y_test)),
    generations=10,
    population_size=75,
)

# hold-out RMSE on the log(SalePrice) scale (the root is already taken inside
# tpot_rmse, so no further np.sqrt is needed here)
rmse

# y_prediction = tpot.predict(test[x_labels]) # np.exp(y_prediction) # submission = pd.DataFrame() # submission['Id'] = test.index # submission['SalePrice'] = np.exp(y_prediction) # submission = submission.set_index('Id') # submission.to_csv('submission_team10_tpot.csv')