# import libraries
import matplotlib.pyplot as plt # data visualization doc: https://matplotlib.org/2.0.2/api/pyplot_api.html
import pandas as pd # data science essentials doc: https://pandas.pydata.org/docs/
import seaborn as sns # enhanced data visualization doc: https://seaborn.pydata.org/
import numpy as np # numpy library for math functions and arrays doc: https://numpy.org/doc/
from sklearn.model_selection import train_test_split # train-test split doc: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.metrics import make_scorer, mean_squared_error # metrics doc: https://scikit-learn.org/stable/modules/model_evaluation.html
from sklearn.preprocessing import StandardScaler # standard scaler doc: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from tpot import TPOTRegressor # doc: http://epistasislab.github.io/tpot/
import datetime # datetime doc: https://docs.python.org/3/library/datetime.html
# Load the training set, index it by 'Id', and tag its rows as training data.
train = pd.read_csv('train.csv').set_index('Id')
train['is_train'] = True
# Load the test set the same way, tagged as non-training data.
test = pd.read_csv('test.csv').set_index('Id')
test['is_train'] = False
# Stack both sets row-wise so cleaning and feature engineering are applied
# to them together; 'is_train' lets them be separated again afterwards.
all_data = pd.concat([train, test], axis=0)
########################### numeric features ##############################
# Names of every column with a numeric dtype.
numeric_features = all_data.select_dtypes(include=np.number).columns
# Impute each numeric column that has gaps with that column's own median
# (computed over the combined train + test rows held in all_data).
for column in numeric_features:
    values = all_data[column]
    if values.isna().sum() > 0:
        all_data[column] = values.fillna(values.median())
########################### categorical features ##############################
# Every column that is neither numeric nor the train/test flag is treated
# as categorical.
categorical_features = set(all_data.columns) - set(numeric_features) - {'is_train'}
def fill_with_mode(df, columns):
    """Replace missing values in the given columns with each column's mode.

    Args:
        df: DataFrame to clean. It is modified in place.
        columns: Iterable of column names in ``df`` to impute.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        KeyError: If a name in ``columns`` is not a column of ``df``.
    """
    for col in columns:
        modes = df[col].mode()
        # mode() ignores missing values, so a column with no observed values
        # yields an empty result; there is nothing to fill with, so skip it
        # instead of failing on the lookup of the first mode.
        if modes.empty:
            continue
        # When several values tie, the first mode returned is used.
        df[col] = df[col].fillna(modes.iloc[0])
    return df
# Impute these categorical columns with their most frequent value.
# NOTE(review): 'Fence' is mode-filled here while similar "feature absent"
# columns below receive the literal "NA" category — confirm against the
# data description that this is intended.
all_data = fill_with_mode(all_data, ['Electrical', 'Fence', 'MasVnrType', 'Functional'])
# Columns added to the mode imputation at a later point.
all_data = fill_with_mode(all_data, ['MSZoning', 'KitchenQual', 'Utilities', 'Exterior1st', 'Exterior2nd', 'SaleType'])
# For the columns below a missing entry is an actual category ("NA") rather
# than unknown data, according to the author's check of the data description.
na_category_columns = [
    "FireplaceQu",
    "GarageCond",
    # NOTE(review): if GarageYrBlt is numeric, the median imputation of numeric
    # columns has already removed its gaps and this fill changes nothing.
    "GarageYrBlt",
    "GarageType",
    "GarageFinish",
    "PoolQC",
    "BsmtQual",
    "GarageQual",
    "Alley",
    "BsmtFinType1",
    "BsmtFinType2",
    "BsmtCond",
    "MiscFeature",
]
for col in na_category_columns:
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form is deprecated and does not
    # update the frame when pandas Copy-on-Write is active.
    all_data[col] = all_data[col].fillna("NA")
########################### unique features ##############################
# These basement columns need case-by-case handling.
# A missing BsmtExposure on the row(s) whose TotalBsmtSF equals 936 gets the
# most common exposure value; every other missing entry becomes "NA".
exposure_missing = all_data["BsmtExposure"].isna()
all_data.loc[exposure_missing & (all_data["TotalBsmtSF"] == 936), "BsmtExposure"] = all_data["BsmtExposure"].mode()[0]
# Assign back rather than using chained fillna(inplace=True), which is
# deprecated and ineffective under pandas Copy-on-Write.
all_data["BsmtExposure"] = all_data["BsmtExposure"].fillna("NA")
# Rows whose second basement finish type is anything other than unfinished.
all_data_filtered = all_data.loc[all_data["BsmtFinType2"] != 'Unf']
# A row with no BsmtFinType2 but a non-zero BsmtFinSF2 does have a second
# finished area, so give it the most common non-'Unf' type. Earlier cleaning
# may already have turned missing BsmtFinType2 values into the "NA"
# placeholder, so both NaN and "NA" count as missing here; testing isna()
# alone would never match after that fill.
type2_missing = all_data["BsmtFinType2"].isna() | (all_data["BsmtFinType2"] == "NA")
known_type2 = all_data_filtered.loc[all_data_filtered["BsmtFinType2"] != "NA", "BsmtFinType2"]
type2_modes = known_type2.mode()
if not type2_modes.empty:
    all_data.loc[type2_missing & (all_data["BsmtFinSF2"] != 0), "BsmtFinType2"] = type2_modes.iloc[0]
# Whatever is still missing has no second finished area: mark it "NA".
all_data["BsmtFinType2"] = all_data["BsmtFinType2"].fillna("NA")
# Total finished square footage: above-ground living area plus the finished
# part of the basement (total minus unfinished), plus the garage area unless
# the garage is unfinished ('Unf').
# Computed on whole columns instead of row by row with iterrows()/.loc,
# which yields the same values far faster.
finished_living_sf = all_data['GrLivArea'] + all_data['TotalBsmtSF'] - all_data['BsmtUnfSF']
# Keep the garage area where the finish is not 'Unf'; count 0 otherwise.
counted_garage_sf = all_data['GarageArea'].where(all_data['GarageFinish'] != 'Unf', 0)
all_data['tot_SF'] = finished_living_sf + counted_garage_sf
# Ratio of above-ground bathrooms (full + half) to above-ground bedrooms.
# Rows without any above-ground bedroom get 0 rather than a division by zero.
# Computed on whole columns instead of iterating over rows.
has_bedrooms = all_data['BedroomAbvGr'] > 0
above_ground_baths = all_data['FullBath'] + all_data['HalfBath']
# Divide by 1 on the rows that are overwritten with 0 on the next line, so
# no infinite values are produced along the way.
bath_ratio = above_ground_baths / all_data['BedroomAbvGr'].where(has_bedrooms, 1)
all_data['BathPerBedroom'] = bath_ratio.where(has_bedrooms, 0)
# Combined area of all four porch types.
all_data['total_porch'] = all_data['OpenPorchSF'] + all_data['EnclosedPorch'] + all_data['3SsnPorch'] + all_data['ScreenPorch']
# 0/1 indicators for whether the house has each amenity at all, i.e. whether
# the matching area column is positive. Built from whole-column comparisons
# instead of a row-by-row loop.
# there is a garage area
all_data['has_garage'] = (all_data['GarageArea'] > 0).astype('int64')
# there is a masonry veneer area
all_data['has_masvnr'] = (all_data['MasVnrArea'] > 0).astype('int64')
# there is a pool area
all_data['has_pool'] = (all_data['PoolArea'] > 0).astype('int64')
# there is a porch area
all_data['has_porch'] = (all_data['total_porch'] > 0).astype('int64')
# Total number of bathrooms: full and half, above ground and in the basement.
bath_columns = ['FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath']
# skipna=False keeps a missing input missing in the total, like chained '+'.
all_data['TotBath'] = all_data[bath_columns].sum(axis=1, skipna=False)
# Fireplaces per above-ground room.
all_data['FireplaceFreq'] = all_data['Fireplaces'].div(all_data['TotRmsAbvGrd'])
# Combined square footage of the outdoor amenities (deck, porches, pool).
outdoor_columns = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea']
all_data['OutdoorAmenitiesArea'] = all_data[outdoor_columns].sum(axis=1, skipna=False)
# Years elapsed, relative to the year in which the script is run, since the
# house was built and since it was last remodeled.
for age_column, year_column in (('house_age', 'YearBuilt'), ('remodel_age', 'YearRemodAdd')):
    all_data[age_column] = datetime.date.today().year - all_data[year_column]
####################### LOG TRANSFORMATIONS ############################
# Columns to log-transform; year-valued features are deliberately left out.
skewed_columns = [
    'MSSubClass', 'LotFrontage', 'LotArea', 'OverallCond', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
    'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd',
    'Fireplaces', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'MiscVal',
    # engineered features
    'OutdoorAmenitiesArea', 'FireplaceFreq', 'tot_SF',
]
# Add a 'log_<name>' column for every listed column present in the data.
for col in skewed_columns:
    if col not in all_data.columns:
        continue
    # The 0.001 offset keeps zero-valued entries finite under the logarithm.
    all_data[f'log_{col}'] = np.log(all_data[col].add(0.001))
########################### DATA DROP ##############################
# Remove the raw skewed columns now that their 'log_' counterparts exist.
# The list of names is reused from skewed_columns rather than repeated, so
# the transformed set and the dropped set cannot drift apart.
all_data = all_data.drop(columns=skewed_columns)
# One-hot encode the categorical features.
# categorical_features is a set, whose iteration order for strings can change
# from one interpreter run to the next; sorting fixes the order of the
# resulting columns so the seeded steps further down stay reproducible.
ordered_categoricals = sorted(categorical_features)
encoded_frames = []
for cat_col in ordered_categoricals:
    print(cat_col)
    # Indicator columns are named '<column><value>', with no separator.
    encoded_frames.append(pd.get_dummies(all_data[cat_col], prefix=cat_col, prefix_sep=''))
# Replace the original categorical columns with their indicator columns in
# one concatenation instead of growing the frame once per feature.
all_data = pd.concat([all_data.drop(columns=ordered_categoricals)] + encoded_frames, axis=1)
# Split the combined frame back into its train and test rows, removing the
# helper flag from each part.
train = all_data[all_data.is_train == True].drop('is_train', axis=1)
train.head(n=5)
test = all_data[all_data.is_train == False].drop('is_train', axis=1)
test.head(n=5)
# Predictors: every column except the target (and 'Id', should it appear).
x_labels = [name for name in train.columns if name not in ('SalePrice', 'Id')]
# Target variable.
y_labels = ['SalePrice']
# Hold out 25% of the labeled rows for evaluation; the fixed seed 219 makes
# the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train[x_labels],
    train[y_labels],
    test_size=0.25,
    random_state=219,
)
def tpot_rmse(x_train, y_train, x_test, y_test, generations= 100, population_size = 100):
    """Fit a TPOTRegressor on the training data and score it on held-out data.

    Args:
        x_train: Feature matrix used to fit the TPOT search.
        y_train: Target values matching ``x_train``.
        x_test: Feature matrix to predict on after fitting.
        y_test: Target values matching ``x_test``.
        generations: Passed to ``TPOTRegressor`` as ``generations``.
        population_size: Passed to ``TPOTRegressor`` as ``population_size``.

    Returns:
        A tuple ``(tpot, mse)``: the fitted ``TPOTRegressor`` and the mean
        squared error of its predictions on ``x_test``. Despite the function
        name, the second value is not square-rooted; callers wanting the RMSE
        must apply the square root themselves.
    """
    # The search is scored with scikit-learn's built-in negated RMSE scorer,
    # so no custom scorer object is needed.
    tpot = TPOTRegressor(
        generations=generations,
        population_size=population_size,
        scoring="neg_root_mean_squared_error",
        verbosity=2,
        random_state=42,
        n_jobs=-1,
    )
    # Fit the TPOT regressor to the training data.
    tpot.fit(x_train, y_train)
    # Predict on the held-out set.
    y_pred = tpot.predict(x_test)
    # Mean squared error on the held-out set (kept un-rooted for callers).
    rmse = mean_squared_error(y_test, y_pred)
    return tpot, rmse
# Model the natural log of the target: flatten the single-column target
# frames to 1-D arrays and log-transform them before the search.
log_y_train = np.log(np.ravel(y_train))
log_y_test = np.log(np.ravel(y_test))
tpot, rmse = tpot_rmse(x_train, log_y_train, x_test, log_y_test, generations=10, population_size=75)
# tpot_rmse returns the mean squared error, so the square root gives the
# hold-out RMSE on the log scale.
np.sqrt(rmse)
# Disabled submission export, kept for reference:
# y_prediction = tpot.predict(test[x_labels])
# np.exp(y_prediction)
# submission = pd.DataFrame()
# submission['Id'] = test.index
# submission['SalePrice'] = np.exp(y_prediction)
# submission = submission.set_index('Id')
# submission.to_csv('submission_team10_tpot.csv')