# Advanced ML - Sales price

import pandas as pd

# Load the dataset from the CSV file (comma-separated).
df_import = pd.read_csv('./data/data_immobilier.csv', sep=',')
# Bare expression: only has a visible effect in an interactive/notebook session,
# where it displays the frame.
df_import

# Print a summary of the frame (columns, dtypes, non-null counts).
df_import.info()

# Keep every column except the first one.
# NOTE(review): the first column is presumably a row identifier - confirm against the CSV.
df_initial = df_import.iloc[:, 1:]

# sklearn models from sklearn import linear_model from sklearn.ensemble import RandomForestRegressor from sklearn.tree import DecisionTreeRegressor # sklearn helpers from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV from sklearn.metrics import accuracy_score from sklearn.model_selection import validation_curve

def change_category_to_code(df):
    """Return a copy of ``df`` with object columns integer-encoded and NaN rows dropped.

    Every column whose dtype is ``object`` is cast to a pandas categorical and
    replaced by its integer category codes.  Missing values in those columns
    become the code ``-1`` and therefore survive the final ``dropna()``; only
    rows with missing values in the remaining columns are removed.

    The input frame itself is not modified.
    """
    encoded = df.copy()

    # Names of the columns stored with the generic ``object`` dtype.
    object_columns = encoded.select_dtypes(include=object).columns.tolist()

    for name in object_columns:
        as_category = encoded[name].astype('category')
        encoded[name] = as_category.cat.codes

    return encoded.dropna()

def train_model(model, train_data, test_data):
    """Cross-validate ``model`` on the training split, then fit and score it.

    Both frames must contain a ``'SalePrice'`` column, which is used as the
    target; every other column is used as an input feature.

    Args:
        model: Estimator accepted by ``cross_val_score`` that also exposes
            ``fit(X, y)`` and ``score(X, y)``.
        train_data: DataFrame used for cross-validation and for fitting.
        test_data: DataFrame used only for the final ``score`` call.

    Returns:
        A ``(train_accuracy, test_accuracy)`` tuple: the mean of the
        cross-validation scores obtained on ``train_data`` (default
        ``cross_val_score`` settings) and the score of the fitted model on
        ``test_data``.  Despite the names, these are whatever ``model.score``
        reports (R^2 for scikit-learn regressors), not a classification
        accuracy.
    """
    target_feature = 'SalePrice'

    # Build one mask per frame: reusing the training mask on the test frame
    # would select the wrong column whenever the two frames do not share the
    # exact same column order.
    train_is_target = train_data.columns == target_feature
    test_is_target = test_data.columns == target_feature

    # retrieve inputs & output
    x1 = train_data.loc[:, ~train_is_target]
    y1 = train_data.loc[:, train_is_target].values.ravel()
    x2 = test_data.loc[:, ~test_is_target]
    y2 = test_data.loc[:, test_is_target].values.ravel()

    # train model with cross validation
    train_accuracies = cross_val_score(model, x1, y1)
    train_accuracy = train_accuracies.mean()

    # fit on the whole training split, then evaluate on the held-out split
    model.fit(x1, y1)
    test_accuracy = model.score(x2, y2)

    return train_accuracy, test_accuracy

def train_ln_model(train_data, test_data):
    """Evaluate a default ``linear_model.LinearRegression`` through ``train_model``.

    Returns the ``(train_accuracy, test_accuracy)`` tuple produced by
    ``train_model`` for the given train and test frames.
    """
    return train_model(linear_model.LinearRegression(), train_data, test_data)

def train_rf_model(train_data, test_data):
    """Evaluate a default ``RandomForestRegressor`` through ``train_model``.

    Returns the ``(train_accuracy, test_accuracy)`` tuple produced by
    ``train_model`` for the given train and test frames.
    """
    return train_model(RandomForestRegressor(), train_data, test_data)

def train_dt_model(train_data, test_data):
    """Evaluate a default ``DecisionTreeRegressor`` through ``train_model``.

    Returns the ``(train_accuracy, test_accuracy)`` tuple produced by
    ``train_model`` for the given train and test frames.
    """
    return train_model(DecisionTreeRegressor(), train_data, test_data)

def tunning_rf_model(train_data, parameter_name, parameter_range):
    """Compute a validation curve for one ``RandomForestRegressor`` parameter.

    The ``'SalePrice'`` column of ``train_data`` is the target; all other
    columns are the inputs.

    Args:
        train_data: DataFrame holding the features and the target column.
        parameter_name: Name of the estimator parameter to vary
            (forwarded as ``param_name``).
        parameter_range: Values to try for that parameter
            (forwarded as ``param_range``).

    Returns:
        A ``(score, val_score)`` tuple: the two arrays returned by
        ``validation_curve`` (training scores, validation scores), each
        averaged along axis 1.
    """
    target_feature = 'SalePrice'
    is_target = train_data.columns == target_feature

    # retrieve inputs & output
    features = train_data.loc[:, ~is_target]
    target = train_data.loc[:, is_target].values.ravel()

    train_scores, validation_scores = validation_curve(
        estimator=RandomForestRegressor(),
        X=features,
        y=target,
        param_name=parameter_name,
        param_range=parameter_range,
        verbose=1,
        error_score='raise',  # fail loudly instead of recording a NaN score
        n_jobs=-1,
    )

    return train_scores.mean(axis=1), validation_scores.mean(axis=1)

def find_best_rf_model(train_data, params):
    """Grid-search ``RandomForestRegressor`` parameters on ``train_data``.

    The ``'SalePrice'`` column of ``train_data`` is the target; all other
    columns are the inputs.

    Args:
        train_data: DataFrame holding the features and the target column.
        params: Parameter grid handed to ``GridSearchCV``.

    Returns:
        A ``(best_params, best_model)`` tuple taken from the fitted search
        object's ``best_params_`` and ``best_estimator_`` attributes.
    """
    target_feature = 'SalePrice'
    is_target = train_data.columns == target_feature

    # retrieve inputs & output
    features = train_data.loc[:, ~is_target]
    target = train_data.loc[:, is_target].values.ravel()

    search = GridSearchCV(RandomForestRegressor(), params, verbose=3, n_jobs=-1)
    search.fit(features, target)

    return search.best_params_, search.best_estimator_

def analyse_with_plot(x, scores, val_scores, x_label, title, figsize=(12, 6)):
    """Draw training and validation scores against ``x`` as two dashed lines.

    Args:
        x: Values for the horizontal axis.
        scores: Training scores, one per entry of ``x``.
        val_scores: Validation scores, one per entry of ``x``.
        x_label: Label of the horizontal axis.
        title: Figure title.
        figsize: Figure size forwarded to ``plt.figure``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    plt.figure(figsize=figsize)
    plt.plot(x, scores, linestyle='dashed', marker='o', label='score')
    plt.plot(x, val_scores, linestyle='dashed', marker='o', label='validation score')
    plt.xlabel(x_label)
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend()
    # The on/off flag is passed positionally: its keyword was renamed from
    # ``b`` to ``visible`` in Matplotlib 3.5, so ``b=True`` fails on current
    # releases while the positional form works on old and new versions alike.
    plt.grid(True, which='major', color='#bdc3c7', linestyle='--')
    plt.show()

def analyse_with_bar(x, scores, val_scores, x_label, title, figsize=(12, 6)):
    """Draw training and validation scores as side-by-side bars per category.

    Args:
        x: Category labels, used as tick labels on the horizontal axis.
        scores: Training scores, one per entry of ``x``.
        val_scores: Validation scores, one per entry of ``x``.
        x_label: Label of the horizontal axis.
        title: Figure title.
        figsize: Figure size forwarded to ``plt.figure``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    x_values = np.arange(len(x))
    bar_width = 0.15

    # Honour the caller's figsize (it used to be ignored in favour of a
    # hard-coded (12, 6), which is still the default).
    plt.figure(figsize=figsize)

    # Validation bars sit one bar-width to the left of the score bars.
    plt.bar(x_values, scores, width=bar_width, label='score')
    plt.bar(x_values - bar_width, val_scores, width=bar_width, label='validation score')

    plt.xlabel(x_label)
    plt.ylabel('Accuracy')
    # Ticks are placed on the score bars and labelled with the categories.
    plt.xticks(x_values, x)
    plt.title(title)
    plt.legend()
    plt.show()

# --- First attempt: encode the object columns and compare three default models ---
df_first_attempt = change_category_to_code(df_initial)
df_first_attempt_train, df_first_attempt_test = train_test_split(
    df_first_attempt,
    test_size=0.2,
)

# Linear regression
lr_train_score, lr_test_score = train_ln_model(
    train_data=df_first_attempt_train,
    test_data=df_first_attempt_test,
)
print('Avg train accuracy:', lr_train_score)
print('Test accuracy:', lr_test_score)

# Random forest
rf_train_score, rf_test_score = train_rf_model(
    train_data=df_first_attempt_train,
    test_data=df_first_attempt_test,
)
print('Avg train accuracy:', rf_train_score)
print('Test accuracy:', rf_test_score)

# Decision tree
dt_train_score, dt_test_score = train_dt_model(
    train_data=df_first_attempt_train,
    test_data=df_first_attempt_test,
)
print('Avg train accuracy:', dt_train_score)
print('Test accuracy:', dt_test_score)

import numpy as np import matplotlib.pyplot as plt import seaborn as sns pd.options.mode.chained_assignment = None

# --- Second attempt: keep only the columns that are more than 90% populated ---
# Boolean Series indexed by column name: True where the percentage of
# non-null values is strictly above 90.
null_features_filter = df_initial.notna().sum() * 100 / len(df_initial) > 90.0

df_second_attempt = change_category_to_code(
    df_initial.loc[:, null_features_filter]
)
df_second_attempt_train, df_second_attempt_test = train_test_split(
    df_second_attempt,
    test_size=0.2,
)

rf_train_score, rf_test_score = train_rf_model(
    train_data=df_second_attempt_train,
    test_data=df_second_attempt_test,
)
print('Avg train accuracy:', rf_train_score)
print('Test accuracy:', rf_test_score)

# --- Third attempt: same features, natural log of the target ---
df_third_attempt = df_second_attempt.copy()
# The copy holds the same values as the source frame, so the log can be
# computed from either one.
df_third_attempt['SalePrice'] = np.log(df_second_attempt['SalePrice'])
df_third_attempt_train, df_third_attempt_test = train_test_split(
    df_third_attempt,
    test_size=0.2,
)

rf_train_score, rf_test_score = train_rf_model(
    train_data=df_third_attempt_train,
    test_data=df_third_attempt_test,
)
print('Avg train accuracy:', rf_train_score)
print('Test accuracy:', rf_test_score)

# --- Tuning: n_estimators ---
# 20 evenly spaced values between 1 and 200, truncated to integers.
rf_estimators_range = list(map(int, np.linspace(1, 200, 20)))

rf_estimators_scores, rf_estimators_val_scores = tunning_rf_model(
    train_data=df_third_attempt_train,
    parameter_name='n_estimators',
    parameter_range=rf_estimators_range,
)

analyse_with_plot(
    x=rf_estimators_range,
    scores=rf_estimators_scores,
    val_scores=rf_estimators_val_scores,
    x_label='Number of estimatior',
    title='Accuracy according to the estimatior count',
)

# Narrower candidate list (5 values between 100 and 200); this is the range
# that ends up in the grid-search parameters further down.
rf_estimators_range = list(map(int, np.linspace(100, 200, 5)))

# --- Tuning: criterion ---
rf_criterion_range = ["squared_error", "absolute_error", "poisson"]

rf_criterion_scores, rf_criterion_val_scores = tunning_rf_model(
    train_data=df_third_attempt_train,
    parameter_name='criterion',
    parameter_range=rf_criterion_range,
)

analyse_with_bar(
    x=rf_criterion_range,
    scores=rf_criterion_scores,
    val_scores=rf_criterion_val_scores,
    x_label="Criterion's type",
    title="Accuracy according to the criterion's type",
)

# Candidate list kept for the grid search: "poisson" is dropped.
rf_criterion_range = ["squared_error", "absolute_error"]

# --- Tuning: max_features ---
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3.  For
# RandomForestRegressor it meant "consider every feature", which is spelled
# 1.0 here so that the validation curve (run with error_score='raise') and the
# grid search that reuses this list keep working on current releases.
rf_max_features_range = [1.0, 'sqrt']

rf_max_features_scores, rf_max_features_val_scores = tunning_rf_model(
    df_third_attempt_train, 'max_features', rf_max_features_range)

# Tick labels are passed as strings so the float and the string candidate are
# rendered the same way.
analyse_with_bar(
    [str(value) for value in rf_max_features_range],
    rf_max_features_scores,
    rf_max_features_val_scores,
    'Max features',
    'Accuracy according to the max features',
)

# --- Tuning: max_depth ---
# Depths 10, 30, ..., 190.
rf_max_depth_range = np.arange(10, 210, 20)

rf_max_depth_scores, rf_max_depth_val_scores = tunning_rf_model(
    train_data=df_third_attempt_train,
    parameter_name='max_depth',
    parameter_range=rf_max_depth_range,
)

analyse_with_plot(
    x=rf_max_depth_range,
    scores=rf_max_depth_scores,
    val_scores=rf_max_depth_val_scores,
    x_label='Max depth',
    title='Accuracy according to the max depth',
)

# Narrower candidate list (70, 90, 110, 130) reused by the grid search below.
rf_max_depth_range = np.arange(70, 150, 20)

# --- Tuning: min_samples_split ---
# Integers 2 through 10.
rf_min_samples_split_range = np.arange(2, 11, 1)

rf_min_samples_split_scores, rf_min_samples_split_val_scores = tunning_rf_model(
    train_data=df_third_attempt_train,
    parameter_name='min_samples_split',
    parameter_range=rf_min_samples_split_range,
)

analyse_with_plot(
    x=rf_min_samples_split_range,
    scores=rf_min_samples_split_scores,
    val_scores=rf_min_samples_split_val_scores,
    x_label='Min samples split',
    title='Accuracy according to the min samples split',
)

# Narrower candidate list (4 through 7) reused by the grid search below.
rf_min_samples_split_range = np.arange(4, 8, 1)

# --- Tuning: min_samples_leaf ---
# Integers 1 through 4.
rf_min_samples_leaf_range = np.arange(1, 5, 1)

rf_min_samples_leaf_scores, rf_min_samples_leaf_val_scores = tunning_rf_model(
    train_data=df_third_attempt_train,
    parameter_name='min_samples_leaf',
    parameter_range=rf_min_samples_leaf_range,
)

analyse_with_plot(
    x=rf_min_samples_leaf_range,
    scores=rf_min_samples_leaf_scores,
    val_scores=rf_min_samples_leaf_val_scores,
    x_label='Min number of sample at each leaf node',
    title='Accuracy according to the min number of sample at each leaf node',
)

# Narrower candidate list (1 through 3) reused by the grid search below.
rf_min_samples_leaf_range = np.arange(1, 4, 1)

# --- Grid search over the narrowed ranges, then final evaluation ---
rf_parameters = dict(
    n_estimators=rf_estimators_range,
    criterion=rf_criterion_range,
    max_features=rf_max_features_range,
    max_depth=rf_max_depth_range,
    min_samples_split=rf_min_samples_split_range,
    min_samples_leaf=rf_min_samples_leaf_range,
)
rf_best_param, rf_best_model = find_best_rf_model(
    train_data=df_third_attempt_train,
    params=rf_parameters,
)

print("Best parameters found :")
print(rf_best_param)

# Cross-validate and refit the selected estimator, then score it on the
# held-out split of the third attempt.
rf_train_score, rf_test_score = train_model(
    model=rf_best_model,
    train_data=df_third_attempt_train,
    test_data=df_third_attempt_test,
)
print('Final avg train accuracy:', rf_train_score)
print('Final test accuracy:', rf_test_score)