import pandas as pd
# Load the dataset from a comma-separated CSV (path is relative to the working directory).
df_import = pd.read_csv('./data/data_immobilier.csv', sep=',')
# Bare expression: renders the frame when run as a notebook cell; it has no effect in a plain script.
df_import
# Print the column dtypes and non-null counts of the loaded frame.
df_import.info()
# Keep every column except the first one.
# NOTE(review): presumably the first column is a row identifier — confirm against the CSV.
df_initial = df_import.iloc[:, 1:]
# sklearn models
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# sklearn helpers
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import validation_curve
def change_category_to_code(df):
    """Return an integer-encoded, NaN-free copy of ``df``.

    Every column whose dtype is ``object`` is converted to a pandas
    categorical and replaced by its integer category codes. Rows that still
    contain a missing value afterwards are dropped. The input frame is left
    untouched.

    Note that pandas encodes a missing value in a categorical as the code
    ``-1``, so missing entries in object columns are kept as ``-1`` rather
    than being removed by the final ``dropna``.

    Args:
        df: The frame to encode.

    Returns:
        A new DataFrame with object columns replaced by category codes and
        rows containing NaN removed.
    """
    encoded = df.copy()
    # Only the columns stored with the generic ``object`` dtype are re-encoded.
    for name in encoded.select_dtypes(include=object).columns:
        as_category = encoded[name].astype('category')
        encoded[name] = as_category.cat.codes
    return encoded.dropna()
def train_model(model, train_data, test_data):
    """Cross-validate ``model`` on the training frame, then fit and score it on the test frame.

    The target column is ``'SalePrice'``; every other column is used as an
    input feature. The test features are selected by name in the same order
    as the training features, so a test frame whose columns are ordered
    differently is still scored against the right inputs.

    Args:
        model: An estimator exposing ``fit`` and ``score`` and accepted by
            ``cross_val_score``.
        train_data: DataFrame holding the training features and ``'SalePrice'``.
        test_data: DataFrame holding the same columns as ``train_data``.

    Returns:
        A ``(train_score, test_score)`` tuple: the mean of the
        cross-validation scores on the training data, and ``model.score`` on
        the test data after fitting on the full training data. The values
        are whatever metric the estimator's ``score`` reports (R² for the
        scikit-learn regressors used in this file), not a classification
        accuracy.

    Raises:
        KeyError: If ``'SalePrice'`` is missing from either frame, or if
            ``test_data`` lacks one of the training feature columns.
    """
    target_feature = 'SalePrice'

    # Split inputs and output by column name.
    x1 = train_data.drop(columns=target_feature)
    y1 = train_data[target_feature].to_numpy().ravel()
    # Align the test features on the training columns instead of reusing a
    # positional mask computed from the training frame.
    x2 = test_data[x1.columns]
    y2 = test_data[target_feature].to_numpy().ravel()

    # Estimate generalisation on the training data with cross-validation
    # (default splitting and scoring).
    train_accuracy = cross_val_score(model, x1, y1).mean()

    # Fit on the full training set, then score on the held-out test set.
    model.fit(x1, y1)
    test_accuracy = model.score(x2, y2)
    return train_accuracy, test_accuracy
def train_ln_model(train_data, test_data):
    """Evaluate a default ``LinearRegression`` with ``train_model``.

    Args:
        train_data: Training DataFrame passed through to ``train_model``.
        test_data: Test DataFrame passed through to ``train_model``.

    Returns:
        The ``(train_score, test_score)`` tuple returned by ``train_model``.
    """
    return train_model(linear_model.LinearRegression(), train_data, test_data)
def train_rf_model(train_data, test_data):
    """Evaluate a default ``RandomForestRegressor`` with ``train_model``.

    Args:
        train_data: Training DataFrame passed through to ``train_model``.
        test_data: Test DataFrame passed through to ``train_model``.

    Returns:
        The ``(train_score, test_score)`` tuple returned by ``train_model``.
    """
    return train_model(RandomForestRegressor(), train_data, test_data)
def train_dt_model(train_data, test_data):
    """Evaluate a default ``DecisionTreeRegressor`` with ``train_model``.

    Args:
        train_data: Training DataFrame passed through to ``train_model``.
        test_data: Test DataFrame passed through to ``train_model``.

    Returns:
        The ``(train_score, test_score)`` tuple returned by ``train_model``.
    """
    return train_model(DecisionTreeRegressor(), train_data, test_data)
def tunning_rf_model(train_data, parameter_name, parameter_range):
    """Compute a validation curve for one random-forest hyper-parameter.

    A default ``RandomForestRegressor`` is evaluated with
    ``validation_curve`` for each value in ``parameter_range``. The
    ``'SalePrice'`` column is the target and every other column is a
    feature. Any fitting error is re-raised (``error_score='raise'``) and
    all available cores are used (``n_jobs=-1``).

    Args:
        train_data: DataFrame holding the features and ``'SalePrice'``.
        parameter_name: Name of the estimator parameter to vary.
        parameter_range: Values to try for that parameter.

    Returns:
        A ``(score, val_score)`` tuple of 1-D arrays, one entry per
        parameter value: the training scores and the validation scores, each
        averaged across the cross-validation folds.
    """
    # Boolean mask marking the target column; its negation selects the features.
    is_target = train_data.columns == 'SalePrice'
    features = train_data.loc[:, ~is_target]
    target = train_data.loc[:, is_target].values.ravel()

    train_scores, validation_scores = validation_curve(
        estimator=RandomForestRegressor(),
        X=features,
        y=target,
        param_name=parameter_name,
        param_range=parameter_range,
        verbose=1,
        error_score='raise',
        n_jobs=-1,
    )
    # Average over the fold axis so each parameter value gets a single score.
    return train_scores.mean(axis=1), validation_scores.mean(axis=1)
def find_best_rf_model(train_data, params):
    """Grid-search random-forest hyper-parameters on the training frame.

    The ``'SalePrice'`` column is the target and every other column is a
    feature. The search runs on all available cores (``n_jobs=-1``) with
    verbose progress output.

    Args:
        train_data: DataFrame holding the features and ``'SalePrice'``.
        params: Parameter grid forwarded to ``GridSearchCV``.

    Returns:
        A ``(best_params, best_model)`` tuple taken from the fitted search's
        ``best_params_`` and ``best_estimator_`` attributes.
    """
    # Boolean mask marking the target column; its negation selects the features.
    is_target = train_data.columns == 'SalePrice'
    features = train_data.loc[:, ~is_target]
    target = train_data.loc[:, is_target].values.ravel()

    search = GridSearchCV(RandomForestRegressor(), params, verbose=3, n_jobs=-1)
    search.fit(features, target)
    return search.best_params_, search.best_estimator_
def analyse_with_plot(x, scores, val_scores, x_label, title, figsize=(12, 6)):
    """Plot training and validation scores against a hyper-parameter as line charts.

    Args:
        x: Hyper-parameter values, used as the x coordinates.
        scores: Training score for each value in ``x``.
        val_scores: Validation score for each value in ``x``.
        x_label: Label of the x axis.
        title: Title of the figure.
        figsize: Figure size forwarded to ``plt.figure``.
    """
    plt.figure(figsize=figsize)
    plt.plot(x, scores, linestyle='dashed', marker='o', label='score')
    plt.plot(x, val_scores, linestyle='dashed', marker='o', label='validation score')
    plt.xlabel(x_label)
    plt.ylabel('Accuracy')
    plt.title(title)
    plt.legend()
    # The on/off flag is passed positionally: its keyword was renamed from
    # ``b`` to ``visible`` in Matplotlib 3.5, and the old name is no longer
    # accepted by recent releases.
    plt.grid(True, which='major', color='#bdc3c7', linestyle='--')
    plt.show()
def analyse_with_bar(x, scores, val_scores, x_label, title, figsize=(12, 6)):
    """Plot training and validation scores per hyper-parameter value as grouped bars.

    Args:
        x: Hyper-parameter values, used as the tick labels.
        scores: Training score for each value in ``x``.
        val_scores: Validation score for each value in ``x``.
        x_label: Label of the x axis.
        title: Title of the figure.
        figsize: Figure size forwarded to ``plt.figure``.
    """
    x_values = np.arange(len(x))
    bar_width = 0.15

    # Honour the caller's figsize (it used to be ignored in favour of a
    # hard-coded (12, 6), which is still the default).
    plt.figure(figsize=figsize)
    plt.bar(x_values, scores, width=bar_width, label='score')
    # Validation bars are shifted one bar width to the left of the score bars.
    plt.bar(x_values - bar_width, val_scores, width=bar_width, label='validation score')
    plt.xlabel(x_label)
    plt.ylabel('Accuracy')
    # One tick per score bar, labelled with the corresponding parameter value.
    plt.xticks(list(range(len(scores))), x)
    plt.title(title)
    plt.legend()
    plt.show()
# First attempt: encode object columns as category codes and drop rows with missing values.
df_first_attempt = change_category_to_code(df_initial)
# Hold out 20% of the rows for testing; no random_state is passed, so the split changes between runs.
df_first_attempt_train, df_first_attempt_test = train_test_split(df_first_attempt, test_size=0.2)
# Baseline 1: linear regression.
lr_train_score, lr_test_score = train_ln_model(df_first_attempt_train, df_first_attempt_test)
print('Avg train accuracy:', lr_train_score)
print('Test accuracy:', lr_test_score)
# Baseline 2: random forest.
rf_train_score, rf_test_score = train_rf_model(df_first_attempt_train, df_first_attempt_test)
print('Avg train accuracy:', rf_train_score)
print('Test accuracy:', rf_test_score)
# Baseline 3: decision tree.
dt_train_score, dt_test_score = train_dt_model(df_first_attempt_train, df_first_attempt_test)
print('Avg train accuracy:', dt_train_score)
print('Test accuracy:', dt_test_score)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Silence pandas' chained-assignment warning for the rest of the session.
pd.options.mode.chained_assignment = None
# Second attempt: keep only the columns whose share of non-null values is strictly above 90%.
null_features_filter = ((df_initial.notnull().sum() * 100) / len(df_initial)) > 90.0
df_second_attempt = df_initial.loc[:, null_features_filter]
# Encode the remaining object columns and drop rows that still contain missing values.
df_second_attempt = change_category_to_code(df_second_attempt)
# Fresh 80/20 split (no random_state, so it differs between runs).
df_second_attempt_train, df_second_attempt_test = train_test_split(df_second_attempt, test_size=0.2)
# Re-evaluate the random forest on the filtered data; this overwrites the earlier rf_* scores.
rf_train_score, rf_test_score = train_rf_model(df_second_attempt_train, df_second_attempt_test)
print('Avg train accuracy:', rf_train_score)
print('Test accuracy:', rf_test_score)
# Third attempt: same features as the second attempt, with a log-transformed target.
df_third_attempt = df_second_attempt.copy()
# Replace SalePrice by its natural logarithm; scores from here on are measured on the log scale.
df_third_attempt['SalePrice'] = np.log(df_third_attempt['SalePrice'])
# Fresh 80/20 split (no random_state, so it differs between runs).
df_third_attempt_train, df_third_attempt_test = train_test_split(df_third_attempt, test_size=0.2)
# Re-evaluate the random forest; this overwrites the earlier rf_* scores.
rf_train_score, rf_test_score = train_rf_model(df_third_attempt_train, df_third_attempt_test)
print('Avg train accuracy:', rf_train_score)
print('Test accuracy:', rf_test_score)
# Validation curve for n_estimators: 20 evenly spaced values between 1 and 200, truncated to integers.
rf_estimators_range = [int(x) for x in np.linspace(1, 200, 20)]
rf_estimators_scores, rf_estimators_val_scores = tunning_rf_model(
df_third_attempt_train,
'n_estimators',
rf_estimators_range)
# Plot training vs. validation score for each estimator count.
analyse_with_plot(rf_estimators_range,
rf_estimators_scores,
rf_estimators_val_scores,
'Number of estimatior',
'Accuracy according to the estimatior count')
# Narrowed range (5 values between 100 and 200) that is later used in the grid-search parameters.
rf_estimators_range = [int(x) for x in np.linspace(100, 200, 5)]
# Validation curve for the split criterion, over three candidate criteria.
rf_criterion_range = ["squared_error", "absolute_error", "poisson"]
rf_criterion_scores, rf_criterion_val_scores = tunning_rf_model(
df_third_attempt_train,
'criterion',
rf_criterion_range)
# Bar chart of training vs. validation score for each criterion.
analyse_with_bar(rf_criterion_range,
rf_criterion_scores,
rf_criterion_val_scores,
'Criterion\'s type',
'Accuracy according to the criterion\'s type')
# Narrowed list (without "poisson") that is later used in the grid-search parameters.
rf_criterion_range = ["squared_error", "absolute_error"]
# Validation curve for max_features. ``1.0`` (consider every feature at each
# split) replaces the legacy ``'auto'`` alias: for RandomForestRegressor the
# two mean the same thing, but ``'auto'`` was deprecated in scikit-learn 1.1
# and is rejected from 1.3 on, which aborts the run because the validation
# curve is computed with error_score='raise'.
rf_max_features_range = [1.0, 'sqrt']
rf_max_features_scores, rf_max_features_val_scores = tunning_rf_model(
    df_third_attempt_train,
    'max_features',
    rf_max_features_range)
# Bar chart of training vs. validation score for each max_features setting.
analyse_with_bar(rf_max_features_range,
                 rf_max_features_scores,
                 rf_max_features_val_scores,
                 'Max features',
                 'Accuracy according to the max features')
# Validation curve for max_depth: 10, 30, ..., 190.
rf_max_depth_range = np.arange(10, 210, 20)
rf_max_depth_scores, rf_max_depth_val_scores = tunning_rf_model(
df_third_attempt_train,
'max_depth',
rf_max_depth_range)
# Plot training vs. validation score for each depth.
analyse_with_plot(rf_max_depth_range,
rf_max_depth_scores,
rf_max_depth_val_scores,
'Max depth',
'Accuracy according to the max depth')
# Narrowed range (70, 90, 110, 130) that is later used in the grid-search parameters.
rf_max_depth_range = np.arange(70, 150, 20)
# Validation curve for min_samples_split: every integer from 2 to 10.
rf_min_samples_split_range = np.arange(2, 11, 1)
rf_min_samples_split_scores, rf_min_samples_split_val_scores = tunning_rf_model(
df_third_attempt_train,
'min_samples_split',
rf_min_samples_split_range)
# Plot training vs. validation score for each min_samples_split value.
analyse_with_plot(rf_min_samples_split_range,
rf_min_samples_split_scores,
rf_min_samples_split_val_scores,
'Min samples split',
'Accuracy according to the min samples split')
# Narrowed range (4 to 7) that is later used in the grid-search parameters.
rf_min_samples_split_range = np.arange(4, 8, 1)
# Validation curve for min_samples_leaf: every integer from 1 to 4.
rf_min_samples_leaf_range = np.arange(1, 5, 1)
rf_min_samples_leaf_scores, rf_min_samples_leaf_val_scores = tunning_rf_model(
df_third_attempt_train,
'min_samples_leaf',
rf_min_samples_leaf_range)
# Plot training vs. validation score for each min_samples_leaf value.
analyse_with_plot(rf_min_samples_leaf_range,
rf_min_samples_leaf_scores,
rf_min_samples_leaf_val_scores,
'Min number of sample at each leaf node',
'Accuracy according to the min number of sample at each leaf node')
# Narrowed range (1 to 3) that is later used in the grid-search parameters.
rf_min_samples_leaf_range = np.arange(1, 4, 1)
# Grid-search space assembled from the narrowed ranges defined after each validation curve.
rf_parameters = {
'n_estimators': rf_estimators_range,
'criterion': rf_criterion_range,
'max_features': rf_max_features_range,
'max_depth': rf_max_depth_range,
'min_samples_split': rf_min_samples_split_range,
'min_samples_leaf': rf_min_samples_leaf_range
}
# Exhaustive search over every combination of the values above, on the third-attempt training data.
rf_best_param, rf_best_model = find_best_rf_model(df_third_attempt_train, rf_parameters)
print("Best parameters found :")
print(rf_best_param)
# Final evaluation: train_model cross-validates the best estimator, refits it on the training data and scores it on the test split.
rf_train_score, rf_test_score = train_model(rf_best_model, df_third_attempt_train, df_third_attempt_test)
print('Final avg train accuracy:', rf_train_score)
print('Final test accuracy:', rf_test_score)