%run Utility_tools.ipynb
%run Libraries.ipynb
# Load the data set; the first CSV column is used as the row index.
raw = pd.read_csv('diamond_pred.csv', index_col=[0])

# ---- Experiment 1: linear regression on the columns left after the drop below ----
# Work on a deep copy so `raw` stays untouched for the later experiments.
df = deepcopy(raw)
df.sample(4)
df.drop(['x', 'y', 'z', 'table', 'clarity', 'depth', 'color', 'cut'], axis=1, inplace=True)
# Rotate the last column to the front.
# NOTE(review): the positional split below presumes this leaves 'price' as the
# last column -- confirm against the column order of the CSV.
df = pd.concat([df.iloc[:,-1], df.iloc[:,:-1]], axis=1)

# Pre-modeling: every column but the last is a feature, the last one is the target.
x = df.iloc[:,:-1]
y = df.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Fit the scaler on the training split only and reuse the learned statistics on
# the test split. Re-fitting a second scaler on the test data would scale the two
# splits differently and leak test-set information into preprocessing.
scaler = RobustScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

lin_mod = LinearRegression()
lin_mod.fit(x_train, y_train)
preds = lin_mod.predict(x_test)

# Evaluation: the bare expressions are kept so a notebook cell can display them.
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse
lin_mod.score(x_train, y_train)  # R^2 on the training split
r2_score(y_test, preds)          # R^2 on the test split
mean_absolute_error(y_test, preds)
normalized_rmse = rmse/(df['price'].max()-df['price'].min()) #rmse divided by the target's range
normalized_rmse
# ---- Experiment 2: linear regression on the columns left after the drop below ----
df_1 = deepcopy(raw)
df_1.drop(['size_volume', 'table', 'clarity', 'depth', 'color', 'cut'], axis=1, inplace=True)
# Move the first two columns to the end.
# NOTE(review): the positional split below presumes this leaves 'price' as the
# last column -- confirm against the column order of the CSV.
df_1 = pd.concat([df_1.iloc[:,2:], df_1.iloc[:,:2]], axis=1)

# Pre-modeling: every column but the last is a feature, the last one is the target.
x = df_1.iloc[:,:-1]
y = df_1.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Fit the scaler on the training split only and reuse the learned statistics on
# the test split. Re-fitting a second scaler on the test data would scale the two
# splits differently and leak test-set information into preprocessing.
scaler = RobustScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

lin_mod = LinearRegression()
lin_mod.fit(x_train, y_train)
preds = lin_mod.predict(x_test)

# Evaluation: the bare expressions are kept so a notebook cell can display them.
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse
lin_mod.score(x_train, y_train)  # R^2 on the training split
r2_score(y_test, preds)          # R^2 on the test split
mean_absolute_error(y_test, preds)
# Normalise by the price range of this experiment's own frame rather than
# reaching back to the frame of a previous experiment.
normalized_rmse = rmse/(df_1['price'].max()-df_1['price'].min())
normalized_rmse
# ---- Experiment 3: linear regression on 'carat' and 'x' only ----
df_2 = deepcopy(raw)
# Explicit column selection puts the target, 'price', last.
df_2 = df_2[['carat', 'x', 'price']]

# Pre-modeling: every column but the last is a feature, the last one is the target.
x = df_2.iloc[:,:-1]
y = df_2.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Fit the scaler on the training split only and reuse the learned statistics on
# the test split. Re-fitting a second scaler on the test data would scale the two
# splits differently and leak test-set information into preprocessing.
scaler = RobustScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

lin_mod = LinearRegression()
lin_mod.fit(x_train, y_train)
preds = lin_mod.predict(x_test)

# Evaluation: the bare expressions are kept so a notebook cell can display them.
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse
lin_mod.score(x_train, y_train)  # R^2 on the training split
r2_score(y_test, preds)          # R^2 on the test split
mean_absolute_error(y_test, preds)
# Normalise by the price range of this experiment's own frame rather than
# reaching back to the frame of a previous experiment.
normalized_rmse = rmse/(df_2['price'].max()-df_2['price'].min())
normalized_rmse
# ---- Experiment 4: linear regression with one-hot encoded categorical features ----
df_3 = deepcopy(raw)
df_3.sample(3)
# One-hot encode the object-typed columns, put them next to the numeric ones,
# then discard the numeric columns that are not used in this experiment.
df_3_cat = df_3.select_dtypes(include='object')
df_3 = pd.concat(
    [pd.get_dummies(df_3_cat), df_3.select_dtypes(include='number')],
    axis=1,
).drop(['depth', 'table', 'size_volume'], axis=1)
# Reorder so that 'price' is the last column, as the positional split expects.
df_3 = pd.concat([df_3[[column for column in df_3.columns if column!='price']], df_3['price']], axis=1)

# Pre-modeling: every column but the last is a feature, the last one is the target.
x = df_3.iloc[:,:-1]
y = df_3.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Fit the scaler on the training split only and reuse the learned statistics on
# the test split. Re-fitting a second scaler on the test data would scale the two
# splits differently and leak test-set information into preprocessing.
scaler = RobustScaler()
x_train = pd.DataFrame(scaler.fit_transform(x_train))
x_test = pd.DataFrame(scaler.transform(x_test))

lin_mod = LinearRegression()
lin_mod.fit(x_train, y_train)
preds = lin_mod.predict(x_test)

# Evaluation: the bare expressions are kept so a notebook cell can display them.
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse
lin_mod.score(x_train, y_train)  # R^2 on the training split
r2_score(y_test, preds)          # R^2 on the test split
mean_absolute_error(y_test, preds)
normalized_rmse = rmse/(df_3['price'].max()-df_3['price'].min())
normalized_rmse #We get the best result when using categorical features as well
# ---- Random forest: hyper-parameter search, refit and evaluation ----
# Uses x_train / x_test / y_train / y_test, x and df_3 from the preceding experiment.
rfc_ = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [300, 500],
    # 1.0 (use all features) replaces the former 'auto' option, which newer
    # scikit-learn releases no longer accept for forest estimators.
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [12],
    'min_samples_split': [2, 4, 8],
    'bootstrap': [True, False],
}
CV_rfc = GridSearchCV(estimator=rfc_, param_grid=param_grid, cv=5)
CV_rfc.fit(x_train, y_train)
CV_rfc.best_params_

# Build the final model from the best parameters found by the search instead
# of hand-copying them. The target is continuous, so a regressor (not a
# classifier) is required.
rfc1 = RandomForestRegressor(random_state=42, **CV_rfc.best_params_)
rfc1.fit(x_train, y_train)
rf_pred = rfc1.predict(x_test)
# `x` is the unscaled feature frame of the preceding experiment.
plot_rf_feat_importance(rfc1, x)

rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
print(f'''Root Mean Squared Error: {rmse}''')
print(f'''R^squared on train: {rfc1.score(x_train, y_train)}''')
print(f'''R^squared on test: {r2_score(y_test, rf_pred)}''')
print(f'''Mean Absolute Error: {mean_absolute_error(y_test, rf_pred)}''')
print(f'''Normalized Root Mean Squared Error: {rmse/(df_3['price'].max()-df_3['price'].min())}''')