# Diamonds MLR (multiple linear regression)

import math

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, r2_score
from sklearn.model_selection import train_test_split

# Load the training data and the diamonds that will be scored by the model.
train = pd.read_csv('diamonds_train.csv')
score = pd.read_csv('diamonds_score.csv')

# First look at both data sets.
score.head()
train.head()
train.info()

# Histogram of the `shape` column in each data set (x tick labels rotated 45 degrees).
train['shape'].hist(xrot=45.0)
score['shape'].hist(xrot=45.0)

score.describe().T

# Keep only the training rows whose shape is 'Round'.
is_round = train['shape'].isin(['Round'])
round_diamonds = train.loc[is_round]
round_diamonds.describe().T

# Of those, keep only the rows with weight_ct below 2.5.
is_small = round_diamonds['weight_ct'].lt(2.5)
small_round_diamonds = round_diamonds.loc[is_small]
small_round_diamonds.describe().T

# Pair plot of weight against price on a random 10% sample of the filtered rows.
weight_and_price = small_round_diamonds[['weight_ct', 'price']]
sns.pairplot(weight_and_price.sample(frac=.1))

# Stack the filtered training rows on top of the scoring rows so that the
# one-hot encoding below produces the same dummy columns for both.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same combined frame.
train_score = pd.concat([small_round_diamonds, score], ignore_index=True)

# One-hot encode the categorical columns, dropping the first level of each.
train_score_transformed = pd.get_dummies(train_score, drop_first=True)

# Split the combined frame back apart by position. The boundary is the real
# number of training rows, so the split stays correct for any number of
# scoring rows (the previous code assumed exactly 10).
n_train_rows = len(small_round_diamonds)
train_transformed = train_score_transformed.iloc[:n_train_rows, :]
score_transformed = train_score_transformed.iloc[n_train_rows:, :]

print('Train shape: ', train_transformed.shape)
print('Score shape: ', score_transformed.shape)

# Features are every column except the target `price`.
X = train_transformed.drop(['price'], axis=1)
y = train_transformed['price']

# Hold out 30% of the training rows for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Fit an ordinary least-squares model on the training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
diamond_price_model = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out split.
y_pred = diamond_price_model.predict(X_test)
y_pred

# Each entry pairs the printed label with the function that computes the metric
# from (actual, predicted); metrics are evaluated one at a time as they are printed.
metric_report = (
    ('R2: ', r2_score),
    ('MAE: ', mean_absolute_error),
    ('RMSE: ', lambda actual, predicted: math.sqrt(mean_squared_error(actual, predicted))),
    ('Max error: ', max_error),
)

print('Regression model performance evaluation')
for metric_label, metric_fn in metric_report:
    print(metric_label, metric_fn(y_test, y_pred))

# Predict a price for every scoring row from all columns except `price`.
score_features = score_transformed.drop(columns=['price'])
y_score = diamond_price_model.predict(score_features)

# Work on a copy of the scoring rows with the prediction attached.
results = score_transformed.assign(predicted_price=y_score)

# Rows whose `price` exceeds `predicted_price` are tagged 'undervalued'; all
# other rows (including ties and missing prices) are tagged 'overvalued'.
# NOTE(review): a price above the model's estimate being called 'undervalued'
# may be the wrong way round — confirm the intended meaning of the labels.
price_above_prediction = results['price'].gt(results['predicted_price'])
results['assessment'] = np.where(price_above_prediction, 'undervalued', 'overvalued')

results[['price', 'predicted_price', 'assessment']]