import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, max_error, r2_score
import math
# Load the labelled training listings and the listings that need a price estimate.
train = pd.read_csv('diamonds_train.csv')
score = pd.read_csv('diamonds_score.csv')

# Quick inspection of both frames. The bare expressions only render output
# when run cell-by-cell in a notebook; in a plain script they are no-ops.
score.head()
train.head()
train.info()

# Compare how the 'shape' values are distributed in each file.
train['shape'].hist(xrot=45.0)
score['shape'].hist(xrot=45.0)
score.describe().T

# Narrow the training data to round stones only.
round_mask = train['shape'].isin(['Round'])
round_diamonds = train.loc[round_mask]
round_diamonds.describe().T

# Further narrow to stones lighter than 2.5 ct.
small_mask = round_diamonds['weight_ct'].lt(2.5)
small_round_diamonds = round_diamonds.loc[small_mask]
small_round_diamonds.describe().T

# Plot weight against price on a random 10% sample to keep the figure light.
pairplot_sample = small_round_diamonds.loc[:, ['weight_ct', 'price']].sample(frac=0.1)
sns.pairplot(pairplot_sample)
# Stack the training rows on top of the score rows so that one-hot encoding
# produces the same dummy columns for both. pd.concat is used because
# DataFrame.append was removed in pandas 2.0.
train_score = pd.concat([small_round_diamonds, score], ignore_index=True)
train_score_transformed = pd.get_dummies(train_score, drop_first=True)

# Split the encoded frame back apart at the real train/score boundary instead
# of assuming the score file always holds exactly 10 rows. A positive offset
# also behaves correctly when the score file is empty.
n_train_rows = len(small_round_diamonds)
train_transformed = train_score_transformed.iloc[:n_train_rows, :]
score_transformed = train_score_transformed.iloc[n_train_rows:, :]
print('Train shape: ', train_transformed.shape)
print('Score shape: ', score_transformed.shape)
# Separate the target column from the feature columns.
y = train_transformed['price']
X = train_transformed.drop(columns=['price'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Fit an ordinary least-squares model (fit returns the estimator itself).
diamond_price_model = LinearRegression().fit(X_train, y_train)

# Predict on the held-out rows.
y_pred = diamond_price_model.predict(X_test)
y_pred

# Report the evaluation metrics on the held-out rows.
print('Regression model performance evaluation')
for metric_label, metric_value in (
    ('R2: ', r2_score(y_test, y_pred)),
    ('MAE: ', mean_absolute_error(y_test, y_pred)),
    ('RMSE: ', math.sqrt(mean_squared_error(y_test, y_pred))),
    ('Max error: ', max_error(y_test, y_pred)),
):
    print(metric_label, metric_value)
# Predict a price for every row of the score set using the fitted model.
# NOTE(review): the model was fitted only on round diamonds under 2.5 ct,
# while the score rows are not filtered the same way -- confirm the score
# file only contains comparable stones, otherwise these are extrapolations.
score_features = score_transformed.drop(['price'], axis=1)
y_score = diamond_price_model.predict(score_features)

# Work on a copy so the encoded score frame itself is left untouched.
results = score_transformed.copy()
results['predicted_price'] = y_score

# A listing priced below the model's estimate is 'undervalued'; one priced
# at or above the estimate is 'overvalued'. The labels were previously
# swapped, tagging listings priced above the estimate as 'undervalued'.
results['assessment'] = np.where(
    results['price'] < results['predicted_price'], 'undervalued', 'overvalued'
)
results[['price', 'predicted_price', 'assessment']]