import numpy as np
import pandas as pd
from geopy.geocoders import Nominatim
from sklearn.covariance import GraphicalLasso
import sklearn
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from scipy import stats
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
df=pd.read_pickle("./cleanData_model.pkl")
df.info()
Data Preprocessing
target='SALEPRICE'
numerical=[
# 'lng-zip',
# 'lat-zip',
# 'LOTAREA',
'saledate-int',
'prevsaledate-int',
'prevsaledate2-int',
'PREVSALEPRICE',
'PREVSALEPRICE2',
'FINISHEDLIVINGAREA',
# 'STORIES',
'YEARBLT',
'CARDNUMBER',
'BSMTGARAGE',
'FIREPLACES',
'HALFBATHS',
'FULLBATHS',
'TOTALROOMS'
]
categorical=[
# 'SCHOOLCODE',
# 'TAXCODE',
# 'TAXSUBCODE',
'OWNERCODE',
'CLASS',
'ROOF',
'BASEMENT',
# 'LOCALLAND',
'FAIRMARKETBUILDING',
'FAIRMARKETLAND',
'STYLE',
'GRADE',
'CONDITION',
# 'HEATINGCOOLING',
'PARID',
'MUNICODE',
'USECODE',
# 'HOMESTEADFLAG',
# 'FARMSTEADFLAG',
'CLEANGREEN',
# 'ABATEMENTFLAG',
'SALECODE',
'COUNTYBUILDING',
'COUNTYLAND',
'EXTERIORFINISH',
# 'CDU'
]
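If the categorical columns are nominal codes rather than ordered quantities, one-hot encoding is the usual alternative to standardizing the raw codes. A minimal sketch assuming the numerical and categorical lists above; this is not what the notebook does below:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# HYPOTHETICAL PREPROCESSOR: SCALE NUMERIC COLUMNS, ONE-HOT ENCODE CATEGORICAL ONES
preprocess = ColumnTransformer([
    ('num', StandardScaler(), numerical),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
])
# X_encoded = preprocess.fit_transform(df[numerical + categorical])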
# USING THE FULL DATASET; UNCOMMENT BELOW FOR A SMALL RANDOM SAMPLE FOR PROTOTYPING PURPOSES
df_sample=df
# df_sample=df.sample(n=10000,random_state=11)
#SEPARATING FEATURE DATA
X=df_sample[numerical+categorical]
# X=df_sample.loc[:, df_sample.columns != target]
#SEPARATING TARGET DATA
y=df_sample[target]
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# ALLOCATING TRAIN AND TEST DATA (30% TEST AND 70% TRAIN)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
# STANDARDIZING DATA (SCALER FIT ON TRAINING DATA ONLY TO AVOID LEAKAGE INTO THE TEST SET)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
Linear Model: Lasso
from sklearn.linear_model import LassoCV
reg = LassoCV()
reg.fit(X_train, y_train)
print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
print("Best test score using built-in LassoCV: %f" %reg.score(X_test,y_test))
print("Best train score using built-in LassoCV: %f" %reg.score(X_train,y_train))
coef = pd.Series(reg.coef_, index=X.columns)
imp_coef = coef.sort_values()
imp_coef.plot(kind="barh")  # LOG SCALE OMITTED: LASSO COEFFICIENTS CAN BE NEGATIVE
plt.title("Feature importance using Lasso Model")
Random Forest
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(n_estimators=100, max_depth=20, random_state=0)
regr.fit(X_train, y_train)
print('Test score: ',regr.score(X_test, y_test))
print('Train score: ',regr.score(X_train, y_train))
feature_importance = regr.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
Bagging Regressor
from sklearn.ensemble import BaggingRegressor
bg = BaggingRegressor(n_estimators=30, oob_score=True)
bg.fit(X_train, y_train)
print('Test score:', bg.score(X_test, y_test))
print('Train score:', bg.score(X_train, y_train))
print('OOB score:', bg.oob_score_)  # AVAILABLE BECAUSE oob_score=True ABOVE
#COMPUTING FEATURE IMPORTANCE AS THE MEAN OVER THE BAGGED ESTIMATORS
feature_importance = np.mean([
tree.feature_importances_ for tree in bg.estimators_
], axis=0)
#NORMALIZE FEATURE IMPORTANCE
feature_importance = 100.0 * (feature_importance / feature_importance.max())
#SORT FEATURE IMPORTANCE
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
#PLOT RESULTS
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
XGBoost
import xgboost as xg
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
# Instantiation
xgb_r = xg.XGBRegressor(objective ='reg:squarederror',
n_estimators = 110, seed = 10)
# Fitting the model
xgb_r.fit(X_train, y_train)
# Predicting on test and train data
test_pred = xgb_r.predict(X_test)
train_pred = xgb_r.predict(X_train)
# R2 score computation (taking the square root of R^2 is undefined when R^2 < 0)
test_score = r2_score(y_test, test_pred)
train_score = r2_score(y_train, train_pred)
print("Test R^2 : %f" % test_score)
print("Train R^2 : %f" % train_score)
sorted_idx = xgb_r.feature_importances_.argsort()
plt.barh(X.keys()[sorted_idx], xgb_r.feature_importances_[sorted_idx])
plt.xlabel("Xgboost Feature Importance")
from sklearn.inspection import permutation_importance
#LOOK AT THE PERMUTATION IMPORTANCE OF EACH VARIABLE
perm_importance = permutation_importance(xgb_r, X_test, y_test)
#SORTING AND PLOTTING THE PERMUTATION IMPORTANCE OF EACH FEATURE
sorted_idx = perm_importance.importances_mean.argsort()
plt.barh(X.keys()[sorted_idx], perm_importance.importances_mean[sorted_idx])
plt.xlabel("Permutation Importance")
AdaBoost
from sklearn.ensemble import AdaBoostRegressor
regr = AdaBoostRegressor(random_state=10, n_estimators=4)
regr.fit(X_train, y_train)
print('Test score:',regr.score(X_test, y_test))
print('Train score:', regr.score(X_train, y_train))
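n_estimators=4 is very small for AdaBoost; a minimal sweep sketch to check whether more estimators help (the values are illustrative, not tuned):
for n in [4, 25, 50, 100]:
    ada = AdaBoostRegressor(random_state=10, n_estimators=n)
    ada.fit(X_train, y_train)
    print(n, 'test:', ada.score(X_test, y_test), 'train:', ada.score(X_train, y_train))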
SVM
from sklearn.svm import SVR
SV = SVR(C=1, kernel='poly', epsilon=0.05, gamma='auto')
SV.fit(X_train,y_train)
print('Test score:',SV.score(X_test, y_test))
print('Train score:', SV.score(X_train, y_train))
pickle.dump(SV, open('models/SV.pkl', 'wb'))
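Kernel SVR scales poorly with sample size (training is roughly quadratic in the number of rows); a sketch of a faster linear-kernel baseline for comparison (LinearSVR is an assumption here, not part of the original analysis):
from sklearn.svm import LinearSVR
lin_sv = LinearSVR(C=1, epsilon=0.05, max_iter=10000)  # LINEAR KERNEL ONLY, MUCH FASTER
lin_sv.fit(X_train, y_train)
print('Test score:', lin_sv.score(X_test, y_test))
print('Train score:', lin_sv.score(X_train, y_train))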
SGD Regressor
from sklearn.linear_model import SGDRegressor
SGD = SGDRegressor(max_iter=1000, tol=1e-3)
SGD.fit(X_train, y_train)
print('Test score:', SGD.score(X_test, y_test))
print('Train score:', SGD.score(X_train, y_train))
XGBoost Hyperparameter Tuning
import xgboost as xg
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
# Instantiation
xgb = xg.XGBRegressor(objective='reg:squarederror', seed=10)
param_dic = {'n_estimators': np.arange(110, 170, 20)}
grid = GridSearchCV(estimator=xgb, param_grid=param_dic, cv=4, verbose=3)
grid.fit(X_train,y_train)
pickle.dump(grid, open('models/Grid-XGB2.pkl', 'wb'))
print(grid.cv_results_)
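The plotting cell below loads models/Grid-XGB-data.pkl, which is never written in this section; a sketch of how that dict could be assembled from grid.cv_results_ (the key names xx, yy, tt are inferred from the loading cell):
results = grid.cv_results_
dictt = {
    'xx': list(results['param_n_estimators']),  # GRID OF n_estimators VALUES
    'yy': list(results['mean_test_score']),     # CV MEAN TEST SCORE PER VALUE
    'tt': list(results['mean_fit_time']),       # CV MEAN FIT TIME (SECONDS) PER VALUE
}
pickle.dump(dictt, open('models/Grid-XGB-data.pkl', 'wb'))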
dictt = pickle.load(open('models/Grid-XGB-data.pkl', 'rb'))
xx=dictt['xx']
yy=dictt['yy']
tt=dictt['tt']
fig, ax1 = plt.subplots()
ax1.plot(np.array(xx),np.array(yy),'o-',c='royalblue')
ax1.set_xlabel('Number of Estimators')
ax1.set_ylabel('CV Mean Test Score', color='royalblue')
ax1.tick_params(axis='y', labelcolor='royalblue')
ax2 = ax1.twinx()
ax2.plot(np.array(xx),np.array(tt),'o-',c='orange')
ax2.set_ylabel('CV Mean Training Time', color='orange')
ax2.tick_params(axis='y', labelcolor='orange')
plt.savefig('figs/CV.png',format='png',dpi=300)