import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
# Silence pandas' SettingWithCopy warning globally (the code below relies on
# chained assignment in several places).
pd.options.mode.chained_assignment = None # default='warn'
%matplotlib inline
# Notebook bootstrap: install xgboost on the fly. pip's entry point moved to
# pip._internal in pip >= 10, hence the fallback import.
try:
    from pip import main as pipmain
except:
    from pip._internal import main as pipmain
pipmain(['install', "-q", "xgboost"])
from xgboost import XGBRegressor
from xgboost import XGBClassifier
# Global verbosity toggles used throughout the notebook.
show = True
show_overfitting = True
# NOTE(review): absolute Windows paths — only runs on this machine as-is.
vf_path_train = r"C:\Users\prebe\Google Drive\Unibocconi\Machine Learning\Vodafone Challenge\data_train.csv"
vf_path_test = r"C:\Users\prebe\Google Drive\Unibocconi\Machine Learning\Vodafone Challenge\data_test.csv"
# First CSV column is used as the row index.
vf_data = pd.read_csv(vf_path_train, index_col=0)
vf_data_test = pd.read_csv(vf_path_test, index_col=0)
# Quick look at dtypes and sizes of the loaded data.
if show:
    vf_data.info()
if show:
    n_obs, n_features = vf_data.shape
    # Two of the columns are targets ('Footfall', 'Footfall_classif'), hence -2.
    print(f'The number of train observations is {n_obs}; the number of features provided is {n_features - 2}')
    print('\n')
    n_obs_test, n_features_test = vf_data_test.shape
    print(f'The number of test observations is {n_obs_test}')
# Notebook display expression (no effect as a script statement).
vf_data.columns
def handle_missing_features(dataframe, show=True):
    """Mark zero 'Metratura_scaled' entries as missing (NaN) so they can be
    imputed later.

    Mutates `dataframe` in place and returns it.
    Replaces the original per-row loop with a vectorized boolean mask.

    dataframe: frame containing a 'Metratura_scaled' column.
    show: print how many zeros were found.
    """
    zero_mask = dataframe['Metratura_scaled'] == 0
    counter = int(zero_mask.sum())
    dataframe.loc[zero_mask, 'Metratura_scaled'] = np.nan
    if show:
        print(f'The column "Metratura Scaled" has {counter} missing values')
    return dataframe
# Convert the zero square-metre placeholders to NaN before any split/imputation.
vf_data = handle_missing_features(vf_data, show)
#vf_data.dropna(axis=0, inplace=True)
def na_median(X_train, X_test):
    """Impute missing 'Metratura_scaled' values with the TRAIN median in both
    train and test (train statistics only, so no test leakage).

    Fix: the original used `X.loc[:, col].fillna(..., inplace=True)`, a chained
    assignment that acts on a temporary and can leave the original frame
    untouched (the warning was globally suppressed). Assign the filled column
    back explicitly instead.
    """
    median = X_train['Metratura_scaled'].median()
    X_train['Metratura_scaled'] = X_train['Metratura_scaled'].fillna(median)
    X_test['Metratura_scaled'] = X_test['Metratura_scaled'].fillna(median)
    return X_train, X_test
def reset_features():
    """Return a fresh (features, regression target, classification target)
    triple taken from the module-level `vf_data`.

    Used repeatedly below to restart the feature pipeline from raw data.
    """
    X = vf_data.copy()
    X.drop(['Footfall', 'Footfall_classif'], axis = 1, inplace=True)
    yval = vf_data['Footfall'].copy()
    ylab = vf_data['Footfall_classif'].copy()
    return X, yval, ylab
X, yval, ylab = reset_features()
def get_columns(dataframe, type1='float64', type2='int64'):
    """Split the frame's columns by dtype.

    Returns (numerical_columns, categorical_columns): float64 columns are
    treated as numerical, int64 columns as (encoded) categoricals.
    """
    numerical = dataframe.select_dtypes(include=[type1]).columns.tolist()
    categorical = dataframe.select_dtypes(include=[type2]).columns.tolist()
    return numerical, categorical
# float64 columns -> numerical features, int64 columns -> encoded categoricals.
num, cat = get_columns(X)
if show:
    print(f"The numericals are: {num}\n\n The categoricals are {cat}")
def splitter(X, yval, ylab, n=0.1, seed=1):
    """Split features and BOTH targets into train/test in one call so the
    three stay aligned.

    n: test-set fraction (default 10%).
    seed: random_state for reproducibility.
    """
    X_train, X_test, yval_train, yval_test, ylab_train, ylab_test = train_test_split(X, yval, ylab, test_size=n,
                                                                                     random_state=seed)
    return X_train, X_test, yval_train, yval_test, ylab_train, ylab_test
X_train, X_test, yval_train, yval_test, ylab_train, ylab_test = splitter(X, yval, ylab)
X_train, X_test = na_median(X_train, X_test)
# Sanity checks: imputation must have removed every NaN/inf from the train set.
assert np.all(np.isfinite(X_train))
assert not(np.any(np.isnan(X_train)))
if show:
    X_train.info()
def random_normal(yval_train, yval_test):
    """MSE benchmark: predict by sampling from a normal distribution fitted to
    the training target (its mean and sample std)."""
    n = yval_test.shape[0]
    draws = np.random.normal(yval_train.mean(), yval_train.std(), n)
    return np.sum((draws - yval_test.values) ** 2) / n
def random_n(yval_test):
    """MSE benchmark: predict with uniform draws on [0, 1)."""
    n = yval_test.shape[0]
    draws = np.random.uniform(size=(1, n))
    return np.sum((draws - yval_test.values) ** 2) / n
# Baseline MSEs that any real model should beat.
if show:
    mse_random = random_n(yval_test)
    print('The random 0-1 benchmark is:', mse_random)
    mse_rand_normal = random_normal(yval_train, yval_test)
    print('The random normal benchmark is:', mse_rand_normal)
def model_test(X_train, yval_train, cvnum=10):
    """Cross-validated MSE of a fixed roster of untuned regressors.

    Returns {model_name: mean CV MSE}; `cvnum` is the number of CV folds.
    Refactored from seven copy-pasted stanzas into a single loop; the models,
    their hyper-parameters and the resulting dict order are unchanged.
    """
    models = [
        ('LinearRegression', LinearRegression()),
        ('Ridge', Ridge(random_state=0)),
        ('SVMRegressor', SVR(gamma='auto')),
        ('MLPRegressor', MLPRegressor(random_state=0, solver='sgd')),
        ('DecisionTreeRegressor', DecisionTreeRegressor(random_state=0)),
        # n_estimators=10 is the old sklearn default, set explicitly to avoid warnings
        ('RandomForestRegressor', RandomForestRegressor(random_state=0, n_estimators=10)),
        ('XGBRegressor', XGBRegressor(random_state=0, verbose=0)),
    ]
    scores = {}
    for name, model in models:
        # neg_mean_squared_error is negated back to a plain MSE
        mse_folds = - cross_val_score(model, X_train, yval_train,
                                      scoring='neg_mean_squared_error', cv=cvnum)
        scores[name] = mse_folds.mean()
    return scores
# CV MSE of the untuned models on the raw (imputed) features.
scores_untuned = model_test(X_train, yval_train)
if show:
    print(scores_untuned)
if show:
    # Distribution of the continuous target.
    sns.distplot(yval_train)
    plt.title('Yval_train distribution')
    plt.show()
    print("%s : Skewness: %f, Kurtosis: %f" % ('Footfall',yval_train.skew(), yval_train.kurt()))
if show:
    print(ylab_train.value_counts())
if show:
    # The class label looks like a binned version of the continuous target.
    plt.scatter(yval_train, ylab_train)
    plt.title("Footfall vs Footfall_Classif")
    plt.ylabel("Footfall_Classif")
    plt.xlabel("Footfall")
# Histograms of the per-column standard deviations and means of the numerical
# features. Fix: the second chart was titled "means" but plotted .std() again
# (copy-paste bug) — it now plots .mean().
if show:
    X_train[num].std().plot('hist')
    plt.title('Distribution of stds of all numerical columns')
    plt.show()
    X_train[num].mean().plot('hist')
    plt.title('Distribution of means of all numerical columns')
    plt.show()
# Extra-verbose per-column distribution plots, off by default.
show1 = False
if show1:
    for col in num:
        f, ax = plt.subplots(figsize = (8, 6))
        sns.distplot(X_train[col])
if show:
    print('Show the correlation of the numerical features with the yval', 2*'\n')
    print(X_train[num].corrwith(yval_train).sort_values())
def find_values(dataframe, cat):
    """Map each listed categorical column to the sorted list of its distinct
    values."""
    return {column: sorted(dataframe[column].unique()) for column in cat}
# Distinct values of each categorical column on the train set.
d = find_values(X_train, cat)
if show:
    for i in cat:
        print(f"The category {i} has {len(d[i])} different values")
def bar_chart_categoricals(dataframe, cat):
    """Bar-plot the value counts of each categorical column; columns with more
    than 15 distinct values only show their 14 most frequent ones."""
    d = find_values(dataframe, cat)
    for i in cat:
        values = dataframe[i].value_counts()
        if len(d[i]) > 15:
            values = values.iloc[:14,]
        values.plot.bar()
        plt.title(i)
        plt.show()
if show:
    bar_chart_categoricals(X_train, cat)
def redimension(dataframe, d, m=10, n=10, delete=True):
    """Shrink the per-column value lists in `d` (as built by find_values).

    Columns with more than `m` distinct values keep only the values that occur
    more than `n` times (ordered by frequency); binary columns drop their last
    value when `delete` is True (one dummy is enough for a 2-level category).
    Mutates and returns `d`.
    """
    wide_columns = []
    for column in d:
        if len(d[column]) > m:
            wide_columns.append(column)
        if delete and len(d[column]) == 2:
            # binary category: a single indicator carries all the information
            del d[column][-1]
    for column in wide_columns:
        counts = dataframe[column].value_counts()
        n_important = sum(counts > n)
        d[column] = list(counts[:n_important].index)
    return d
def dummier(dataframe, cat, m=10, n=10, drop=True, output_dic=True):
    """One-hot encode the categorical columns of the TRAIN set.

    Columns with more than `m` distinct values keep only values seen more than
    `n` times (see redimension); binary columns keep a single dummy. Returns
    the encoded frame, plus (when output_dic) the kept-value dictionary to be
    re-applied to the test set by dummier_test.
    NOTE(review): mutates the caller's `cat` list — 'SM si/no_encoded' is
    removed from it the first time this runs.
    """
    if 'SM si/no_encoded' in cat:
        cat.remove('SM si/no_encoded')
        # already a 0/1 flag: just cast to float instead of dummifying
        dataframe['SM si/no_encoded'] = 1.0 * dataframe['SM si/no_encoded']
    init_d = find_values(dataframe, cat)
    d = redimension(dataframe, init_d, m, n)
    for i in cat:
        values = d[i]
        # values.pop(-1) #TODO: ask the professor
        for number in values:
            # 0/1 indicator column named d_<column>_<value>
            dummy = 1.0 * (dataframe[i] == number)
            name = 'd_' + i + '_' + str(number)
            dummy.name = name
            dataframe = pd.concat([dataframe, dummy], axis=1, verify_integrity=True)
    if drop:
        dataframe.drop(cat, axis=1, inplace=True)
    if output_dic:
        return dataframe, d
    else:
        return dataframe
def dummier_test(test, d, drop=True):
    """One-hot encode the TEST set using the train-derived value dictionary
    `d` (column -> kept values), producing exactly the same dummy columns as
    dummier did on the train set.

    Fix: the columns to drop are now the keys of `d` rather than the
    module-level `cat` list the original silently depended on (the two only
    coincided because dummier mutated `cat` first).
    """
    for column in d:
        for value in d[column]:
            dummy = 1.0 * (test[column] == value)
            name = 'd_' + column + '_' + str(value)
            dummy.name = name
            test = pd.concat([test, dummy], axis=1, verify_integrity=True)
    if drop:
        test = test.drop(list(d), axis=1)
    return test
def dummier_tot(train, test, cat, m=10, n=10, drop=True):
    """Dummy-encode train and test consistently: fit the kept-value dictionary
    on the train set, then replay it on the test set."""
    encoded_train, kept_values = dummier(train, cat, m, n, drop)
    encoded_test = dummier_test(test, kept_values, drop)
    return encoded_train, encoded_test
X_train, X_test = dummier_tot(X_train, X_test, cat)
if show:
    print(X_train.shape, X_test.shape)
if show:
    print(X_train.columns, 2*'\n', X_test.columns)
# Restart from raw features and redo the encoding keeping EVERY category value
# (n=0 disables the frequency cut).
X, yval, ylab = reset_features()
X_train, X_test, yval_train, yval_test, ylab_train, ylab_test = splitter(X, yval, ylab)
X_train, X_test = na_median(X_train, X_test)
X_train, X_test = dummier_tot(X_train, X_test, cat, n=0)
def list_cat(dataframe, category):
    """Return every column name that contains `category` as a substring."""
    return [column for column in dataframe.columns if category in column]
def apply_PCA_cat(X_train, X_test, category, n):
    """Replace all dummy columns whose name contains `category` with their
    first `n` principal components, min-max scaled to [0, 1].

    PCA and the scaler are fitted on the train set only and re-applied to the
    test set, so there is no train/test leakage. The new columns are named
    PCA_<category>_1 .. PCA_<category>_n.
    """
    columns = list_cat(X_train, category)
    pca = PCA(n_components = n)
    X_train_pca = pca.fit_transform(X_train[columns])
    X_test_pca = pca.transform(X_test[columns])
    min_max = MinMaxScaler()
    X_train_pca = min_max.fit_transform(X_train_pca)
    X_test_pca = min_max.transform(X_test_pca)
    # wrap the numpy outputs back into frames aligned with the original indexes
    X_train_pca = pd.DataFrame(X_train_pca, columns=['PCA_' + category + '_' + str(i+1) for i in range(X_train_pca.shape[1])],
                               index=X_train.index)
    X_test_pca = pd.DataFrame(X_test_pca, columns=['PCA_' + category + '_' + str(i+1) for i in range(X_test_pca.shape[1])],
                              index=X_test.index)
    X_train.drop(columns, axis=1, inplace=True)
    X_test.drop(columns, axis=1, inplace=True)
    X_train = pd.concat([X_train, X_train_pca], verify_integrity=True, axis=1)
    X_test = pd.concat([X_test, X_test_pca], verify_integrity=True, axis=1)
    return X_train, X_test
# Compress the two high-cardinality location dummies into 3 components each.
X_train, X_test = apply_PCA_cat(X_train, X_test, 'Localita_encoded', 3)
X_train, X_test = apply_PCA_cat(X_train, X_test, 'Provincia_encoded', 3)
# Notebook display expression.
X_train.shape, X_test.shape
if show:
    X_corr, yval_corr, ylab_corr = X_train.copy(), yval_train.copy(), ylab_train.copy()
    yval_corr = sp.special.log1p(yval_corr)
    corr_data = pd.concat([yval_corr, ylab_corr, X_corr], verify_integrity=True, axis=1)
    #Taking the abs shows the negative relation also
    corr = abs (corr_data.corr(method='spearman'))
    # 20 features most (anti-)correlated with the target
    corrcol = corr.nlargest(n=20, columns='Footfall').index
    # spearman coefficient matrix
    # NOTE(review): sp.array is the deprecated scipy alias of np.array —
    # removed in recent scipy versions; confirm the installed version.
    cor = sp.array(sp.stats.spearmanr(corr_data[corrcol].values))[0]
    print(corrcol.values)
    plt.figure(figsize=(20,20))
    sns.set(font_scale=1.50)
    sns.heatmap(cor, fmt='.2f', annot=True, square=True, annot_kws={'size' : 13}, xticklabels=corrcol.values,
                yticklabels=corrcol.values)
    plt.show()
# Column groups used by the feature engineering below.
employees = ['FTE_scaled', 'PTE_scaled']
QAC = ['QAC_Oct_scaled', 'QAC_Nov_scaled', 'QAC_Dec_scaled', 'QAC_Jan_scaled', 'QAC_Feb_scaled', 'QAC_Mar_scaled']
TNPS = ['TNPS_Oct_scaled', 'TNPS_Nov_scaled', 'TNPS_Dec_scaled', 'TNPS_Jan_scaled', 'TNPS_Feb_scaled']
month = {'Oct':['QAC_Oct_scaled', 'TNPS_Oct_scaled'], 'Nov':['QAC_Nov_scaled', 'TNPS_Nov_scaled'],
         'Dec':['QAC_Dec_scaled', 'TNPS_Dec_scaled'], 'Jan':['QAC_Jan_scaled', 'TNPS_Jan_scaled'],
         'Feb':['QAC_Feb_scaled', 'TNPS_Feb_scaled'], 'Mar':['QAC_Mar_scaled']}
# Pairwise scatter matrices of the employee, QAC and TNPS column groups.
if show:
    scatter_matrix(X_train[employees])
    plt.show()
# Notebook display expression.
X_train[employees].corr()
if show:
    sm = scatter_matrix(X_train[QAC])
    # strip ticks/labels so the large matrix stays readable
    for subaxis in sm:
        for ax in subaxis:
            ax.xaxis.set_ticks([])
            ax.yaxis.set_ticks([])
            ax.set_ylabel("")
            ax.set_xlabel("")
X_train[QAC].corr()
if show:
    sm = scatter_matrix(X_train[TNPS])
    for subaxis in sm:
        for ax in subaxis:
            ax.xaxis.set_ticks([])
            ax.yaxis.set_ticks([])
            ax.set_ylabel("")
            ax.set_xlabel("")
X_train[TNPS].corr()
if show:
    # the sqrt transform de-skews the QAC columns
    sm = scatter_matrix(np.sqrt(X_train[QAC]))
    for subaxis in sm:
        for ax in subaxis:
            ax.xaxis.set_ticks([])
            ax.yaxis.set_ticks([])
            ax.set_ylabel("")
            ax.set_xlabel("")
# Add sqrt-transformed, min-max scaled copies of the QAC columns.
QAC_sqrt = [i + '_sqrt' for i in QAC]
X_train[QAC_sqrt] = np.sqrt(X_train[QAC])
X_test[QAC_sqrt] = np.sqrt(X_test[QAC])
for i in QAC_sqrt:
    X_train[i] = (X_train[i]-X_train[i].min())/(X_train[i].max()-X_train[i].min())
    # NOTE(review): the test set is scaled with its OWN min/max here, unlike
    # transformer/apply_PCA_cat which reuse train statistics — mild leakage /
    # inconsistency to confirm.
    X_test[i] = (X_test[i]-X_test[i].min())/(X_test[i].max()-X_test[i].min())
if show:
    print('Improvement: \n')
    print(X_train[QAC].corrwith(yval_train) - np.sqrt(X_train[QAC]).corrwith(yval_train))
X_train[TNPS].corrwith(yval_train)
def avg_QAC_TNPS(dataframe, working_days=False, sqrt=True):
    """Collapse the monthly QAC and TNPS columns into one day-weighted average
    each.

    working_days: weight by working days per month instead of calendar days.
    sqrt: read the sqrt-transformed QAC columns ('..._sqrt') and name the
    result 'QAC_sqrt_avg' instead of 'QAC_avg'. TNPS has no March column, so
    its average covers Oct-Feb only.
    Returns a two-column frame aligned with `dataframe`'s index.
    """
    months = ['Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
    days = {'Oct': 31, 'Nov': 30, 'Dec': 31, 'Jan': 31, 'Feb': 28, 'Mar': 31}
    if working_days:
        days = {'Oct': 27, 'Nov': 25, 'Dec': 23, 'Jan': 26, 'Feb': 24, 'Mar': 26}
    qac_cols = ['QAC_' + mth + '_scaled' for mth in months]
    if sqrt:
        qac_cols = [col + '_sqrt' for col in qac_cols]
    tnps_cols = ['TNPS_' + mth + '_scaled' for mth in months[:-1]]
    avg = np.zeros((2, dataframe.shape[0]))
    total_days = 0
    for mth, col in zip(months, qac_cols):
        weight = days[mth]
        avg[0, :] += weight * dataframe[col].values
        total_days += weight
    avg[0, :] *= 1 / total_days
    total_days = 0
    # zip stops at the 5 TNPS columns, pairing them with Oct..Feb
    for mth, col in zip(months, tnps_cols):
        weight = days[mth]
        avg[1, :] += weight * dataframe[col].values
        total_days += weight
    avg[1, :] *= 1 / total_days
    qac_name = 'QAC_sqrt_avg' if sqrt else 'QAC_avg'
    return pd.DataFrame(avg.T, columns=[qac_name, 'TNPS_avg'], index=dataframe.index)
def apply_avg_QAC_TNPS(dataframe, working_days=False, sqrt=True, drop=True):
    """Append the day-weighted QAC/TNPS averages (see avg_QAC_TNPS) and, when
    `drop` is True, remove the monthly source columns they replace."""
    averages = avg_QAC_TNPS(dataframe, working_days, sqrt)
    month_names = ['Oct', 'Nov', 'Dec', 'Jan', 'Feb', 'Mar']
    qac_cols = ['QAC_' + mth + '_scaled' for mth in month_names]
    if sqrt:
        qac_cols = [col + '_sqrt' for col in qac_cols]
    # TNPS has no March column
    tnps_cols = ['TNPS_' + mth + '_scaled' for mth in month_names[:-1]]
    if drop:
        dataframe.drop(qac_cols + tnps_cols, axis=1, inplace=True)
    dataframe = pd.concat([dataframe, averages], axis=1, verify_integrity=True)
    return dataframe
# Replace the monthly QAC/TNPS columns with their weighted averages.
X_train = apply_avg_QAC_TNPS(X_train)
X_test = apply_avg_QAC_TNPS(X_test)
if show:
    plt.hist(X_train['QAC_sqrt_avg'], bins=50)
    plt.title('QAC_sqrt_avg')
    plt.show()
    plt.hist(X_train['TNPS_avg'], bins=50)
    plt.title('TNPS_avg')
    plt.show()
if show:
    print(X_train['QAC_sqrt_avg'].corr(yval_train))
def FTE_PTE(dataframe, weight=0.5):
    """Combine full-time and (down-weighted) part-time employee counts into a
    single staffing feature: FTE + weight * PTE."""
    return dataframe['FTE_scaled'] + weight * dataframe['PTE_scaled']
def find_weight_FTE_PTE(X, y, n_iters = 21):
    """Grid-search the PTE weight in [0, 1] (n_iters evenly spaced candidates)
    that maximises corr(FTE + weight * PTE, y).

    Fixes: `weight` was never initialised, so the function raised
    UnboundLocalError whenever no candidate achieved a strictly positive
    correlation — it now falls back to 0. The unused `d` dict was removed.
    """
    weight = 0.0
    max_corr = 0
    for candidate in np.linspace(0, 1, n_iters):
        corr = FTE_PTE(X, candidate).corr(y)
        if corr > max_corr:
            max_corr = corr
            weight = candidate
    return weight
# Best PTE weight on the training set; reused below for column naming.
weight = find_weight_FTE_PTE(X_train, yval_train)
def apply_FTE_PTE(X_train, X_test, y_train, n_iters=21, weight=None, drop=False):
    """Append the combined staffing feature 'FTE_PTE_<weight>' to train and
    test, tuning the weight on the train target when not supplied.

    drop: also remove the original FTE/PTE columns.
    Fix: the None check now uses `is None` instead of `== None` (identity, not
    equality — `==` can misfire on objects with custom comparisons).
    """
    if weight is None:
        weight = find_weight_FTE_PTE(X_train, y_train, n_iters)
    fte_pte_train = FTE_PTE(X_train, weight)
    fte_pte_train.name = 'FTE_PTE_' + str(weight)
    if drop:
        X_train = X_train.drop(['FTE_scaled', 'PTE_scaled'], axis=1)
    X_train = pd.concat([X_train, fte_pte_train], verify_integrity=True, axis=1)
    fte_pte_test = FTE_PTE(X_test, weight)
    fte_pte_test.name = 'FTE_PTE_' + str(weight)
    if drop:
        X_test = X_test.drop(['FTE_scaled', 'PTE_scaled'], axis=1)
    X_test = pd.concat([X_test, fte_pte_test], verify_integrity=True, axis=1)
    return X_train, X_test
X_train, X_test = apply_FTE_PTE(X_train, X_test, yval_train, n_iters=21)
if show:
    # relies on `weight` computed above matching the one chosen inside apply_FTE_PTE
    print(X_train['FTE_PTE_' + str(weight)].corr(yval_train))
def transformer(X_train, X_test, weight, show=True):
    """Add two engineered features to train and test, min-max scaled with
    TRAIN statistics (so no test leakage): squared FTE and the combined
    staffing feature divided by shop size.

    NOTE(review): the printed correlations read the module-level `yval_train`,
    not a parameter — only valid while that global matches X_train.
    """
    a = (X_train['FTE_scaled'])**2
    a_test = (X_test['FTE_scaled'])**2
    X_train['FTE_scaled_squared'] = (a - min(a)) / (max(a) - min(a))
    # test scaled with the TRAIN min/max on purpose
    X_test['FTE_scaled_squared'] = (a_test - min(a)) / (max(a) - min(a))
    if show:
        print('Correlation of FTE_scaled squared with yval is:', (a).corr(yval_train))
    # +1 in the denominator guards against zero square metres
    b = (X_train['FTE_PTE_' + str(weight)]) / (X_train['Metratura_scaled'] + 1)
    b_test = (X_test['FTE_PTE_' + str(weight)]) / (X_test['Metratura_scaled'] + 1)
    X_train['Employee_over_Metratura'] = (b - min(b)) / (max(b) - min(b))
    X_test['Employee_over_Metratura'] = (b_test - min(b)) / (max(b) - min(b))
    if show:
        print('Correlation of Employees over Metratura with yval is:', (b).corr(yval_train))
    return X_train, X_test
# Add the engineered FTE^2 and employees-per-square-metre features.
X_train, X_test = transformer(X_train, X_test, weight, show)
def apply_PCA_col(X_train, X_test, col, n):
    """Replace an explicit list of columns with their first `n` principal
    components, min-max scaled to [0, 1].

    col: (column_list, output_name) — new columns are PCA_<output_name>_1..n.
    Same logic as apply_PCA_cat, but with explicit columns instead of a
    substring match; PCA and the scaler are fitted on train only.
    """
    columns, name = col
    pca = PCA(n_components = n)
    X_train_pca = pca.fit_transform(X_train[columns])
    X_test_pca = pca.transform(X_test[columns])
    min_max = MinMaxScaler()
    X_train_pca = min_max.fit_transform(X_train_pca)
    X_test_pca = min_max.transform(X_test_pca)
    X_train_pca = pd.DataFrame(X_train_pca, columns=['PCA_' + name + '_' + str(i+1) for i in range(X_train_pca.shape[1])],
                               index=X_train.index)
    X_test_pca = pd.DataFrame(X_test_pca, columns=['PCA_' + name + '_' + str(i+1) for i in range(X_test_pca.shape[1])],
                              index=X_test.index)
    X_train.drop(columns, axis=1, inplace=True)
    X_test.drop(columns, axis=1, inplace=True)
    X_train = pd.concat([X_train, X_train_pca], verify_integrity=True, axis=1)
    X_test = pd.concat([X_test, X_test_pca], verify_integrity=True, axis=1)
    return X_train, X_test
def apply_pca_col_tot(X_train, X_test):
    """Collapse each (MAX, MEAN, MIN) ARPU triple into a single principal
    component, for every time-of-week period and radius.

    Refactored from nine copy-pasted apply_PCA_col calls into a loop over the
    3 periods x 3 radii; the column groups, output names and call order
    (night/midday/weekend, each 1000/500/200) are unchanged.
    """
    for period in ('night', 'midday', 'weekend'):
        for radius in (1000, 500, 200):
            group = [f'{period}_{stat}_scaled_ARPU_{radius}' for stat in ('MAX', 'MEAN', 'MIN')]
            X_train, X_test = apply_PCA_col(X_train, X_test,
                                            col=(group, f'{period}_scaled_ARPU_{radius}'), n=1)
    return X_train, X_test
# Compress the 9 ARPU (MAX/MEAN/MIN) triples into single components.
X_train, X_test = apply_pca_col_tot(X_train, X_test)
def data_preparer(get_data=False):
    """Run the full feature pipeline: imputation, sqrt+minmax QAC transform,
    monthly QAC/TNPS averaging, FTE/PTE combination, engineered features and
    per-group PCA.

    get_data=False: split the module-level data and return the 6-tuple
    (X_train, X_test, yval_train, yval_test, ylab_train, ylab_test).
    get_data=((X, yval, ylab), test_frame): prepare the supplied final
    train/test pair and return (X_train, yval_train, ylab_train, X_test).
    Relies on the module-level QAC / QAC_sqrt lists and the helpers above.
    """
    if not(get_data):
        X, yval, ylab = reset_features()
        X_train, X_test, yval_train, yval_test, ylab_train, ylab_test = splitter(X, yval, ylab)
    else:
        train, X_test = get_data
        X_train, yval_train, ylab_train = train
    X_train, X_test = na_median(X_train, X_test)
    X_train[QAC_sqrt] = np.sqrt(X_train[QAC])
    X_test[QAC_sqrt] = np.sqrt(X_test[QAC])
    for i in QAC_sqrt:
        X_train[i] = (X_train[i]-X_train[i].min())/(X_train[i].max()-X_train[i].min())
        # NOTE(review): test scaled with its own min/max — same mild leakage
        # as in the exploratory cell above.
        X_test[i] = (X_test[i]-X_test[i].min())/(X_test[i].max()-X_test[i].min())
    X_train = apply_avg_QAC_TNPS(X_train)
    X_test = apply_avg_QAC_TNPS(X_test)
    weight = find_weight_FTE_PTE(X_train, yval_train)
    X_train, X_test = apply_FTE_PTE(X_train, X_test, yval_train, weight=weight)
    X_train, X_test = transformer(X_train, X_test, weight, show=False)
    X_train, X_test = apply_pca_col_tot(X_train, X_test)
    # alignment checks: features and both targets must share the same index
    assert np.all(X_train.index == yval_train.index) and np.all(yval_train.index == ylab_train.index)
    assert np.all(X_train.columns == X_test.columns)
    if not(get_data):
        assert np.all(X_test.index == yval_test.index) and np.all(yval_test.index == ylab_test.index)
        return X_train, X_test, yval_train, yval_test, ylab_train, ylab_test
    else:
        return X_train, yval_train, ylab_train, X_test
# Re-run the models on the numerically transformed features.
X_train, X_test, yval_train, yval_test, ylab_train, ylab_test = data_preparer()
scores_num_transf = model_test(X_train, yval_train)
if show1:
    print('The untuned scores were:', scores_untuned)
    print('\n')
    print('The new scores are:', scores_num_transf)
# Add the full set of dummies (no frequency cut) on top.
X_train, X_test = dummier_tot(X_train, X_test, cat, n=0)
scores_dummytot = model_test(X_train, yval_train)
if show1:
    print('The untuned scores were:', scores_untuned)
    print('\n')
    print('The numerical transformed scores are:', scores_num_transf)
    print('\n')
    print('The new scores are:', scores_dummytot)
# Feature importances from a quick random forest fit.
rnd_for_reg = RandomForestRegressor(n_estimators=100)
rnd_for_reg.fit(X_train, yval_train)
if show:
    print('The features that are at least 1% are showed below', 2*'\n')
    for name, score in zip(X_train.columns, rnd_for_reg.feature_importances_):
        if score > 0.01:
            print('The feature', name, 'has a score of:', 100*score, '%')
# Compress the location dummies with PCA and re-score.
X_train, X_test = apply_PCA_cat(X_train, X_test, 'Localita_encoded', 3)
X_train, X_test = apply_PCA_cat(X_train, X_test, 'Provincia_encoded', 3)
scores_dummypca = model_test(X_train, yval_train)
if show1:
    print('The untuned scores were:', scores_untuned)
    print('\n')
    print('The numerical transformed scores are:', scores_num_transf)
    print('\n')
    print('The total dummied categories are:', scores_dummytot)
    print('\n')
    print('The new scores are:', scores_dummypca)
#we first reset all the values
X, yval, ylab = reset_features()
X_train, X_test, yval_train, yval_test, ylab_train, ylab_test = splitter(X, yval, ylab)
X_train, X_test = na_median(X_train, X_test)
#then we apply apply_avg_QAC_TNPS and apply_FTE_PTE
X_train[QAC_sqrt] = np.sqrt(X_train[QAC])
X_test[QAC_sqrt] = np.sqrt(X_test[QAC])
for i in QAC_sqrt:
    X_train[i] = (X_train[i]-X_train[i].min())/(X_train[i].max()-X_train[i].min())
    X_test[i] = (X_test[i]-X_test[i].min())/(X_test[i].max()-X_test[i].min())
X_train = apply_avg_QAC_TNPS(X_train)
X_test = apply_avg_QAC_TNPS(X_test)
X_train, X_test = apply_FTE_PTE(X_train, X_test, yval_train)
# dummies with the default frequency cut (m=10, n=10) this time
X_train, X_test = dummier_tot(X_train, X_test, cat)
scores_dummyrest = model_test(X_train, yval_train)
if show:
    print('The untuned scores were:', scores_untuned)
    print('\n')
    print('The numerical transformed scores were:', scores_num_transf)
    print('\n')
    print('The total dummied categories scorres were:', scores_dummytot)
    print('\n')
    print('The dummies with PCA applied scores were:', scores_dummypca)
    print('\n')
    print('The new scores are:', scores_dummyrest)
def PCA_plot(dataframe, var_optimal=.95, show=True, show1=False):
    """Plot the cumulative explained-variance curve of a full PCA and return
    the number of components needed to reach `var_optimal`.

    Fixes: the cumulative curve was built with range(1, n-1), silently
    dropping the last two components and raising IndexError whenever the
    threshold was only reached there — replaced with np.cumsum over all n
    components. The threshold line's x-range was hard-coded to 74; it now
    spans the actual component count.
    """
    n = dataframe.shape[1]
    if show1:
        print('The number of starting variables is:', n, 2*'\n')
    pca_plot = PCA(n_components=n)
    pca_plot.fit(dataframe)
    expl_var = pca_plot.explained_variance_ratio_
    if show1:
        print(expl_var)
    expl_var = np.cumsum(expl_var)
    if show:
        plt.plot(expl_var)
    # first index where the cumulative variance crosses the threshold (1-based)
    n_needed = np.argwhere(expl_var > var_optimal)[0, 0] + 1
    if show:
        plt.plot([0, n], [var_optimal, var_optimal], c='red')
        plt.show()
    if show:
        print('Variables needed for ',str(int(var_optimal*100)),'%: ', n_needed)
    return n_needed
# Components needed to keep 95% of the variance.
n = PCA_plot(X_train, .95, show, show1)
def apply_PCA_train_test(X_train, X_test, n=.95):
    """Project both sets onto `n` principal components (or enough components
    to explain an `n` fraction of variance when n < 1), fitted on train only.

    Returns frames with columns PCA_1..PCA_k and the original row indexes.
    """
    pca = PCA(n_components=n)
    # keep the original indexes: pca.transform returns bare numpy arrays
    train_ind, test_ind = X_train.index, X_test.index
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    cols = ['PCA_' + str(i) for i in range(1, X_train.shape[1] + 1)]
    X_train = pd.DataFrame(X_train, columns=cols, index=train_ind)
    X_test = pd.DataFrame(X_test, columns=cols, index=test_ind)
    return X_train, X_test
# Optional whole-dataset PCA branch, disabled by default.
applyPCA = False
if applyPCA:
    X_train, X_test = apply_PCA_train_test(X_train, X_test, n)
if show and applyPCA:
    print(X_train.shape, X_test.shape)
    # components should be uncorrelated — the heatmap is a visual check
    c = pd.DataFrame(X_train).corr()
    sns.heatmap(c)
if applyPCA:
    scores_completepca = model_test(X_train, yval_train)
if applyPCA:
    print('The untuned scores were:', scores_untuned)
    print('\n')
    print('The numerical transformed scores were:', scores_num_transf)
    print('\n')
    print('The total dummied categories were:', scores_dummytot)
    print('\n')
    print('The dummies with pca scores were:', scores_dummypca)
    print('\n')
    print('The new scores are:', scores_completepca)
def plot_learning_curves(model, X, y, step=5, show_overfitting=True):
    """Plot training and validation MSE as the training set grows `step` rows
    at a time (standard learning-curve diagnostic for over/under-fitting).

    NOTE(review): `show_overfitting` is accepted but never used in the body —
    presumably meant to gate the plotting; confirm intent before removing.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
    train_errors, val_errors = [], []
    for m in range(1, len(X_train), step):
        # fit on the first m rows only
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        # mean_squared_error is symmetric, so the swapped (pred, true) order is harmless
        train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))
        val_errors.append(mean_squared_error(y_val_predict, y_val))
    plt.plot(train_errors, "r-+", linewidth=2, label="Training set")
    plt.plot(val_errors, "b-", linewidth=2, label="Validation set")
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    print('The last MSE is:', val_errors[-1])
# --- Ridge: baseline curve, then grid-search alpha ---
ridge = Ridge()
plot_learning_curves(ridge, X_train, yval_train, show_overfitting=show_overfitting)
ridge = Ridge()
param_grid = [{'alpha': [i/10 for i in range(1, 50, 2)]}]
# NOTE(review): the iid= parameter was deprecated and then removed in
# scikit-learn 0.24 — this code targets an older sklearn; confirm the version.
grid_search_ridge = GridSearchCV(ridge, param_grid, iid=False, cv=5, scoring='neg_mean_squared_error', return_train_score=False)
grid_search_ridge.fit(X_train, yval_train)
param_ridge = grid_search_ridge.best_params_
ridge_alpha = param_ridge['alpha']
print(param_ridge)
print('\n')
print('The best MSE score obtained is:', -grid_search_ridge.best_score_)
ridge_opt = Ridge(alpha = ridge_alpha)
plot_learning_curves(ridge_opt, X_train, yval_train, show_overfitting=show_overfitting)
# --- Polynomial ridge: degree-2 features, then joint degree/alpha search ---
# NOTE(review): Pipeline((...)) with a tuple of steps only works in older
# sklearn; newer versions require a list.
polynomial_ridge = Pipeline((('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
                             ('poly_ridge', Ridge())))
plot_learning_curves(polynomial_ridge, X_train, yval_train, show_overfitting=show_overfitting)
polyfeatures = PolynomialFeatures(include_bias=False)
polyridge = Ridge()
polynomial_ridge = Pipeline(steps=[('polyfeatures', polyfeatures), ('polyridge', polyridge)])
param_grid = [{'polyfeatures__degree': [1, 2], 'polyridge__alpha': [30, 35, 40, 45, 50]}]
grid_search_ridge = GridSearchCV(polynomial_ridge, param_grid, iid=False, cv=5, scoring='neg_mean_squared_error',
                                 return_train_score=False)
grid_search_ridge.fit(X_train, yval_train)
param_polynomial_ridge = grid_search_ridge.best_params_
ridge_poly_degrees, ridge_aplha = param_polynomial_ridge['polyfeatures__degree'], param_polynomial_ridge['polyridge__alpha']
print(param_polynomial_ridge)
print('\n')
print('The best MSE score obtained is:', -grid_search_ridge.best_score_)
polynomial_ridge_opt = Pipeline((('poly_features', PolynomialFeatures(degree=ridge_poly_degrees, include_bias=False)),
                                 ('poly_ridge', Ridge(alpha=ridge_aplha))))
plot_learning_curves(polynomial_ridge_opt, X_train, yval_train, show_overfitting=show_overfitting)
# --- MLP: baseline, hand-tuned, then grid-search alpha / layer size ---
mlp = MLPRegressor(random_state=0)
plot_learning_curves(mlp, X_train, yval_train, show_overfitting=show_overfitting)
mlp = MLPRegressor(hidden_layer_sizes=(50,), max_iter=1000, alpha=1, verbose=False, tol=1e-4,
                   random_state=0, learning_rate_init = 0.001)
plot_learning_curves(mlp, X_train, yval_train, show_overfitting=show_overfitting)
mlp = MLPRegressor(random_state=0, verbose=0, max_iter=10000, learning_rate_init=1e-03, solver='lbfgs')
param_grid_mlp = [{'alpha': [10, 5, 1, .75, .6, .5, .1], 'hidden_layer_sizes': [(25, ), (50, ), (75, ), (100, )]}]
grid_search_mlp = GridSearchCV(mlp, param_grid_mlp, iid=False, cv=5, scoring='neg_mean_squared_error',
                               return_train_score=False)
grid_search_mlp.fit(X_train, yval_train)
mlp_param = grid_search_mlp.best_params_
mlp_alpha, mlp_hidden_layer_sizes = mlp_param['alpha'], mlp_param['hidden_layer_sizes']
print(mlp_param)
print('\n')
print('The best MSE score obtained is:', -grid_search_mlp.best_score_)
mlp_opt = MLPRegressor(hidden_layer_sizes=mlp_hidden_layer_sizes, max_iter=10000, alpha=mlp_alpha, verbose=False,
                       random_state=0, learning_rate_init=1e-03)
plot_learning_curves(mlp_opt, X_train, yval_train, show_overfitting=show_overfitting)
# --- Random forest: baseline, then grid-search the tree-shape parameters ---
rnd_for_reg = RandomForestRegressor(n_estimators=50, random_state=0) #just the default value, set to avoid warnings
plot_learning_curves(rnd_for_reg, X_train, yval_train, show_overfitting=show_overfitting)
rnd_for_reg = RandomForestRegressor(n_estimators=30, random_state=0)
max_n = X_train.shape[1]
step = (max_n - 15) // 2
# three evenly spaced max_features candidates from 15 up to all features
max_features_list = [i for i in range(15, max_n, step)] + [max_n]
param_grid_rndfor = [{'max_depth': [10, 15, 20, None],
                      'min_samples_leaf': [1, 2], 'min_samples_split': [10, 12, 15],
                      'max_leaf_nodes': [200, None], 'max_features': max_features_list}]
grid_search_rnd_for = GridSearchCV(rnd_for_reg, param_grid_rndfor, iid=False, cv=5, scoring='neg_mean_squared_error',
                                   return_train_score=False)
grid_search_rnd_for.fit(X_train, yval_train)
rndfor_reg = grid_search_rnd_for.best_params_
rnd_max_depth, rnd_min_samples_leaf = rndfor_reg['max_depth'], rndfor_reg['min_samples_leaf']
rnd_min_samples_split, rnd_max_leaf_nodes = rndfor_reg['min_samples_split'], rndfor_reg['max_leaf_nodes']
rnd_max_features = rndfor_reg['max_features']
print(rndfor_reg)
print('\n')
print('The best MSE score obtained is:', -grid_search_rnd_for.best_score_)
rnd_for_reg_opt = RandomForestRegressor(n_estimators=500, random_state=0, max_depth=rnd_max_depth,
                                        min_samples_leaf=rnd_min_samples_leaf, min_samples_split=rnd_min_samples_split,
                                        max_features=rnd_max_features, max_leaf_nodes=rnd_max_leaf_nodes)
plot_learning_curves(rnd_for_reg_opt, X_train, yval_train, show_overfitting=show_overfitting)
# --- XGBoost: baseline, hand-regularized, then grid search ---
xgb_reg = XGBRegressor(random_state=0, verbose=0)
plot_learning_curves(xgb_reg, X_train, yval_train, show_overfitting=show_overfitting)
# Fix: `labmda=1000` was a typo that XGBoost silently ignored as an unknown
# kwarg — reg_lambda is the actual L2-regularization parameter.
xgb_reg = XGBRegressor(random_state=0, verbose=0, reg_lambda=1000, max_depth=2, gamma=0.01)
plot_learning_curves(xgb_reg, X_train, yval_train)
# Fix: `learning_rate_init` is an MLPRegressor parameter; XGBoost's name is
# `learning_rate` — the original value was silently dropped too.
xgbreg = XGBRegressor(random_state=0, verbose=0, learning_rate=0.2, subsample=0.8, colsample_bytree=0.8)
param_grid = [{'max_depth': [3, 4], 'reg_lambda': [25, 50, 75, 100, 1000], 'gamma': [0, 0.001, 0.005, 0.01],
               'min_child_weight': [1, 2]}]
grid_search_xgbreg = GridSearchCV(xgbreg, param_grid, iid=False, cv=5, scoring='neg_mean_squared_error',
                                  return_train_score=False)
grid_search_xgbreg.fit(X_train, yval_train)
xgbreg_param = grid_search_xgbreg.best_params_
xgb_max_depth, xgb_lambda, xbg_gamma = xgbreg_param['max_depth'], xgbreg_param['reg_lambda'], xgbreg_param['gamma']
xgb_min_child_weight = xgbreg_param['min_child_weight']
print(xgbreg_param)
print('\n')
print('The best MSE score obtained is:', -grid_search_xgbreg.best_score_)
xgb_reg_opt = XGBRegressor(random_state=0, verbose=0, learning_rate=0.05, max_depth=xgb_max_depth, reg_lambda=xgb_lambda,
                           gamma=xbg_gamma, min_child_weight=xgb_min_child_weight, subsample=0.8, colsample_bytree=0.8)
plot_learning_curves(xgb_reg_opt, X_train, yval_train, show_overfitting=show_overfitting)
# Roster of the tuned models compared below.
models = [('Ridge', ridge_opt), ('PolynomialRidge', polynomial_ridge), ('Mlp', mlp_opt),
          ('RandomForestRegressor', rnd_for_reg_opt), ('XGBRegressor', xgb_reg_opt)]
if show1:
    for i in models:
        print(i[0], ':', i[1])
def model_eval(models, X_train, yval_train, X_test, yval_test, ordered=True):
    """Fit every (name, estimator) pair, plot its test predictions against the
    true values (optionally sorted by truth for readability), and return
    {name: test MSE}."""
    d = {}
    yval_test_vals = yval_test.values
    order = [i for i in range(len(yval_test_vals))]
    if ordered:
        order = np.argsort(yval_test_vals)
    plt.plot(yval_test_vals[order], c='red', label='original')
    # one distinct colour per model from the jet colormap
    jet= plt.get_cmap('jet')
    colors = iter(jet(np.linspace(0,1,10)))
    for i in models:
        i[1].fit(X_train, yval_train)
        yvals_pred = i[1].predict(X_test)
        plt.plot(yvals_pred[order], color=next(colors), label=i[0])
        d[i[0]] = mean_squared_error(yval_test_vals, yvals_pred)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.show()
    return d
d = model_eval(models, X_train, yval_train, X_test, yval_test, ordered=False)
print(d)
# Best-performing subset kept for the ensemble.
selected_models = [('Ridge', ridge_opt), ('Mlp', mlp_opt), ('RandomForestRegressor', rnd_for_reg_opt)]
def ensemble(selected_models, X_train, yval_train, X_test, yval_test, ordered=True):
    """Fit each selected model, plot its predictions, then score and plot the
    unweighted average of all model predictions.

    Returns a dict mapping model name (plus 'Average') -> test MSE.

    Bug fix: the original averaged `yvals_pred[:size[1], :]`, i.e. it sliced
    by the number of *test samples* instead of the number of models, and then
    stored the average in the last model's row. It only produced the right
    number because numpy clamps over-long slices; the average is now computed
    explicitly over the model axis into its own array.
    """
    d_mse = {}
    yval_true = yval_test.values
    n_models, n_obs = len(selected_models), yval_true.shape[0]
    yvals_pred = np.zeros((n_models, n_obs))
    order = np.argsort(yval_true) if ordered else list(range(n_obs))
    plt.plot(yval_true[order], c='red', label='original')
    jet = plt.get_cmap('jet')
    colors = iter(jet(np.linspace(0, 1, 10)))
    for n, (name, model) in enumerate(selected_models):
        model.fit(X_train, yval_train)
        yvals_pred[n, :] = model.predict(X_test)
        plt.plot(yvals_pred[n, :][order], color=next(colors), label=name)
        d_mse[name] = mean_squared_error(yval_true, yvals_pred[n, :])
    # Simple unweighted averaging ensemble over all selected models.
    avg_pred = yvals_pred.mean(axis=0)
    d_mse['Average'] = mean_squared_error(yval_true, avg_pred)
    plt.plot(avg_pred[order], color=next(colors), label="Average")
    plt.legend()
    plt.show()
    return d_mse
d = ensemble(selected_models, X_train, yval_train, X_test, yval_test, ordered=False)
print(d)
# Rebuild the full training set (no holdout) and prepare the real test set
# for the final submission predictions.
test_fin = pd.read_csv(vf_path_test, index_col=0)
# NOTE(review): here `reset_features()` is bound to a single variable, but at
# the comparison section below it is unpacked into three values — confirm the
# function's signature upstream.
train_fin = reset_features()
X_train_fin, yval_train_fin, ylab_train_fin, X_test_fin = data_preparer((train_fin, test_fin))
X_train_fin, X_test_fin = dummier_tot(X_train_fin, X_test_fin, cat)
if applyPCA:
    # Keep enough principal components to explain 95% of the variance.
    n = PCA_plot(X_train_fin, .95, show=False)
    X_train_fin, X_test_fin = apply_PCA_train_test(X_train_fin, X_test_fin, n)
def predict_ensemble(X_train_fin, yval_train_fin, X_test_fin, selected_models):
    """Fit each selected model on the full training data and return a
    one-column DataFrame ('Footfall', indexed like `X_test_fin`) holding the
    unweighted average of their predictions on the final test set.

    Bug fix: the original averaged `yvals_pred[:size[1], :]` — sliced by the
    number of test rows rather than the number of models — and stashed the
    result in the last model's row. It only worked because numpy clamps
    over-long slices; the average is now taken explicitly over the model axis.
    """
    n_models = len(selected_models)
    yvals_pred = np.zeros((n_models, X_test_fin.shape[0]))
    for n, (name, model) in enumerate(selected_models):
        model.fit(X_train_fin, yval_train_fin)
        yvals_pred[n, :] = model.predict(X_test_fin)
    # Unweighted average across models.
    avg_pred = yvals_pred.mean(axis=0)
    return pd.DataFrame(avg_pred, columns=['Footfall'], index=X_test_fin.index)
# Final regression predictions from the averaging ensemble (submission column 1).
yval_predicted_fin = predict_ensemble(X_train_fin, yval_train_fin, X_test_fin, selected_models)
def random_classif(ylab_test, seed=None):
    """Chance baseline: accuracy of a uniform random 3-class guesser.

    Parameters
    ----------
    ylab_test : array-like of true labels drawn from {0, 1, 2}.
    seed : int, optional
        Seed for a reproducible draw. None (the default) preserves the
        original behaviour of drawing from numpy's global RNG.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    ylab_pred = rng.choice([0, 1, 2], size=ylab_test.shape)
    return accuracy_score(ylab_test, ylab_pred)
# Chance-level accuracy (~1/3 for three balanced classes) that real classifiers must beat.
print('If we run a random picker, we get an accuracy of', random_classif(ylab_test))
def model_test_classifier(X_train, ylab_train, cvnum=10):
    """Cross-validate a fixed set of untuned classifiers.

    Parameters
    ----------
    X_train, ylab_train : training features and class labels.
    cvnum : number of CV folds (default 10).

    Returns
    -------
    dict mapping classifier name -> mean cross-validated accuracy.

    The original repeated the same four lines for each classifier; the loop
    below is behaviourally identical (same names, same evaluation order).
    """
    # (name, estimator) pairs, all with default hyper-parameters.
    candidates = [
        ('LogisticRegression', LogisticRegression()),
        ('RandomForestClassifier', RandomForestClassifier()),
        ('SVMClassifier', SVC()),
        ('KNNClassifier', KNeighborsClassifier()),
        ('GaussianProcessClassifier', GaussianProcessClassifier()),
        ('MLPClassifier', MLPClassifier()),
    ]
    scores = {}
    for name, clf in candidates:
        cv_scores = cross_val_score(clf, X_train, ylab_train, scoring='accuracy', cv=cvnum)
        scores[name] = cv_scores.mean()
    return scores
# Rebuild a raw (pre-dummification) split to compare classifier accuracy on
# raw vs. processed features.
X_prov, yval_prov, ylab_prov = reset_features()
X_train_prov, X_test_prov, yval_train_prov, yval_test_prov, ylab_train_prov, ylab_test_prov = splitter(X_prov, yval_prov, ylab_prov)
# Impute missing values with training-set medians.
X_train_prov, X_test_prov = na_median(X_train_prov, X_test_prov)
scores_untuned_classif = model_test_classifier(X_train_prov, ylab_train_prov)
# Same untuned classifiers on the processed training data (X_train from earlier in the file).
scores_dummyrest_classif = model_test_classifier(X_train, ylab_train)
if show:
    print(scores_untuned_classif)
    print('\n')
    print(scores_dummyrest_classif)
def plot_learning_curves_classif(model, X, y, step=5, show_overfitting=True):
    """Plot training and validation *accuracy* as the training size grows.

    The model is refit on the first m rows for m = 15, 15+step, ...; accuracy
    is recorded on those m rows and on a fixed 20% validation split.

    Fixes vs. the original: `accuracy_score` now receives y_true first
    (sklearn convention; accuracy is symmetric so the numbers are unchanged),
    the lists are named `*_scores` rather than the misleading `*_errors`, and
    the final print no longer raises IndexError when there are fewer than 16
    training rows (empty loop).
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
    train_scores, val_scores = [], []
    for m in range(15, len(X_train), step):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_scores.append(accuracy_score(y_train[:m], y_train_predict))
        val_scores.append(accuracy_score(y_val, y_val_predict))
    if show_overfitting:
        plt.plot(train_scores, "r-+", linewidth=2, label="Training set")
        plt.plot(val_scores, "b-", linewidth=2, label="Validation set")
        plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    # Guard against an empty loop (fewer than 16 training rows).
    if val_scores:
        print('The last Accuracy Score is:', val_scores[-1])
# Baseline logistic regression learning curves.
log_reg = LogisticRegression(random_state=0, solver='lbfgs')
plot_learning_curves_classif(log_reg, X_train, ylab_train, show_overfitting=show_overfitting)
# Grid-search the inverse regularisation strength C.
logreg = LogisticRegression(random_state=0, solver='lbfgs')
# NOTE(review): 0.75 sits out of sequence between 0.05 and 0.1 — possibly a
# typo for 0.075; confirm the intended grid before re-running.
param_grid = [{'C': [0.001, 0.005, 0.01, 0.05, 0.75, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5]}]
grid_search_logreg = GridSearchCV(logreg, param_grid, iid=False, cv=5, scoring='accuracy',
                                  return_train_score=False)
grid_search_logreg.fit(X_train, ylab_train)
param_logreg = grid_search_logreg.best_params_
logreg_C = param_logreg['C']
print(param_logreg)
print('\n')
print('The best accuracy score obtained is:', grid_search_logreg.best_score_)
# Refit with the best C and re-check the learning curves.
log_reg_opt = LogisticRegression(random_state=0, solver='lbfgs', C=logreg_C)
plot_learning_curves_classif(log_reg_opt, X_train, ylab_train, show_overfitting=show_overfitting)
# Degree-2 polynomial features + logistic regression, untuned.
# NOTE(review): Pipeline is given a tuple of tuples; sklearn expects a list of
# (name, step) pairs — this works on older versions but is deprecated.
polynomial_log_reg = Pipeline((('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
                               ('poly_log_reg', LogisticRegression(random_state=0, solver='lbfgs'))))
plot_learning_curves_classif(polynomial_log_reg, X_train, ylab_train, show_overfitting=show_overfitting)
# Joint grid search over the polynomial degree and C.
polyfeatures = PolynomialFeatures(include_bias=False)
polylogreg = LogisticRegression(random_state=0, solver='lbfgs')
polynomial_logreg = Pipeline(steps=[('polyfeatures', polyfeatures), ('polylogreg', polylogreg)])
param_grid = [{'polyfeatures__degree': [1, 2], 'polylogreg__C': [0.001, 0.005, 0.01, 0.05, 0.1, 0.25, 0.5]}]
grid_search_polylogreg = GridSearchCV(polynomial_logreg, param_grid, iid=False, cv=5, scoring='accuracy',
                                      return_train_score=False)
grid_search_polylogreg.fit(X_train, ylab_train)
param_polynomial_logreg = grid_search_polylogreg.best_params_
logreg_poly_degrees, logreg_C = param_polynomial_logreg['polyfeatures__degree'], param_polynomial_logreg['polylogreg__C']
print(param_polynomial_logreg)
print('\n')
print('The best accuracy score obtained is:', grid_search_polylogreg.best_score_)
# Rebuild the pipeline with the best degree and C found by the search.
polynomial_log_reg_opt = Pipeline((('poly_features', PolynomialFeatures(degree=logreg_poly_degrees, include_bias=False)),
                                   ('poly_log_reg', LogisticRegression(random_state=0, solver='lbfgs', C=logreg_C))))
plot_learning_curves_classif(polynomial_log_reg_opt, X_train, ylab_train, show_overfitting=show_overfitting)
# Untuned random forest baseline.
rnd_for_clf =RandomForestClassifier(random_state=0)
plot_learning_curves_classif(rnd_for_clf, X_train, ylab_train, show_overfitting=show_overfitting)
# Tune tree-shape hyper-parameters on a modest 50-tree forest (cheaper than the final 500).
rnd_for_clf =RandomForestClassifier(random_state=0, n_estimators=50)
max_n = X_train.shape[1]  # NOTE(review): computed but never used below — confirm intent.
param_grid = [{'max_depth': [5, 8, 10, 15], 'min_samples_leaf': [3, 4], 'min_samples_split': [2, 5, 10],
               'max_leaf_nodes': [50, 75], 'max_features': [5, 10, 15]}]
grid_search_rndfor_clf = GridSearchCV(rnd_for_clf, param_grid, iid=False, cv=5, scoring='accuracy', return_train_score=False)
grid_search_rndfor_clf.fit(X_train, ylab_train)
param_rndfor_clf = grid_search_rndfor_clf.best_params_
rnd_max_depth, rnd_min_samples_leaf = param_rndfor_clf['max_depth'], param_rndfor_clf['min_samples_leaf']
rnd_min_samples_split, rnd_max_leaf_nodes = param_rndfor_clf['min_samples_split'], param_rndfor_clf['max_leaf_nodes']
rnd_max_features = param_rndfor_clf['max_features']
print(param_rndfor_clf)
print('\n')
print('The best accuracy score obtained is:', grid_search_rndfor_clf.best_score_)
# Final forest: 500 trees with the tuned tree-shape parameters.
rnd_for_clf_opt = RandomForestClassifier(n_estimators=500, random_state=0, max_depth=rnd_max_depth,
                                         min_samples_leaf=rnd_min_samples_leaf, min_samples_split=rnd_min_samples_split,
                                         max_features=rnd_max_features, max_leaf_nodes=rnd_max_leaf_nodes)
plot_learning_curves_classif(rnd_for_clf_opt, X_train, ylab_train, show_overfitting=show_overfitting)
# Untuned MLP baseline.
mlp_clf = MLPClassifier(random_state=0)
plot_learning_curves_classif(mlp_clf, X_train, ylab_train, show_overfitting=show_overfitting)
# Tune regularisation strength (alpha) and hidden-layer width with lbfgs.
mlp_clf = MLPClassifier(random_state=0, verbose=0, max_iter=1000, learning_rate_init=1e-03, solver='lbfgs')
param_grid_mlp_clf = [{'alpha': [50, 10, 5, 1], 'hidden_layer_sizes': [(3, ), (10, ), (25, )]}]
grid_search_mlp_clf = GridSearchCV(mlp_clf, param_grid_mlp_clf, iid=False, cv=5, scoring='accuracy', return_train_score=False)
grid_search_mlp_clf.fit(X_train, ylab_train)
mlp_clf_param = grid_search_mlp_clf.best_params_
mlp_clf_alpha, mlp_clf_hidden_layer_sizes = mlp_clf_param['alpha'], mlp_clf_param['hidden_layer_sizes']
print(mlp_clf_param)
print('\n')
print('The best Accuracy score obtained is:', grid_search_mlp_clf.best_score_)
# Final MLP with the tuned alpha/width and a larger iteration budget.
# NOTE(review): the tuning run used solver='lbfgs' but the final model omits
# `solver` and so falls back to the default — confirm this is intentional.
mlp_clf_opt = MLPClassifier(hidden_layer_sizes=mlp_clf_hidden_layer_sizes, max_iter=10000, alpha=mlp_clf_alpha, verbose=False,
                            random_state=0, learning_rate_init=1e-03)
plot_learning_curves_classif(mlp_clf_opt, X_train, ylab_train, show_overfitting=show_overfitting)
# Untuned XGBoost classifier baseline.
xgb_clf = XGBClassifier(random_state=0)
plot_learning_curves_classif(xgb_clf, X_train, ylab_train, show_overfitting=show_overfitting)
# Hyper-parameter search for the XGBoost classifier.
# FIX: the original passed `learning_rate_init=0.2` — an MLP parameter name;
# XGBoost's parameter is `learning_rate`, so the intended rate was silently
# ignored during the search. Corrected here.
xgbclf = XGBClassifier(random_state=0, verbose=0, learning_rate=0.2, subsample=0.8, colsample_bytree=0.8)
param_grid = [{'max_depth': [3, 4], 'reg_lambda': [25, 50, 75, 100, 1000], 'gamma': [0, 0.001, 0.005, 0.01],
               'min_child_weight': [1, 2]}]
grid_search_xgbclf = GridSearchCV(xgbclf, param_grid, iid=False, cv=5, scoring='accuracy', return_train_score=False)
grid_search_xgbclf.fit(X_train, ylab_train)
xgbclf_param = grid_search_xgbclf.best_params_
# Typo fixed: `xbg_gamma` -> `xgb_gamma` (used consistently below).
xgb_max_depth, xgb_lambda, xgb_gamma = xgbclf_param['max_depth'], xgbclf_param['reg_lambda'], xgbclf_param['gamma']
xgb_min_child_weight = xgbclf_param['min_child_weight']
print(xgbclf_param)
print('\n')
print('The best Accuracy score obtained is:', grid_search_xgbclf.best_score_)
# Final classifier with a smaller learning rate and the tuned parameters.
xgb_clf_opt = XGBClassifier(random_state=0, verbose=0, learning_rate=0.05, max_depth=xgb_max_depth, reg_lambda=xgb_lambda,
                            gamma=xgb_gamma, min_child_weight=xgb_min_child_weight, subsample=0.8, colsample_bytree=0.8)
plot_learning_curves_classif(xgb_clf_opt, X_train, ylab_train, show_overfitting=show_overfitting)
# Tuned classifiers collected for side-by-side evaluation.
models_classif = [('LogisticRegression', log_reg_opt), ('PolynomialLogisticRegression', polynomial_log_reg_opt),
                  ('RandomForestClassifier', rnd_for_clf_opt), ('MLPClassifier', mlp_clf_opt),
                  ('XGBClassifier', xgb_clf_opt)]
# NOTE(review): `show1` is not defined in the visible part of the file (the
# header defines `show`) — confirm it exists upstream or was meant to be `show`.
if show1:
    for i in models_classif:
        print(i[0], ':', i[1])
def model_eval_classif(models, X_train, ylab_train, X_test, ylab_test, ordered=True):
    """Fit each (name, classifier) pair, plot true vs. predicted labels for
    every model on its own figure, and return a dict mapping classifier
    name -> test accuracy.

    When `ordered` is True the samples are plotted sorted by the true label,
    grouping each class together on the x-axis.
    """
    accuracy_by_name = {}
    y_true = ylab_test.values
    order = np.argsort(y_true) if ordered else list(range(len(y_true)))
    for name, clf in models:
        clf.fit(X_train, ylab_train)
        predicted = clf.predict(X_test)
        plt.plot(y_true[order], 'go', markersize=12)
        plt.plot(predicted[order], 'ro')
        plt.title('Original(Green) vs. ' + name + '(Red)')
        plt.show()
        accuracy_by_name[name] = accuracy_score(y_true, predicted)
    return accuracy_by_name
d = model_eval_classif(models_classif, X_train, ylab_train, X_test, ylab_test, ordered=False)
print(d)
# Soft-voting ensemble over the two strongest classifiers; voting='soft'
# averages predict_proba, which both estimators provide.
selected_models_classif = [('LogisticRegression', log_reg_opt), ('RandomForestClassifier', rnd_for_clf_opt)]
voting_clf = VotingClassifier(estimators=selected_models_classif, voting='soft')
final_models_clf = selected_models_classif + [('VotingClassifier', voting_clf)]
d = model_eval_classif(final_models_clf, X_train, ylab_train, X_test, ylab_test, ordered=False)
print(d)
# Fit on the full training data and write the submission file: regression
# ('Footfall') and classification ('Footfall_classif') predictions side by side.
voting_clf.fit(X_train_fin, ylab_train_fin)
ylab_predicted_fin = pd.DataFrame(voting_clf.predict(X_test_fin), index=X_test_fin.index, columns=['Footfall_classif'])
# verify_integrity guards against duplicate indices between the two frames.
predicted_vals_fin = pd.concat([yval_predicted_fin, ylab_predicted_fin], verify_integrity=True, axis=1)
predicted_vals_fin.to_csv(r'C:\Users\prebe\Google Drive\Unibocconi\Machine Learning\Vodafone Challenge\Group10_prediction.csv')