import pandas as pd
import numpy as np
# Missing originally: cross_val_score and GridSearchCV are used below but
# were never imported, which raises NameError at runtime.
from sklearn.model_selection import GridSearchCV, cross_val_score
# Load the Adult census train/test splits and one-hot encode the
# categorical features with DictVectorizer (fit on train, reuse on test).
data_train = pd.read_csv('adult_train.csv')
data_test = pd.read_csv('adult_test.csv')

Xtrain = data_train.drop(columns='target').copy()
Ytrain = data_train['target'].copy()
Xtest = data_test.drop(columns='target').copy()
Ytest = data_test['target'].copy()

# DictVectorizer consumes one dict per row.
Xtrain_dict = Xtrain.to_dict('records')
Xtest_dict = Xtest.to_dict('records')

from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer()  # sparse=True is the default -> scipy sparse output
Xtrain_encoded = dv.fit_transform(Xtrain_dict)
Xtest_encoded = dv.transform(Xtest_dict)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Sweep tree depth 1..20 and record train/test accuracy at each depth.
dt_train_accuracy_list = []
dt_test_accuracy_list = []
depth_range = range(1, 21)
for depth in depth_range:
    tree = DecisionTreeClassifier(max_depth=depth)
    tree.fit(Xtrain_encoded, Ytrain)
    dt_train_accuracy_list.append(
        accuracy_score(Ytrain, tree.predict(Xtrain_encoded)))
    dt_test_accuracy_list.append(
        accuracy_score(Ytest, tree.predict(Xtest_encoded)))

plt.plot(depth_range, dt_train_accuracy_list, c='blue', label='train', marker='.')
plt.plot(depth_range, dt_test_accuracy_list, c='green', label='test', marker='.')
plt.xticks(depth_range)
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.title('Accuracy')
plt.legend()
plt.show()
from sklearn.ensemble import RandomForestClassifier
import time

# For every ensemble size, sweep max_depth 1..20 recording train/test
# accuracy and the wall-clock time spent fitting each forest.
n_estimators_list = [1, 5, 10, 20, 50, 100, 150, 200, 300]
max_depth = np.arange(1, 21)
accuracy_train_dict = {}
accuracy_test_dict = {}
training_time_dict = {}
for n_est in n_estimators_list:
    train_acc = []
    test_acc = []
    fit_times = []
    for depth in max_depth:
        t0 = time.time()
        forest = RandomForestClassifier(max_depth=depth,
                                        n_estimators=n_est,
                                        n_jobs=-1)
        forest.fit(Xtrain_encoded, Ytrain)
        fit_times.append(time.time() - t0)
        train_acc.append(accuracy_score(Ytrain, forest.predict(Xtrain_encoded)))
        test_acc.append(accuracy_score(Ytest, forest.predict(Xtest_encoded)))
    training_time_dict[n_est] = fit_times
    accuracy_train_dict[n_est] = train_acc
    accuracy_test_dict[n_est] = test_acc
def _plot_accuracy_curves(ax, train_curve, test_curve, title):
    """Plot train/test accuracy vs. max depth (1..20) on the given axes.

    Shared styling for all the side-by-side comparison panels below:
    fixed x ticks at every depth and a common y-axis range so panels
    are directly comparable.
    """
    ax.plot(range(1, 21), train_curve, c='blue', label='train', marker='.')
    ax.plot(range(1, 21), test_curve, c='green', label='test', marker='.')
    ax.set_xticks(range(1, 21))
    ax.set_yticks(np.arange(0.75, 0.96, 0.025))
    ax.set_xlabel('max depth')
    ax.set_ylabel('accuracy')
    ax.set_title(title)
    ax.legend()

# Compare the single decision tree against a random forest of ensemble
# size 1 and of ensemble size 100, one figure per comparison.
# (The original spelled out both figures with ~30 duplicated lines.)
for n_est in (1, 100):
    fig, axes = plt.subplots(1, 2, figsize=(20, 6))
    _plot_accuracy_curves(axes[0], dt_train_accuracy_list,
                          dt_test_accuracy_list,
                          'Decision Tree Classifier')
    _plot_accuracy_curves(axes[1], accuracy_train_dict[n_est],
                          accuracy_test_dict[n_est],
                          'Random Forest Classifier with the ensemble size = %d' % n_est)
    plt.show()
# One panel per ensemble size (9 sizes -> 3x3 grid): train/test accuracy
# across depths 1..20.
fig, axes = plt.subplots(3, 3, figsize=(30, 20))
for i, n_est in enumerate(n_estimators_list):
    # Original had a dead statement `num = 10**(i+2)` here; removed.
    ax = axes[i // 3, i % 3]
    ax.plot(range(1, 21), accuracy_train_dict[n_est], c='blue', label='train', marker='.')
    ax.plot(range(1, 21), accuracy_test_dict[n_est], c='green', label='test', marker='.')
    ax.set_xticks(range(1, 21))
    ax.set_xlabel('max depth')
    ax.set_ylabel('accuracy')
    # Typo fixed: "ensamble" -> "ensemble".
    ax.set_title('Random Forest with ensemble size = %d' % n_est)
    ax.legend()
# Without this show(), the next plt.plot() call would draw into the
# last axes of this grid instead of its own figure.
plt.show()
# Best test accuracy attained over all depths, for each ensemble size.
best_accuracy = [np.max(accuracy_test_dict[key]) for key in accuracy_test_dict]
# Start a fresh figure: otherwise this would draw into the last axes of
# the 3x3 grid created just above.
plt.figure()
plt.plot(best_accuracy, marker='.')
plt.yticks(np.arange(0.80, 0.91, 0.01))
plt.xticks(np.arange(0, 9, 1), n_estimators_list)
plt.xlabel('ensemble size')
plt.ylabel('accuracy')
plt.title('The best observed accuracy for different ensemble size')
plt.show()
# Total training time (summed over all depths) for each ensemble size.
training_time = [np.sum(training_time_dict[key]) for key in training_time_dict]
plt.figure()
plt.plot(training_time, marker='.')
plt.yticks(np.arange(0, 46, 5))
plt.xticks(np.arange(0, 9, 1), n_estimators_list)
plt.xlabel('ensemble size')
plt.ylabel('training time')
# Title fixed: the original copy-pasted the accuracy plot's title here.
plt.title('Training time for different ensemble size')
plt.show()
# 5-fold cross-validated accuracy of a decision tree for depths 1..15,
# used to pick the final max_depth on training data only.
# cross_val_score comes from sklearn.model_selection (imported at the
# top of the file; it was missing in the original script).
accuracy_list = []
for depth in range(1, 16):
    dtclf = DecisionTreeClassifier(max_depth=depth, random_state=0)
    dtclf_score = cross_val_score(dtclf, Xtrain_encoded, Ytrain,
                                  cv=5, scoring='accuracy')
    accuracy_list.append(np.mean(dtclf_score))

plt.figure()
plt.plot(accuracy_list, marker='.')
# Re-label x positions 0..14 as depths 1..15.
plt.xticks(list(range(0, 15)), list(range(1, 16)))
plt.title('Decision Tree Classifier')
plt.xlabel('max depth')
plt.ylabel('accuracy')
plt.show()
# Fit the chosen depth-6 tree on the full training set and print the
# ten features with the highest impurity-based importance.
dtclf_final = DecisionTreeClassifier(random_state=0, max_depth=6)
dtclf_final.fit(Xtrain_encoded, Ytrain)
dt_feature_scores = dtclf_final.feature_importances_
feature_names = dv.feature_names_
ranked = sorted(zip(dt_feature_scores, feature_names), reverse=True)
for score, fname in ranked[:10]:
    print(fname, score)
# Re-encode densely so the encoded matrix fits in a DataFrame, then
# look at how the Married-civ-spouse indicator splits the target.
dv_unsparse = DictVectorizer(sparse=False)
Xtrain_unsparse = pd.DataFrame(dv_unsparse.fit_transform(Xtrain_dict))
Xtrain_unsparse.columns = dv_unsparse.feature_names_
train_unsparse = Xtrain_unsparse.copy()
train_unsparse['target'] = Ytrain
# Per-group relative frequencies of the target.
marrid_civ_table = pd.DataFrame(
    train_unsparse.groupby('marital-status=Married-civ-spouse')['target']
    .value_counts(normalize=True))
marrid_civ_table.rename_axis(index=['Married-civ-spouse', 'target'], inplace=True)
marrid_civ_table.columns = ['Num']
# A bare expression only renders in a notebook; print so the table is
# also shown when this runs as a plain script.
print(marrid_civ_table)
# Grid-search random-forest hyperparameters with 4-fold CV on the
# training data. GridSearchCV comes from sklearn.model_selection
# (imported at the top of the file; it was missing in the original).
# NOTE: this fit takes a long time to run.
max_dept_list = [9, 10, 11]
n_estimators_list = [50, 100, 200]
params = {
    'max_depth': max_dept_list,
    'n_estimators': n_estimators_list,
}
grid = GridSearchCV(RandomForestClassifier(random_state=0), params,
                    cv=4, scoring='accuracy', n_jobs=-1)
grid.fit(Xtrain_encoded, Ytrain)
# Bare attribute expressions are no-ops in a script; print the results.
print(grid.best_score_)
print(grid.best_params_)
# Fit the grid-search winner (max_depth=11, 200 trees) on the full
# training set and print its ten most important features.
rfclf_final = RandomForestClassifier(random_state=0, max_depth=11, n_estimators=200)
rfclf_final.fit(Xtrain_encoded, Ytrain)
rf_feature_scores = rfclf_final.feature_importances_
feature_names = dv.feature_names_
ranked = sorted(zip(rf_feature_scores, feature_names), reverse=True)
for score, fname in ranked[:10]:
    print(fname, score)