from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('gruvboxd')
#Running the necessary libraries from another notebook
%run Libraries.ipynb
C:\Users\Thosiba\anaconda3\lib\site-packages\xgboost\compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.
from pandas import MultiIndex, Int64Index
#Running useful pre-written functions from another notebook
%run Utility_tools.ipynb
# Load the training data; the first CSV column becomes the row index.
raw = pd.read_csv('train.csv', index_col=[0])
# Work on an independent copy so that `raw` stays untouched for reference.
df = raw.copy()
# Class balance of the target. `x=` is passed by keyword: seaborn >= 0.12
# interprets the first positional argument as `data`, not as the x variable.
sns.countplot(x=df.satisfaction)
plt.show();
# Number of rows with at least one missing value, then missing values per column.
display(df[df.isna().any(axis=1)].shape[0], df.isna().sum())
# Normalise column names: spaces and '/' become '_', everything is lower-cased.
df.rename(columns=lambda col: col.replace(' ', '_').replace('/', '_').lower(), inplace=True)
# Helper not defined in this notebook (presumably loaded by Utility_tools.ipynb);
# its captured output reports memory usage before and after the call.
reduce_memory_usage(df)
Memory usage before: 19.82 MB
Memory usage now : 3.77 MB
Memory usage decreased by 81.0%
# Share of passengers labelled 'satisfied' (class balance of the target).
df.loc[df.satisfaction=='satisfied'].shape[0]/df.shape[0]
# Share of rows whose arrival delay is present (i.e. not missing).
df.loc[~df['arrival_delay_in_minutes'].isna()].shape[0]/df.shape[0]
# The passenger id carries no predictive information.
df.drop(columns='id', inplace=True)
# Encode the target as 1 = satisfied, 0 = neutral or dissatisfied.
# An explicit mapping is used instead of chained .replace() calls so that any
# unexpected label surfaces as an error in astype('int') rather than passing through.
df['satisfaction'] = df['satisfaction'].map({'satisfied': 1, 'neutral or dissatisfied': 0}).astype('int')
df['arrival_delay_in_minutes'].value_counts().plot(kind='pie')
plt.show();
df['arrival_delay_in_minutes'].plot(kind='hist', bins=15)
plt.show();
# Missing arrival delays are filled with 0.0. The result is assigned back to the
# column: calling fillna(inplace=True) on a column selection is chained assignment,
# which is deprecated and does not update `df` under pandas copy-on-write.
df['arrival_delay_in_minutes'] = df['arrival_delay_in_minutes'].fillna(0.0)
# Helper not defined in this notebook (presumably loaded by Utility_tools.ipynb).
duplicate_check_remove(df)
There are no duplicate rows in the dataset.
# Split the features by dtype: numeric columns vs. categorical columns.
df_cont = df.select_dtypes(include='number')
df_cat = df.select_dtypes(include='category')
#Some of our presumed continuous features are actually ordinal. We'll separate them.
# Print the number of distinct values of each numeric column. The column must be
# selected first: df_cont.nunique() would print the counts of *all* columns on
# every iteration instead of the single count for `col`.
for col in df_cont:
    print(f'''{col}: {df_cont[col].nunique()}''')
age: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
flight_distance: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
inflight_wifi_service: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
departure_arrival_time_convenient: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
ease_of_online_booking: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
gate_location: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
food_and_drink: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
online_boarding: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
seat_comfort: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
inflight_entertainment: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
on-board_service: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
leg_room_service: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
baggage_handling: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
checkin_service: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
inflight_service: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
cleanliness: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
departure_delay_in_minutes: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
arrival_delay_in_minutes: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
satisfaction: age 75
flight_distance 3802
inflight_wifi_service 6
departure_arrival_time_convenient 6
ease_of_online_booking 6
gate_location 6
food_and_drink 6
online_boarding 6
seat_comfort 6
inflight_entertainment 6
on-board_service 6
leg_room_service 6
baggage_handling 5
checkin_service 6
inflight_service 6
cleanliness 6
departure_delay_in_minutes 446
arrival_delay_in_minutes 455
satisfaction 2
dtype: int64
# Numeric columns with fewer than 10 distinct values are treated as ordinal.
# The binary target `satisfaction` also matches and ends up as the last column.
# .copy() makes the frame independent of `df`, so the later in-place edits do
# not raise SettingWithCopyWarning.
df_ord = df[[col for col in df_cont if df_cont[col].nunique() < 10]].copy()
# Everything that is not an ordinal feature stays continuous. The last column of
# df_ord (the target) is deliberately excluded from the filter so that
# `satisfaction` is kept in df_cont as well.
df_cont = df[[col for col in df_cont.columns if col not in df_ord.columns[:-1]]].copy()
# Compare the per-class mean and median of every continuous feature.
pd.pivot_table(data=df_cont, index='satisfaction', aggfunc=[np.mean ,np.median])
# Check the equal-variance assumption. It is mostly violated, so the comparison
# below relies on the Welch correction (requested through correction='auto').
pd.pivot_table(data=df_cont, index='satisfaction', aggfunc=np.var)
# Two-sample t-test of each column between the two satisfaction classes.
# `ttest` is not defined in this notebook; the p-value is read positionally as
# the 4th column of its result table (TODO confirm against the helper's output).
for col in df_cont.columns:
    not_satisfied = df_cont[df_cont['satisfaction'] == 0][col]
    satisfied = df_cont[df_cont['satisfaction'] == 1][col]
    test_table = ttest(not_satisfied, satisfied, correction='auto')
    p_value = test_table.T.iloc[3, :][0]
    print(f'''P-value for {col}: {p_value} ''')
P-value for age: 0.0
P-value for flight_distance: 0.0
P-value for departure_delay_in_minutes: 1.2767076513246778e-61
P-value for arrival_delay_in_minutes: 1.0
P-value for satisfaction: 0.0
# Remove both delay features from the continuous set.
delay_columns = ['departure_delay_in_minutes', 'arrival_delay_in_minutes']
df_cont.drop(columns=delay_columns, inplace=True)
from sklearn.feature_selection import VarianceThreshold
# Zero-variance filter: compare the shape before and after to see whether any
# constant column was removed.
print(df_cont.shape)
var_filter = VarianceThreshold(threshold=0.0)
train = var_filter.fit_transform(df_cont)
print(train.shape)  # All of them remained
(103904, 3)
(103904, 3)
# Report how many records of each df_cont column are flagged as outliers.
# find_outliers_iqr is not defined in this notebook (presumably loaded by
# Utility_tools.ipynb and based on the IQR rule - confirm in that notebook).
find_outliers_iqr(df_cont)
Number of outlier records in age column: 0
Number of outlier records in flight_distance column: 2291
Number of outlier records in satisfaction column: 0
def coerce_outliers(value, lower=None, upper=None):
    """Clamp ``value`` into the closed interval ``[lower, upper]``.

    Parameters
    ----------
    value : scalar
        The value to clamp.
    lower, upper : scalar, optional
        Interval bounds. When a bound is omitted, the module-level name
        ``lowerlimit`` / ``upperlimit`` is used instead, which keeps the
        original one-argument call (``Series.apply(coerce_outliers)``)
        working unchanged.

    Returns
    -------
    scalar
        ``upper`` if ``value`` exceeds it, ``lower`` if ``value`` is below
        it, otherwise ``value`` itself. A value that compares false against
        both bounds (e.g. NaN) is returned as is.
    """
    # Bounds are resolved lazily, in the same order the comparisons run, so the
    # global fallback is only looked up when it is actually needed.
    high = upperlimit if upper is None else upper
    if value > high:
        return high
    low = lowerlimit if lower is None else lower
    if value < low:
        return low
    return value
# Cap every df_cont column at its 1.5 * IQR fences.
for feature in df_cont.columns:
    column = df_cont[feature]
    third_quartile = column.quantile(q=0.75)
    first_quartile = column.quantile(q=0.25)
    fence_width = (third_quartile - first_quartile) * 1.5
    # coerce_outliers reads these two module-level names, so they have to be
    # rebound for the current column before apply() is called.
    upperlimit = third_quartile + fence_width
    lowerlimit = first_quartile - fence_width
    df_cont[feature] = column.apply(coerce_outliers)
# The categorical features have very few levels each; one-hot encoding them
# would still leave us with 9 columns in total.
for col in df_cat.columns:
    n_levels = df_cat[col].nunique()
    print(f'''Number of unique values for {col}: {n_levels}''')
Number of unique values for gender: 2
Number of unique values for customer_type: 2
Number of unique values for type_of_travel: 2
Number of unique values for class: 3
# Test of independence between each categorical feature and the target.
# chi2_independence is not defined in this notebook (presumably provided by the
# libraries loaded in Libraries.ipynb - confirm). Its result is unpacked into
# three values; only the third one (`stats`) is inspected, rounded to 3 decimals.
# NOTE(review): `expected`, `observed` and `stats` are rebound by every call, so
# only the results for 'class' remain available afterwards.
expected, observed, stats = chi2_independence(data=df, x='gender', y='satisfaction')
stats.round(3)
expected, observed, stats = chi2_independence(data=df, x='customer_type', y='satisfaction')
stats.round(3)
expected, observed, stats = chi2_independence(data=df, x='type_of_travel', y='satisfaction')
stats.round(3)
expected, observed, stats = chi2_independence(data=df, x='class', y='satisfaction')
stats.round(3)
# One-hot encode the categorical features.
df_cat = pd.get_dummies(df_cat)
#Ordinal features
# Per-class median and mean of every ordinal feature (transposed for readability).
pd.pivot_table(data=df_ord, index='satisfaction', aggfunc=[np.median, np.mean]).T
# Kendall rank correlation of every ordinal feature with the target.
df_ord.corr(method='kendall')['satisfaction'].reset_index().T
# Remove two ordinal features from the feature set.
df_ord.drop(['gate_location', 'departure_arrival_time_convenient'], axis=1, inplace=True)
# Re-assemble the modelling frame: dummies + continuous + ordinal features.
# df_cont's last column is dropped via iloc[:, :-1] because it is the target,
# which df_ord already carries as its own last column; as a result the target
# is the last column of the combined frame.
df = pd.concat([df_cat, df_cont.iloc[:,:-1], df_ord], axis=1)
df.sample(3)
#Pre-modeling
# Features are every column but the last; the last column is the target.
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
# Fit the scaler on the training split ONLY and reuse the fitted statistics for
# the test split. Fitting a second scaler on the test data would scale the two
# splits with different medians/IQRs and leak test-set information.
scaler = RobustScaler()
# Column names and the row index are kept so that the features stay aligned
# with y_train / y_test and remain identifiable by name downstream.
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns, index=x_test.index)
def get_model_metrics(x_train, y_train, x_test, y_test, preds, mdl):
    """Summarise a fitted model's train/test score and prediction RMSE.

    Parameters
    ----------
    x_train, y_train : training features and labels, passed to ``mdl.score``.
    x_test, y_test : held-out features and labels, passed to ``mdl.score``;
        ``y_test`` is also compared against ``preds``.
    preds : predictions for ``x_test``.
    mdl : fitted estimator exposing ``score(X, y)``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Metric'`` with a single ``'Value'`` column holding
        ``Train_acc``, ``Test_acc`` and ``rmse`` (in that order).
    """
    value_column = str('Value')
    metric_rows = [
        ('Train_acc', mdl.score(x_train, y_train)),
        ('Test_acc', mdl.score(x_test, y_test)),
        # Root of the mean squared error between true labels and predictions.
        ('rmse', np.sqrt(mean_squared_error(y_test, preds))),
    ]
    summary = pd.DataFrame(metric_rows, columns=['Metric', value_column])
    return summary.set_index('Metric')
def classification_metrics(x_train, y_train, x_test, y_test, preds, probs, mdl):
    """Compute binary-classification metrics and draw the ROC curve and confusion matrix.

    Parameters
    ----------
    x_train, y_train : unused; kept so existing callers keep working.
    x_test, y_test : held-out features and true labels (classes 0 and 1).
    preds : predicted labels for ``x_test``.
    probs : scores for the positive class, passed to ``roc_curve``.
    mdl : fitted estimator, passed to ``plot_confusion_matrix``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Metric'`` with a single ``'Value'`` column holding
        Accuracy, Precision, Recall, True Positive Rate and False Positive Rate.

    Notes
    -----
    Draws the ROC curve on the current matplotlib figure and the confusion
    matrix on a new 5x5 figure; it does not call ``plt.show()``.
    """
    # Evaluation metrics
    accuracy = accuracy_score(y_test, preds)
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)

    # Confusion matrix. With labels=[0, 1] the layout is [[TN, FP], [FN, TP]]
    # (rows = actual class, columns = predicted class), so ravel() yields
    # the four counts in exactly this order.
    tn, fp, fn, tp = confusion_matrix(y_test, preds, labels=[0, 1]).ravel()
    # True positive rate = TP / (TP + FN). The previous TN / (FP + TN) was the
    # true NEGATIVE rate (specificity) reported under the wrong label.
    tpr_value = tp / (tp + fn)
    fpr_value = fp / (fp + tn)

    # ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)

    summary = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'True Positive Rate': tpr_value,
        'False Positive Rate': fpr_value,
    }
    class_metrics = pd.DataFrame(summary.items(), columns=['Metric', 'Value']).set_index('Metric')

    plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specifity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operating Characteristic (ROC)')
    plt.legend(loc="lower right")

    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on
    # newer versions switch to ConfusionMatrixDisplay.from_estimator.
    fig, ax = plt.subplots(figsize=(5, 5))
    plot_confusion_matrix(mdl, x_test, y_test, cmap=plt.cm.Blues, ax=ax)
    plt.tight_layout()
    plt.title('Confusion Matrix', y=1.1)
    return class_metrics
# Baseline model: logistic regression with the liblinear solver.
logit = LogisticRegression(solver='liblinear')
logit.fit(x_train, y_train)
logit_preds = logit.predict(x_test)
# Compute the class probabilities once; column 1 is the positive class.
logit_probs2 = logit.predict_proba(x_test)
logit_probs = logit_probs2[:, 1]
model_metrics = get_model_metrics(x_train, y_train, x_test, y_test, logit_preds, logit)
class_metrics = classification_metrics(x_train, y_train, x_test, y_test, logit_preds, logit_probs, logit)
y_true = y_test.values
# Stack both metric tables and tag them with the model name
# (label spelling corrected from 'Logisitc').
metrics_list = [model_metrics, class_metrics]
lgrmetrics = pd.concat(metrics_list).reset_index()
lgrmetrics['Model'] = 'Logistic'
plt.show();
lgrmetrics
# Random forest: 500 trees, depth capped at 10. random_state is fixed (same seed
# as the train/test split) so the reported metrics and the model that gets
# pickled afterwards are reproducible between runs.
rfc = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42)
rfc = rfc.fit(x_train, y_train)
forest_preds = rfc.predict(x_test)
# Column 1 of predict_proba is the positive class.
forest_probs = rfc.predict_proba(x_test)[:, 1]
model_metrics = get_model_metrics(x_train, y_train, x_test, y_test, forest_preds, rfc)
class_metrics = classification_metrics(x_train, y_train, x_test, y_test, forest_preds, forest_probs, rfc)
# Stack both metric tables and tag them with the model name.
metrics_list = [model_metrics, class_metrics]
forest_metrics = pd.concat(metrics_list).reset_index()
forest_metrics['Model'] = 'Forest'
plt.show();
forest_metrics
print(classification_report(y_test, forest_preds))
precision recall f1-score support
0 0.91 0.97 0.94 11713
1 0.96 0.88 0.92 9068
accuracy 0.93 20781
macro avg 0.94 0.93 0.93 20781
weighted avg 0.94 0.93 0.93 20781
# Pair every training column with its random-forest importance and list the
# pairs from most to least important.
importance = rfc.feature_importances_
impList = zip(x_train.columns, importance)
ranked_features = sorted(impList, key=lambda pair: pair[1], reverse=True)
for feature in ranked_features:
    # Original author's reading of this output: 'food&drink' seems to be the
    # most impactful feature.
    print(feature)
# Persist the fitted random forest to disk in binary pickle format.
with open('Passenger_satisfaction_RF-model', mode='wb') as model_file:
    pickle.dump(rfc, model_file)