# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer, confusion_matrix
from datetime import datetime, timedelta
import pickle
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import xgboost as xgb
from matplotlib import image
img = image.imread("/work/imb1.jpeg")
plt.figure(figsize=(10,10))
plt.imshow(img)
plt.show()
delivery_data= pd.read_csv("/work/train.csv")
train_data=delivery_data.copy()
train_data.head()
test_data= pd.read_csv("/work/test.csv")
test_data.head()
my_submission = pd.DataFrame({'order_id': test_data.order_id})
my_submission.head()
train_data.columns.values
train_data.isna().sum()
test_data.isna().sum()
train_data.dtypes
train_data.describe()
plt.title('Distribution of target variable')
sns.countplot(x=train_data['cancelled'])
plt.show()
# Studying the target variable
train_data['cancelled'].value_counts()
# Checking whether the order_time and order_date columns contain the same date
df_order = np.where(pd.to_datetime(train_data['order_time']).dt.date != pd.to_datetime(train_data['order_date']).dt.date, 1, 0)
df_order.sum()
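# A sum of 0 here would mean order_time and order_date always agree, so order_date adds nothing beyond order_time.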
#spread of data
train_data['order_date'].unique()
# Dropping columns which will be of no use for our model
col_list = ['order_id','order_date','pickup_time','delivered_time','rider_id','cancelled_time','reassignment_method','reassignment_reason']
train_data = train_data.drop(col_list, axis=1, errors='ignore')
test_data = test_data.drop(col_list, axis=1,errors='ignore')
# Visualising first_mile_distance and removing its single extreme outlier
plt.plot(train_data['first_mile_distance'])
plt.show()
train_data.drop(train_data[train_data['first_mile_distance'] == train_data['first_mile_distance'].max()].index, inplace=True)
plt.plot(train_data['last_mile_distance'])
plt.show()
train_data['total_distance']=train_data['first_mile_distance']+ train_data['last_mile_distance']
test_data['total_distance']=test_data['first_mile_distance']+ test_data['last_mile_distance']
# Dropping columns which will be of no use for our model
col_list=['first_mile_distance','last_mile_distance']
train_data = train_data.drop(col_list, axis=1)
test_data = test_data.drop(col_list, axis=1)
# Function to fill missing values with the column mean (the train-set mean is used for both frames to avoid leakage)
def fill_mean(imp_mean_list):
    for i in imp_mean_list:
        train_data[i] = train_data[i].fillna(train_data[i].mean())
        test_data[i] = test_data[i].fillna(train_data[i].mean())
fill_mean_list = ['alloted_orders', 'delivered_orders','undelivered_orders']
fill_mean(fill_mean_list)
train_data['lifetime_order_count']= np.where(train_data['lifetime_order_count'].isna(),train_data['alloted_orders'],train_data['lifetime_order_count'])
test_data['lifetime_order_count']= np.where(test_data['lifetime_order_count'].isna(),test_data['alloted_orders'],test_data['lifetime_order_count'])
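# The two lines above fall back to alloted_orders when lifetime_order_count is missing (a proxy for riders with no recorded history).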
train_data['reassigned_order']=train_data['reassigned_order'].fillna(0)
test_data['reassigned_order']=test_data['reassigned_order'].fillna(0)
train_data['session_time']=train_data['session_time'].fillna(0)
test_data['session_time']=test_data['session_time'].fillna(0)
train_data.isna().sum()
train_data['%undelivered_orders']= train_data['undelivered_orders']/train_data['alloted_orders']
test_data['%undelivered_orders']= test_data['undelivered_orders']/test_data['alloted_orders']
train_data['odds_undeliver']= train_data['undelivered_orders']/train_data['delivered_orders']
test_data['odds_undeliver']= test_data['undelivered_orders']/test_data['delivered_orders']
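# Note: a rider with delivered_orders == 0 would make odds_undeliver infinite; if that occurs in this data, the
# infinities need handling (e.g. df['odds_undeliver'].replace([np.inf, -np.inf], np.nan) plus an imputation) before the neural network below is trained.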
# Time from order placement to rider allotment, in seconds
train_data['allot_time'] = (pd.to_datetime(train_data['allot_time']) - pd.to_datetime(train_data['order_time'])).dt.total_seconds()
test_data['allot_time'] = (pd.to_datetime(test_data['allot_time']) - pd.to_datetime(test_data['order_time'])).dt.total_seconds()
# Function to convert the listed columns to datetime
def conv_date(date_list):
    for i in date_list:
        train_data[i] = pd.to_datetime(train_data[i])
        test_data[i] = pd.to_datetime(test_data[i])
date_list = ['order_time']
conv_date(date_list)
# Function to extract the hour of day from a datetime column
def get_hour(colname, dfcolname):
    train_data[colname] = train_data[dfcolname].dt.hour
    test_data[colname] = test_data[dfcolname].dt.hour
get_hour('order_hour', 'order_time')
# Dropping columns which will be of no use for our model
col_list=['order_time','accept_time','delivered_orders','alloted_orders','undelivered_orders']
train_data = train_data.drop(col_list, axis=1)
test_data = test_data.drop(col_list, axis=1)
# Bin order_hour into a shift code: 0 for hours < 8 or >= 22, 2 for 8-11, 3 for 12-16, 1 otherwise (17-21)
def hour_to_shift(h):
    if h < 8 or h >= 22:
        return 0
    elif 8 <= h < 12:
        return 2
    elif 12 <= h < 17:
        return 3
    else:
        return 1
train_data['order_hour'] = train_data['order_hour'].apply(hour_to_shift)
test_data['order_hour'] = test_data['order_hour'].apply(hour_to_shift)
train_data.head()
test_data.head()
train_data.nunique()
test_data.nunique()
# Splitting the data into a training set and a test set
t_train, t_test = train_test_split(train_data, test_size = 0.25, stratify=train_data['cancelled'])
# The target classes are highly imbalanced, so the training split is rebalanced before model fitting.
# Shuffle the data before creating the subsamples
train_data = t_train.sample(frac=1)
# Keep at most 15,000 non-cancelled orders and repeat the cancelled orders three times (simple under/over-sampling by duplication)
canc = train_data[train_data['cancelled'] == 1]
not_canc = train_data[train_data['cancelled'] == 0][:15000]
normal_distributed_df = pd.concat([canc, not_canc, canc, canc])
# Shuffle dataframe rows
new_df = normal_distributed_df.sample(frac=1, random_state=42)
new_df.head()
plt.title('Distribution of target variable')
sns.countplot(x=new_df['cancelled'])
plt.show()
new_X_train= new_df.drop(['cancelled'], axis=1).copy()
new_Y_train= new_df['cancelled'].copy()
new_X_test= t_test.drop(['cancelled'], axis=1).copy()
new_Y_test= t_test['cancelled'].copy()
X_train_data_enc= pd.get_dummies(new_X_train, columns=['order_hour','reassigned_order'])
X_test_data_enc= pd.get_dummies(new_X_test, columns=['order_hour','reassigned_order'])
test_data=pd.get_dummies(test_data, columns=['order_hour','reassigned_order'])
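# Defensive alignment step (not in the original pipeline): if every category appears in both frames this is a no-op;
# otherwise it adds any missing dummy columns as zeros and puts the test columns in the training order.
test_data = test_data.reindex(columns=X_train_data_enc.columns, fill_value=0)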
# Confusion-matrix helper, to read off sensitivity and specificity (class 0 = not cancelled = "No", class 1 = cancelled = "Yes")
def draw_cm(actual, predicted):
    cm = confusion_matrix(actual, predicted).T
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=["No", "Yes"], yticklabels=["No", "Yes"])
    plt.ylabel('Predicted')
    plt.xlabel('Actual')
    plt.show()
X_train_ANN= X_train_data_enc.copy()
X_test_ANN= X_test_data_enc.copy()
Y_train_ANN= new_Y_train.copy()
Y_test_ANN= new_Y_test.copy()
X_train_ANN.head()
# Importing the Keras libraries and packages
import tensorflow
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
classifier = Sequential()
#add input layer and first hidden layer
classifier.add(Dense(5, kernel_initializer='random_uniform', activation = 'relu', input_dim = 12))
#add 2nd hidden layer
classifier.add(Dense(5, bias_initializer='random_uniform', activation = 'relu'))
#add 3rd hidden layer
classifier.add(Dense(5, bias_initializer='random_uniform', activation = 'relu'))
classifier.add(Dense(1, bias_initializer='random_uniform', activation = 'sigmoid'))
optimizer = keras.optimizers.Adam(learning_rate=0.0001)
classifier.compile(optimizer = optimizer, loss ='binary_crossentropy', metrics = ['accuracy'])
nn_history= classifier.fit(X_train_ANN, Y_train_ANN, batch_size = 32, epochs =100,validation_data = (X_test_ANN, Y_test_ANN))
y_pred_prob_ANN = classifier.predict(X_test_ANN)
y_pred_ANN=np.where(y_pred_prob_ANN>0.5,1,0)
draw_cm(Y_test_ANN, y_pred_ANN)
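# For comparison with the boosted models below, the same AUC summary for the ANN (computed from predicted probabilities)
print('ANN AUC Score:', roc_auc_score(Y_test_ANN, y_pred_prob_ANN.ravel()))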
# Predicting on the test dataset
test_data = np.asarray(test_data).astype(np.float32)
my_pred_prob_ANN = classifier.predict(test_data)
my_pred_ANN=np.where(my_pred_prob_ANN>0.5,1,0)
X_train_XG= X_train_data_enc.copy()
X_test_XG= X_test_data_enc.copy()
Y_train_XG= new_Y_train.copy()
Y_test_XG= new_Y_test.copy()
Y_train_XG
from sklearn import metrics
from xgboost import plot_importance
# Create an XGBoost classifier
gbm = xgb.XGBClassifier(objective='binary:logistic', booster='gbtree', base_score=0.5,
                        learning_rate=0.01, n_estimators=100, max_depth=7, min_child_weight=1,
                        gamma=0.1, subsample=0.6, colsample_bytree=0.6, colsample_bylevel=1, colsample_bynode=1,
                        reg_alpha=0, reg_lambda=0, scale_pos_weight=1, max_delta_step=0,
                        tree_method='exact', predictor='auto', num_parallel_tree=1,
                        enable_categorical=False, interaction_constraints='', monotone_constraints='()',
                        validate_parameters=1, gpu_id=-1, n_jobs=8, random_state=0, verbosity=None)
# Note: in xgboost >= 2.0, eval_metric and early_stopping_rounds must be set on the classifier rather than passed to fit()
gbm.fit(X_train_XG, Y_train_XG, early_stopping_rounds=10, eval_metric='auc', eval_set=[(X_test_XG, Y_test_XG)])
y_pred_XG = gbm.predict(X_test_XG)
print('Test Accuracy: ',gbm.score(X_test_XG,Y_test_XG))
print('Train Accuracy: ',gbm.score(X_train_XG,Y_train_XG))
print('AUC Score:',roc_auc_score(new_Y_test,y_pred_XG))
# Confusion matrix
draw_cm(Y_test_XG,y_pred_XG)
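# plot_importance was imported above but never used; it gives a quick view of which engineered features the booster relies on
plot_importance(gbm)
plt.show()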
# Predicting on the test dataset
my_pred_XG = gbm.predict(test_data)
!pip install lightgbm
X_train_light= X_train_data_enc.copy()
X_test_light= X_test_data_enc.copy()
Y_train_light= new_Y_train.copy()
Y_test_light= new_Y_test.copy()
from sklearn import metrics
import lightgbm as lgb
# Create a LightGBM classifier
# (note: `gamma` is an XGBoost parameter name; LightGBM's equivalent is `min_split_gain`, so this setting is likely ignored here)
model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', learning_rate=0.10, reg_lambda=0, max_depth=7, random_state=42, subsample=0.9, colsample_bytree=0.6, gamma=2)
model.fit(X_train_light,Y_train_light,eval_metric='auc')
y_pred_light = model.predict(X_test_light)
print('Test Accuracy: ',model.score(X_test_light,Y_test_light))
print('Train Accuracy: ',model.score(X_train_light,Y_train_light))
print('AUC Score:',roc_auc_score(new_Y_test,y_pred_light))
# Confusion matrix
draw_cm(Y_test_light, y_pred_light)
X_train_RF= X_train_data_enc.copy()
X_test_RF= X_test_data_enc.copy()
Y_train_RF= new_Y_train.copy()
Y_test_RF= new_Y_test.copy()
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# Create a Random Forest classifier ('sqrt' is what 'auto' meant for classifiers before 'auto' was removed from sklearn)
rf = RandomForestClassifier(n_estimators=25, criterion='entropy', max_features='sqrt', max_depth=3, min_samples_split=2, min_samples_leaf=5, random_state=0)
# Fit the model
rf.fit(X_train_RF, Y_train_RF)
y_pred_RF = rf.predict(X_test_RF)
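# The same accuracy / AUC summary used for the boosted models, for comparability (AUC from predicted probabilities)
print('Test Accuracy: ', rf.score(X_test_RF, Y_test_RF))
print('Train Accuracy: ', rf.score(X_train_RF, Y_train_RF))
print('AUC Score:', roc_auc_score(Y_test_RF, rf.predict_proba(X_test_RF)[:, 1]))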
# Confusion matrix
draw_cm(Y_test_RF, y_pred_RF)
# Predicting on the test dataset
my_pred_RF = rf.predict(test_data)
from mlxtend.classifier import StackingCVClassifier
stack = StackingCVClassifier(classifiers=(rf, gbm, model),
                             meta_classifier=gbm,
                             use_features_in_secondary=True)
stack_ = stack.fit(np.array(X_train_data_enc), np.array(new_Y_train))
# Predict on the same numpy representation used for fitting, to avoid feature-name mismatches
y_pred = stack.predict(np.array(X_test_data_enc))
draw_cm(new_Y_test,y_pred)
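# Hold-out accuracy of the stacked model, for comparison with the individual learners
print('Stacked Test Accuracy: ', metrics.accuracy_score(new_Y_test, y_pred))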
#prediction
my_pred=stack.predict(test_data)
my_submission['cancelled'] = my_pred
my_submission
# Saving predictions to file
my_submission.to_csv('Stacked_Model_Prediction1.csv', index=False)
print('Saved file to disk.')