# Food Delivery Time Prediction Using Regression Models
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Suppress all warnings for the rest of the run. Note that this also hides
# deprecation notices raised by the libraries used further down.
warnings.filterwarnings('ignore')
# Load the raw delivery records from the CSV file, decoding it as latin1.
df = pd.read_csv("food_delivery.csv", encoding='latin1')
# Quick structural inspection of the frame. The bare expressions below only
# render output in an interactive/notebook session; `df.info()` prints directly.
df.head()
df.tail()
df.shape
df.columns
# Number of fully duplicated rows and per-column missing-value counts.
df.duplicated().sum()
df.isnull().sum()
df.info()
# Summary statistics and per-column cardinality.
df.describe()
df.nunique()
# Split the column labels by dtype: object columns versus 64-bit numeric
# columns. `numerical_columns` drives the per-column plotting loops below.
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
object_columns = df.select_dtypes(include='object').columns

# Each group is printed as a heading line followed by the column index.
print("Object Columns:", object_columns, sep="\n")
print()
print("Numerical Columns:", numerical_columns, sep="\n")
import plotly.graph_objects as go

# ---- Distribution of the 'Type_of_order' category ----
# Notebook-style inspection of the distinct labels and their frequencies.
df['Type_of_order'].unique()
df['Type_of_order'].value_counts()

# Seaborn bar chart of the category frequencies.
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='Type_of_order', palette='hls')
plt.xticks(rotation=0)
plt.show()

# The frequency table is computed once and reused by the remaining charts.
counts = df['Type_of_order'].value_counts()

# Matplotlib pie chart with percentage labels.
plt.figure(figsize=(15, 6))
plt.pie(
    counts,
    labels=counts.index,
    autopct='%1.1f%%',
    colors=sns.color_palette('hls'),
)
plt.title('Type_of_order')
plt.show()

# Interactive Plotly bar chart of the same frequencies.
fig = go.Figure(data=[go.Bar(x=counts.index, y=counts)])
fig.update_layout(
    title='Type_of_order',
    xaxis_title="Categories",
    yaxis_title="Count",
)
fig.show()

# Interactive Plotly pie chart.
fig = go.Figure(data=[go.Pie(labels=counts.index, values=counts)])
fig.update_layout(title='Type_of_order')
fig.show()
# ---- Distribution of the 'Type_of_vehicle' category ----
# Notebook-style inspection of the distinct labels and their frequencies.
df['Type_of_vehicle'].unique()
df['Type_of_vehicle'].value_counts()

# Seaborn bar chart of the category frequencies.
plt.figure(figsize=(15, 6))
sns.countplot(data=df, x='Type_of_vehicle', palette='hls')
plt.xticks(rotation=0)
plt.show()

# The frequency table is computed once and reused by the remaining charts.
counts = df['Type_of_vehicle'].value_counts()

# Matplotlib pie chart with percentage labels.
plt.figure(figsize=(15, 6))
plt.pie(
    counts,
    labels=counts.index,
    autopct='%1.1f%%',
    colors=sns.color_palette('hls'),
)
plt.title('Type_of_vehicle')
plt.show()

# Interactive Plotly bar chart of the same frequencies.
fig = go.Figure(data=[go.Bar(x=counts.index, y=counts)])
fig.update_layout(
    title='Type_of_vehicle',
    xaxis_title="Categories",
    yaxis_title="Count",
)
fig.show()

# Interactive Plotly pie chart.
fig = go.Figure(data=[go.Pie(labels=counts.index, values=counts)])
fig.update_layout(title='Type_of_vehicle')
fig.show()
# One figure per numeric column: a 20-bin histogram with a KDE curve overlaid.
for num_col in numerical_columns:
    plt.figure(figsize=(15, 6))
    sns.histplot(df[num_col], bins=20, kde=True, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
# One figure per numeric column: a density-normalised 20-bin histogram with a
# KDE curve overlaid.
# `sns.distplot` is deprecated in seaborn; the documented replacement is
# `histplot(..., stat='density', kde=True, kde_kws=dict(cut=3))`, which is
# used here so the loop keeps working once `distplot` is removed.
for i in numerical_columns:
    plt.figure(figsize=(15, 6))
    sns.histplot(df[i], kde=True, bins=20, stat='density', kde_kws=dict(cut=3))
    plt.xticks(rotation=90)
    plt.show()
# Horizontal box plot of every numeric column, one figure each.
for num_col in numerical_columns:
    plt.figure(figsize=(15, 6))
    sns.boxplot(data=df, x=num_col, palette='hls')
    plt.xticks(rotation=90)
    plt.show()

# Horizontal violin plot of every numeric column, one figure each.
for num_col in numerical_columns:
    plt.figure(figsize=(15, 6))
    sns.violinplot(data=df, x=num_col, palette='hls')
    plt.xticks(rotation=90)
    plt.show()
import plotly.express as px
# Interactive Plotly versions of the univariate plots, one figure per numeric
# column and per chart type.

# Histograms normalised to a probability density, 20 bins.
for column in numerical_columns:
    fig = px.histogram(df, x=column, nbins=20, histnorm='probability density')
    fig.update_layout(
        title=f"Histogram of {column}",
        xaxis_title=column,
        yaxis_title="Probability Density",
    )
    fig.show()

# Box plots.
for column in numerical_columns:
    fig = px.box(df, y=column)
    fig.update_layout(
        title=f"Box Plot of {column}",
        yaxis_title=column,
    )
    fig.show()

# Violin plots.
for column in numerical_columns:
    fig = px.violin(df, y=column)
    fig.update_layout(
        title=f"Violin Plot of {column}",
        yaxis_title=column,
    )
    fig.show()
# Numeric columns broken down by the two categorical columns.

# Mean of each numeric column per order type (no error bars).
for num_col in numerical_columns:
    plt.figure(figsize=(15, 6))
    sns.barplot(data=df, x='Type_of_order', y=num_col, ci=None, palette='hls')
    plt.show()

# Mean of each numeric column per vehicle type (no error bars).
for num_col in numerical_columns:
    plt.figure(figsize=(15, 6))
    sns.barplot(data=df, x='Type_of_vehicle', y=num_col, ci=None, palette='hls')
    plt.show()

# Spread of each numeric column per order type, as box plots.
for num_col in numerical_columns:
    plt.figure(figsize=(15, 6))
    sns.boxplot(data=df, x='Type_of_order', y=num_col, palette='hls')
    plt.show()

# Spread of each numeric column per order type, as violin plots.
for num_col in numerical_columns:
    plt.figure(figsize=(15, 6))
    sns.violinplot(data=df, x='Type_of_order', y=num_col, palette='hls')
    plt.show()
# Line plot for every ordered pair of distinct numeric columns
# (y = outer column, x = inner column).
for y_col in numerical_columns:
    for x_col in numerical_columns:
        if x_col == y_col:
            continue  # a column is never plotted against itself
        plt.figure(figsize=(15, 6))
        sns.lineplot(x=df[x_col], y=df[y_col], data=df, ci=None, palette='hls')
        plt.xticks(rotation=90)
        plt.show()

# Scatter plot for the same ordered pairs.
for y_col in numerical_columns:
    for x_col in numerical_columns:
        if x_col == y_col:
            continue
        plt.figure(figsize=(15, 6))
        sns.scatterplot(x=df[x_col], y=df[y_col], palette='hls')
        plt.xticks(rotation=90)
        plt.show()
# Pairwise correlation between the numeric columns.
# `numeric_only=True` is passed explicitly: the frame still contains object
# columns at this point, and pandas >= 2.0 raises on them instead of silently
# dropping them.
df_corr = df.corr(numeric_only=True)
df_corr
plt.figure(figsize=(30, 10))
# Boolean mask hiding the upper triangle (diagonal included) so each pair is
# shown once. It is built from the matrix shape, not from the correlation
# values, so a coefficient of exactly 0 in the upper triangle is still masked.
matrix = np.triu(np.ones_like(df_corr, dtype=bool))
sns.heatmap(df_corr, annot=True, linewidth=.8, mask=matrix, cmap="rocket")
plt.show()
# Interactive Plotly rendering of the correlation matrix held in `df_corr`.
corr_trace = go.Heatmap(
    z=df_corr.values,
    x=df_corr.columns,
    y=df_corr.index,
    colorscale='Viridis',
    colorbar={'title': 'Correlation'},
)
fig = go.Figure(data=corr_trace)

# Fixed-size canvas with a white background and generic axis captions.
fig.update_layout(
    title='Correlation Heatmap',
    xaxis={'title': 'X-axis labels'},
    yaxis={'title': 'Y-axis labels'},
    width=800,
    height=400,
    plot_bgcolor='white',
)
fig.show()
df
# Extracting time components.
# The 'Time_taken(min)' values are interpreted as minute offsets and converted
# to timestamps once; hour, weekday and month are then read from the result.
# NOTE(review): these are computed from the delivery duration column, not from
# an order timestamp -- confirm that this is intended.
elapsed_as_timestamp = pd.to_datetime(df['Time_taken(min)'], unit='m')
df['hour_of_day'] = elapsed_as_timestamp.dt.hour
df['day_of_week'] = elapsed_as_timestamp.dt.dayofweek
df['month_of_year'] = elapsed_as_timestamp.dt.month
import math
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    The distance is computed with the haversine formula on a sphere of
    radius 6371 km.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Each argument may be a scalar or an array-like (list, NumPy array,
    pandas Series); array-likes are processed element-wise, so whole
    columns can be passed in a single call.

    Returns:
        The distance in kilometres: a float for scalar inputs, otherwise an
        array-like with one distance per element.
    """
    R = 6371  # Earth's radius in kilometers

    # Convert latitude and longitude from degrees to radians.
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)

    # Haversine formula.
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally outside [0, 1] (for
    # example for near-antipodal points), which would make sqrt(1 - a)
    # invalid; clamp it back into range.
    a = np.minimum(1.0, np.maximum(0.0, a))

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c
def _restaurant_to_dropoff_distance(row):
    """Return the distance between a row's restaurant and delivery coordinates."""
    return calculate_distance(
        row['Restaurant_latitude'],
        row['Restaurant_longitude'],
        row['Delivery_location_latitude'],
        row['Delivery_location_longitude'],
    )


# Add the 'distance' feature by evaluating the helper once per row.
df['distance'] = df.apply(_restaurant_to_dropoff_distance, axis=1)
# Bucket the courier's age into three labelled bands using the edges
# 0, 30, 50 and +infinity.
age_labels = ['young', 'middle-aged', 'senior']
age_bins = [0, 30, 50, float('inf')]
df['age_category'] = pd.cut(df['Delivery_person_Age'], bins=age_bins, labels=age_labels)

# Mean rating per delivery person, repeated on each of that person's rows.
ratings_by_courier = df.groupby('Delivery_person_ID')['Delivery_person_Ratings']
df['avg_ratings'] = ratings_by_courier.transform('mean')

# Replace the two categorical columns with one indicator column per category.
df = pd.get_dummies(df, columns=['Type_of_order', 'Type_of_vehicle'])

# Interaction feature: delivery time multiplied by the courier's rating.
# NOTE(review): this is computed from 'Time_taken(min)', which is used as the
# regression target further down -- confirm it should be a model input.
df['time_ratings_interaction'] = df['Time_taken(min)'].mul(df['Delivery_person_Ratings'])
df
# Identifier and raw-coordinate columns that are removed before modelling.
columns_to_drop = [
    'ID',
    'Delivery_person_ID',
    'Restaurant_latitude',
    'Restaurant_longitude',
    'Delivery_location_latitude',
    'Delivery_location_longitude',
]
df = df.drop(columns_to_drop, axis=1)

# Inspect the reduced frame.
df
df.info()
df.columns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Column the models are trained to predict.
target = 'Time_taken(min)'

# Model inputs must not contain the target or anything computed from it,
# otherwise the evaluation scores are meaningless. The following columns are
# therefore deliberately left out of the feature lists:
#   - 'Time_taken(min)'                             (the target itself)
#   - 'hour_of_day', 'day_of_week', 'month_of_year' (derived from the target)
#   - 'time_ratings_interaction'                    (target * rating)

# Continuous inputs that are standardised before fitting.
features_to_scale = ['Delivery_person_Age', 'Delivery_person_Ratings']

# Indicator columns created by `pd.get_dummies`. They are selected by prefix,
# not by hard-coded names, so that whitespace in the raw category labels
# cannot cause a missing-column error.
dummy_columns = [
    col for col in df.columns
    if col.startswith(('Type_of_order_', 'Type_of_vehicle_'))
]

# Inputs that are passed through without scaling ('age_category' is one-hot
# encoded later).
features_not_to_scale = ['distance', 'age_category', 'avg_ratings'] + dummy_columns

# Split the data into training and testing sets (75% / 25%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df[features_to_scale + features_not_to_scale],
    df[target],
    test_size=0.25,
    random_state=42,
)
# Standardise the columns listed in `features_to_scale`. The scaler is fitted
# on the training split only and then applied to the test split. Copies are
# used so X_train / X_test stay untouched.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])
X_test_scaled[features_to_scale] = scaler.transform(X_test[features_to_scale])

# One-hot encode 'age_category'; every other column passes through unchanged.
age_encoder_step = ('one_hot_encoder', OneHotEncoder(), ['age_category'])
ct = ColumnTransformer([age_encoder_step], remainder='passthrough')
X_train_scaled = ct.fit_transform(X_train_scaled)
X_test_scaled = ct.transform(X_test_scaled)
from sklearn.metrics import r2_score

# Create and train the linear regression model.
model_lr = LinearRegression()
model_lr.fit(X_train_scaled, y_train)

# Make predictions on the test set.
y_pred = model_lr.predict(X_test_scaled)

# Mean squared error, and its square root as RMSE.
# RMSE is derived from the MSE directly: the `squared=False` argument of
# `mean_squared_error` is deprecated and absent from newer scikit-learn
# releases, so it is not used here.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print('Root Mean Squared Error:', rmse)

# Coefficient of determination on the test set.
r2_lr = r2_score(y_test, y_pred)
print('R-squared Score:', r2_lr)

print('Mean Squared Error:', mse)
!pip install xgboost==1.7.6
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
# Decision tree regressor: fit on the training split, predict the test split.
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_scaled, y_train)
y_pred_dt = dt_regressor.predict(X_test_scaled)

# Score the predictions, then report R-squared followed by MSE.
r2_dt = r2_score(y_test, y_pred_dt)
mse_dt = mean_squared_error(y_test, y_pred_dt)
print('Decision Tree Regressor - R-squared Score:', r2_dt)
print('Decision Tree Regressor - Mean Squared Error:', mse_dt)
# XGBoost regressor: fit on the training split, predict the test split.
xgb_regressor = XGBRegressor(random_state=42)
xgb_regressor.fit(X_train_scaled, y_train)
y_pred_xgb = xgb_regressor.predict(X_test_scaled)

# Score the predictions, then report R-squared followed by MSE.
r2_xgb = r2_score(y_test, y_pred_xgb)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print('XGBoost Regressor - R-squared Score:', r2_xgb)
print('XGBoost Regressor - Mean Squared Error:', mse_xgb)