import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the flight-fare training spreadsheet and take a first look at it.
train_data=pd.read_excel('/content/Data_Train.xlsx')
train_data.head()
# Per the info() output below: every feature column arrives as object dtype;
# only Price is numeric (int64), and Route/Total_Stops each have one null.
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10683 entries, 0 to 10682
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Airline 10683 non-null object
1 Date_of_Journey 10683 non-null object
2 Source 10683 non-null object
3 Destination 10683 non-null object
4 Route 10682 non-null object
5 Dep_Time 10683 non-null object
6 Arrival_Time 10683 non-null object
7 Duration 10683 non-null object
8 Total_Stops 10682 non-null object
9 Additional_Info 10683 non-null object
10 Price 10683 non-null int64
dtypes: int64(1), object(10)
memory usage: 918.2+ KB
# Count missing values per column (Route and Total_Stops each have one).
train_data.isnull().sum()
# Only a couple of rows are affected, so dropping them is the cheap fix.
train_data.dropna(inplace=True)
# Verify nothing is missing any more.
train_data.isnull().sum()
train_data.dtypes
def change_into_datetime(col):
    """Convert column *col* of the global train_data to datetime dtype in place."""
    train_data[col]=pd.to_datetime(train_data[col])
train_data.columns
# Parse the three date/time-bearing object columns into real datetimes.
for i in ['Date_of_Journey','Dep_Time', 'Arrival_Time']:
    change_into_datetime(i)
train_data.dtypes
# Pull day and month out of the journey date as integer features.
train_data['Journey_day']=train_data['Date_of_Journey'].dt.day
train_data['Journey_month']=train_data['Date_of_Journey'].dt.month
train_data.head()
## Date_of_Journey has been decomposed into integer columns, so the original is redundant.
train_data.drop('Date_of_Journey', axis=1, inplace=True)
train_data.head()
def extract_hour(df, col):
    """Append a '<col>_hour' column holding the hour part of datetime column *col*."""
    hours = df[col].dt.hour
    df["{}_hour".format(col)] = hours
def extract_min(df, col):
    """Append a '<col>_minute' column holding the minute part of datetime column *col*."""
    minutes = df[col].dt.minute
    df["{}_minute".format(col)] = minutes
def drop_column(df, col):
    """Remove column(s) *col* from *df* in place."""
    df.drop(columns=col, inplace=True)
# Departure time is when a plane leaves the gate.
# Similar to Date_of_Journey we can extract values from Dep_Time
# Extracting hours
extract_hour(train_data,'Dep_Time')
# Extracting Minutes
extract_min(train_data,'Dep_Time')
# The raw Dep_Time column is now redundant.
drop_column(train_data,'Dep_Time')
train_data.head()
# Arrival time is when the plane pulls up to the gate.
# Similar to Date_of_Journey we can extract values from Arrival_Time
# Extracting Hours
extract_hour(train_data,'Arrival_Time')
# Extracting minutes
extract_min(train_data,'Arrival_Time')
# The raw Arrival_Time column is now redundant.
drop_column(train_data,'Arrival_Time')
train_data.head()
# Example: '2h 50m'.split(' ') -> ['2h', '50m']
def _normalise_duration(value):
    """Pad a raw duration string to the canonical 'Xh Ym' form.

    A value already containing both parts is returned unchanged; an hour-only
    value gets ' 0m' appended; a minute-only value gets '0h ' prepended.
    (The original loop's comment called the minute case "second" by mistake.)
    """
    if len(value.split(' ')) == 2:
        return value
    return value + ' 0m' if 'h' in value else '0h ' + value

# Apply elementwise instead of mutating a list via range(len(...)).
train_data['Duration'] = train_data['Duration'].apply(_normalise_duration)
train_data.head()
# Example: '2h 50m'.split(' ')[1][0:-1] -> '50'
def _duration_hours(x):
    """Return the hour count (as a string) from an 'Xh Ym' duration string."""
    return x.split(' ')[0][0:-1]

def _duration_minutes(x):
    """Return the minute count (as a string) from an 'Xh Ym' duration string.

    Renamed from the original `min`, which shadowed the Python builtin.
    """
    return x.split(' ')[1][0:-1]

train_data['Duration_hours'] = train_data['Duration'].apply(_duration_hours)
train_data['Duration_mins'] = train_data['Duration'].apply(_duration_minutes)
train_data.head()
# Duration is fully decomposed; drop the raw string column.
train_data.drop('Duration', axis=1, inplace=True)
train_data.head()
train_data.dtypes
# The string split produced object columns; cast the duration parts to integers.
train_data['Duration_hours']=train_data['Duration_hours'].astype(int)
train_data['Duration_mins']=train_data['Duration_mins'].astype(int)
train_data.dtypes
train_data.head()
train_data.dtypes
# Partition columns by dtype: object -> categorical, everything else -> continuous.
cat_col=[col for col in train_data.columns if train_data[col].dtype=='O']
cat_col
cont_col=[col for col in train_data.columns if train_data[col].dtype!='O']
cont_col
# .copy() makes an independent frame: the original slice was a view, so every
# later column assignment raised SettingWithCopyWarning (visible in the run log).
categorical=train_data[cat_col].copy()
categorical.head()
categorical['Airline'].value_counts()
# Airline vs Price
plt.figure(figsize=(15,5))
sns.boxplot(y='Price',x='Airline',data=train_data.sort_values('Price',ascending=False))
# Total_Stops vs Price
plt.figure(figsize=(15,5))
sns.boxplot(y='Price',x='Total_Stops',data=train_data.sort_values('Price',ascending=False))
len(categorical['Airline'].unique())
# Airline is nominal categorical data, so one-hot encode it (drop_first avoids collinearity).
Airline=pd.get_dummies(categorical['Airline'], drop_first=True)
Airline.head()
categorical['Source'].value_counts()
# Source vs Price
plt.figure(figsize=(15,5))
sns.catplot(y='Price',x='Source',data=train_data.sort_values('Price',ascending=False),kind='boxen')
# Source is nominal categorical data, so one-hot encode it.
Source=pd.get_dummies(categorical['Source'], drop_first=True)
Source.head()
categorical['Destination'].value_counts()
# Destination is nominal categorical data, so one-hot encode it.
Destination=pd.get_dummies(categorical['Destination'], drop_first=True)
Destination.head()
categorical['Route']
# Split Route ONCE and fan the legs into Route_1..Route_5; the original
# recomputed str.split('→') for every one of the five columns.
route_legs = categorical['Route'].str.split('→')
for leg in range(1, 6):
    categorical['Route_{}'.format(leg)] = route_legs.str[leg - 1]
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
This is separate from the ipykernel package so we can avoid doing imports until
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:4: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
after removing the cwd from sys.path.
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""
categorical.head()
import warnings
from warnings import filterwarnings
# NOTE(review): this silences ALL warnings, not just SettingWithCopyWarning --
# consider warnings.simplefilter with a specific category instead.
filterwarnings('ignore')
# Routes with fewer than five legs leave NaN in the trailing Route_* columns;
# mark those with a literal 'None' category. (One loop replaces the five
# duplicated fillna lines of the original.)
for leg_col in ['Route_1', 'Route_2', 'Route_3', 'Route_4', 'Route_5']:
    categorical[leg_col].fillna('None', inplace=True)
categorical.head()
# Show how many distinct categories each categorical feature has.
for feature in categorical.columns:
    print('{} has total {} categories \n'.format(feature,len(categorical[feature].value_counts())))
Airline has total 12 categories
Source has total 5 categories
Destination has total 6 categories
Route has total 128 categories
Total_Stops has total 5 categories
Additional_Info has total 10 categories
Route_1 has total 5 categories
Route_2 has total 45 categories
Route_3 has total 30 categories
Route_4 has total 14 categories
Route_5 has total 6 categories
# Route has far too many categories (128) for one-hot encoding,
# so label-encode the per-leg Route_* columns instead.
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
categorical.columns
for i in ['Route_1', 'Route_2', 'Route_3', 'Route_4','Route_5']:
    categorical[i]=encoder.fit_transform(categorical[i])
categorical.head()
# Additional_Info is almost 80% 'No info' and Route is already expanded
# into Route_1..Route_5, so drop both.
drop_column(categorical,'Route')
drop_column(categorical,'Additional_Info')
categorical.head()
categorical['Total_Stops'].value_counts()
categorical['Total_Stops'].unique()
# Total_Stops is ORDINAL: map each label to its stop count.
# (Renamed from `dict`, which shadowed the Python builtin.)
stops_mapping={'non-stop':0, '2 stops':2, '1 stop':1, '3 stops':3, '4 stops':4}
categorical['Total_Stops']=categorical['Total_Stops'].map(stops_mapping)
categorical.head()
train_data[cont_col]
# Assemble the model matrix: encoded categoricals + one-hot frames + numeric columns.
data_train=pd.concat([categorical,Airline,Source,Destination,train_data[cont_col]],axis=1)
data_train.head()
# The raw label columns are superseded by their one-hot encodings; drop them.
drop_column(data_train,'Airline')
drop_column(data_train,'Source')
drop_column(data_train,'Destination')
data_train.head()
# Widen the display so every feature column is visible.
pd.set_option('display.max_columns',35)
data_train.head()
data_train.columns
def plot(df,col):
    """Draw a distribution plot and a boxplot of df[col] on two stacked axes."""
    fig,(ax1,ax2)=plt.subplots(2,1)
    # NOTE(review): sns.distplot is deprecated in newer seaborn (histplot/displot
    # replace it) -- confirm against the installed seaborn version.
    sns.distplot(df[col],ax=ax1)
    sns.boxplot(df[col],ax=ax2)
plt.figure(figsize=(30,20))
plot(data_train,'Price')
# Clip extreme fares: any price >= 40000 is replaced by the median price.
data_train['Price']=np.where(data_train['Price']>=40000,data_train['Price'].median(),data_train['Price'])
plt.figure(figsize=(30,20))
plot(data_train,'Price')
### Separate your independent & dependent data
# X = all features, y = the Price target.
X=data_train.drop('Price',axis=1)
X.head()
y=data_train['Price']
y
type(X)
type(y)
# Confirm there are no missing values before feature selection.
X.isnull().sum()
y.isnull().sum()
#### As now we dont have any missing value in data, we can definitely go ahead with Feature Selection
np.array(X)
np.array(y)
# Price is a CONTINUOUS target, so use the regression variant of mutual
# information. The original called mutual_info_classif, which treats every
# distinct price as a separate class label and is inappropriate here.
from sklearn.feature_selection import mutual_info_regression
X.dtypes
# Compute once (the original ran the expensive estimator twice).
imp=pd.DataFrame(mutual_info_regression(X,y),index=X.columns)
imp
imp.columns=['importance']
imp.sort_values(by='importance',ascending=False)
from sklearn.model_selection import train_test_split
# 80/20 split; a fixed random_state makes the split -- and every model score
# reported below -- reproducible from run to run.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
from sklearn import metrics
## Dump your model using pickle so that we will re-use
import pickle
def predict(ml_model, dump):
    """Fit *ml_model* on the global train split and report test-set metrics.

    Parameters
    ----------
    ml_model : estimator with fit/score/predict (any sklearn regressor)
    dump : int
        1 to pickle the fitted model to /content/model.pkl, anything else to skip.

    Uses the globals X_train, X_test, y_train, y_test, metrics, np, sns, pickle.
    """
    model = ml_model.fit(X_train, y_train)
    print('Training score : {}'.format(model.score(X_train, y_train)))
    y_prediction = model.predict(X_test)
    print('Predictions are: \n {}'.format(y_prediction))
    print('\n')
    r2_score = metrics.r2_score(y_test, y_prediction)
    print('r2 score: {}'.format(r2_score))
    print('MAE:', metrics.mean_absolute_error(y_test, y_prediction))
    # Compute MSE once and reuse it for RMSE (the original computed it twice).
    mse = metrics.mean_squared_error(y_test, y_prediction)
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))
    # Residual distribution: roughly centred on zero for a good fit.
    sns.distplot(y_test - y_prediction)
    if dump == 1:
        # 'with' guarantees the handle is closed; the original leaked the
        # open file object.
        with open('/content/model.pkl', 'wb') as file:
            pickle.dump(model, file)
from sklearn.ensemble import RandomForestRegressor
# Baseline random forest; dump=1 also pickles the fitted model.
predict(RandomForestRegressor(),1)
Training score : 0.9541355364911919
predictions are:
[ 7509.48066667 12169.916 6607.18 ... 18514.28
10687.06 4664.06666667]
r2 score: 0.8366566934211139
MAE: 1106.0540200206258
MSE: 3078531.7000667527
RMSE: 1754.5745068439678
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# Decision-tree baseline (no pickling).
predict(DecisionTreeRegressor(),0)
Training score : 0.9685134197428378
predictions are:
[11622. 13044. 8016. ... 3858. 8937. 6282.]
r2 score: 0.6741394214992276
MAE: 1408.7951099672437
MSE: 6130556.503440051
RMSE: 2475.996062888641
# Linear-regression baseline (no pickling).
predict(LinearRegression(),0)
Training score : 0.6123308820481479
predictions are:
[11459.38699757 8624.53776838 8374.76191723 ... 13868.64141181
14198.74222518 5749.65714006]
r2 score: 0.6292288880783733
MAE: 1916.0720869486843
MSE: 6987924.056554319
RMSE: 2643.468187165172
# K-nearest-neighbours baseline (no pickling).
predict(KNeighborsRegressor(),0)
Training score : 0.7793175474666985
predictions are:
[ 8115.2 14882.4 6124.6 ... 19609.6 9550. 4326.4]
r2 score: 0.6687918257027363
MAE: 1688.8693495554514
MSE: 6242281.274029012
RMSE: 2498.455777881412
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators=[int(x) for x in np.linspace(start=100,stop=1200,num=6)]
# Number of features to consider at every split
# NOTE(review): 'auto' was removed from RandomForestRegressor in newer
# scikit-learn (>=1.3) -- confirm the installed version before upgrading.
max_features=['auto','sqrt']
# Maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(5,30,num=4)]
# Minimum number of samples required to split a node
min_samples_split=[5,10,15,100]
# Create the random grid of hyperparameter candidates.
random_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split
}
random_grid
### initialise your estimator
reg_rf=RandomForestRegressor()
# Random search of parameters, using 3-fold cross validation, the default
# 10 sampled candidates (n_iter), and all CPU cores (n_jobs=-1).
rf_random=RandomizedSearchCV(estimator=reg_rf,param_distributions=random_grid,cv=3,verbose=2,n_jobs=-1)
rf_random.fit(X_train,y_train)
Fitting 3 folds for each of 10 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 30 out of 30 | elapsed: 2.6min finished
rf_random.best_params_
# Score the tuned search's best model on the held-out test split.
prediction=rf_random.predict(X_test)
prediction
# Residual distribution: roughly centred on zero for a good fit.
sns.distplot(y_test-prediction)
metrics.r2_score(y_test,prediction)
print('MAE',metrics.mean_absolute_error(y_test,prediction))
print('MSE',metrics.mean_squared_error(y_test,prediction))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test,prediction)))
MAE 1053.4256940985942
MSE 2578035.114975659
RMSE 1605.626081930553
# pickle ships with the Python standard library, so the original
# '!pip install pickle' shell command is unnecessary (and would fail: the
# PyPI package of that name is Python-2 only).
import pickle
# Persist the tuned search object, then reload it and sanity-check its score.
# 'with' blocks close both file handles; the original left them open.
with open('rf_random.pkl','wb') as file:
    # Dump information to that file
    pickle.dump(rf_random,file)
with open('rf_random.pkl','rb') as model:
    forest=pickle.load(model)
y_prediction=forest.predict(X_test)
y_prediction
metrics.r2_score(y_test,y_prediction)