import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
train_data=pd.read_excel('/content/Data_Train.xlsx')   # reading .xlsx requires the openpyxl package
train_data.head()
train_data.info()
train_data.isnull().sum()
train_data.dropna(inplace=True)
train_data.isnull().sum()
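# dropna removes whole rows; a quick count (sketch) documents how little data was lost:
print('rows remaining after dropna:',len(train_data))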
train_data.dtypes
def change_into_datetime(col):
    train_data[col]=pd.to_datetime(train_data[col])
train_data.columns
for i in ['Date_of_Journey','Dep_Time','Arrival_Time']:
    change_into_datetime(i)
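# Optional: an explicit format makes parsing faster and unambiguous. This sketch
# assumes Date_of_Journey uses the dd/mm/YYYY layout common to this dataset;
# verify against your copy of the file before relying on it.
# train_data['Date_of_Journey']=pd.to_datetime(train_data['Date_of_Journey'],format='%d/%m/%Y')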
train_data.dtypes
train_data['Journey_day']=train_data['Date_of_Journey'].dt.day
train_data['Journey_month']=train_data['Date_of_Journey'].dt.month
train_data.head()
## Date_of_Journey has been decomposed into integer day and month features, so the original column can be dropped.
train_data.drop('Date_of_Journey', axis=1, inplace=True)
train_data.head()
def extract_hour(df,col):
    df[col+"_hour"]=df[col].dt.hour
def extract_min(df,col):
    df[col+"_minute"]=df[col].dt.minute
def drop_column(df,col):
    df.drop(col,axis=1,inplace=True)
# Departure time is when a plane leaves the gate.
# Similar to Date_of_Journey we can extract values from Dep_Time
extract_hour(train_data,'Dep_Time')
# Extracting Minutes
extract_min(train_data,'Dep_Time')
# Now we can drop Dep_Time as it is of no use
drop_column(train_data,'Dep_Time')
train_data.head()
# Arrival time is when the plane pulls up to the gate.
# Similar to Date_of_Journey we can extract values from Arrival_Time
# Extracting Hours
extract_hour(train_data,'Arrival_Time')
# Extracting minutes
extract_min(train_data,'Arrival_Time')
# Now we can drop Arrival_Time as it is of no use
drop_column(train_data,'Arrival_Time')
train_data.head()
'2h 50m'.split(' ')
duration=list(train_data['Duration'])
for i in range(len(duration)):
    if len(duration[i].split(' '))==2:
        pass
    else:
        if 'h' in duration[i]:                 # duration contains only hours
            duration[i]=duration[i]+' 0m'      # append 0 minutes
        else:
            duration[i]='0h '+duration[i]      # duration contains only minutes, so prepend 0 hours
train_data['Duration']=duration
train_data.head()
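# A vectorized alternative (sketch): after the padding above, every entry has the
# 'Xh Ym' shape that pd.to_timedelta can parse directly. Throwaway names (_td,
# _hours, _mins) are used so the split-based pipeline below stays authoritative.
_td=pd.to_timedelta(train_data['Duration'])
_hours=_td.dt.components.days*24+_td.dt.components.hours   # fold days back into hours for trips over 24h
_mins=_td.dt.components.minutes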
'2h 50m'.split(' ')[1][0:-1]
def extract_duration_hour(x):
    return x.split(' ')[0][0:-1]
def extract_duration_min(x):   # renamed from `min`, which would shadow the Python builtin
    return x.split(' ')[1][0:-1]
train_data['Duration_hours']=train_data['Duration'].apply(extract_duration_hour)
train_data['Duration_mins']=train_data['Duration'].apply(extract_duration_min)
train_data.head()
train_data.drop('Duration',axis=1,inplace=True)
train_data.head()
train_data.dtypes
train_data['Duration_hours']=train_data['Duration_hours'].astype(int)
train_data['Duration_mins']=train_data['Duration_mins'].astype(int)
train_data.dtypes
train_data.head()
train_data.dtypes
cat_col=[col for col in train_data.columns if train_data[col].dtype=='O']
cat_col
cont_col=[col for col in train_data.columns if train_data[col].dtype!='O']
cont_col
categorical=train_data[cat_col].copy()   # .copy() avoids SettingWithCopyWarning on the column assignments below
categorical.head()
categorical['Airline'].value_counts()
plt.figure(figsize=(15,5))
sns.boxplot(y='Price',x='Airline',data=train_data.sort_values('Price',ascending=False))
plt.figure(figsize=(15,5))
sns.boxplot(y='Price',x='Total_Stops',data=train_data.sort_values('Price',ascending=False))
len(categorical['Airline'].unique())
# Airline is nominal categorical data, so we one-hot encode it
Airline=pd.get_dummies(categorical['Airline'], drop_first=True)
Airline.head()
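# Deployment-minded sketch: pd.get_dummies derives its columns from whatever data
# it sees, so train and future inference frames can silently drift apart. sklearn's
# OneHotEncoder (assuming scikit-learn >= 1.0 for get_feature_names_out), fit once
# on the training data, pins the column set. airline_encoder is an illustrative name.
from sklearn.preprocessing import OneHotEncoder
airline_encoder=OneHotEncoder(drop='first')              # mirrors drop_first=True above
airline_ohe=airline_encoder.fit_transform(categorical[['Airline']]).toarray()
airline_encoder.get_feature_names_out()                  # reusable column names at inference time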
categorical['Source'].value_counts()
# Source vs Price (catplot creates its own figure, so size it via height/aspect rather than a separate plt.figure call)
sns.catplot(y='Price',x='Source',data=train_data.sort_values('Price',ascending=False),kind='boxen',height=4,aspect=3)
# Source is nominal categorical data, so we one-hot encode it
Source=pd.get_dummies(categorical['Source'], drop_first=True)
Source.head()
categorical['Destination'].value_counts()
# Destination is nominal categorical data, so we one-hot encode it
Destination=pd.get_dummies(categorical['Destination'], drop_first=True)
Destination.head()
categorical['Route']
# Split Route into up to five leg columns; later legs are NaN for shorter routes
route_split=categorical['Route'].str.split('→')
for k in range(1,6):
    categorical['Route_'+str(k)]=route_split.str[k-1]
categorical.head()
from warnings import filterwarnings
filterwarnings('ignore')
for k in range(1,6):
    col='Route_'+str(k)
    categorical[col]=categorical[col].fillna('None')   # assignment form avoids chained-assignment pitfalls of inplace fillna
categorical.head()
# Count how many categories each categorical feature has
for feature in categorical.columns:
    print('{} has {} categories \n'.format(feature,len(categorical[feature].value_counts())))
# Route_1..Route_5 each have many categories, so one-hot encoding would explode
# the feature count; label encoding is the more practical choice here.
from sklearn.preprocessing import LabelEncoder
categorical.columns
encoders={}   # keep one fitted encoder per column so each mapping can be reused or inverted later
for i in ['Route_1','Route_2','Route_3','Route_4','Route_5']:
    encoders[i]=LabelEncoder()
    categorical[i]=encoders[i].fit_transform(categorical[i])
categorical.head()
# Additional_Info is 'No info' for roughly 80% of rows, so it carries little signal and can be dropped.
# Route can be dropped too, now that it has been decomposed into Route_1..Route_5.
drop_column(categorical,'Route')
drop_column(categorical,'Additional_Info')
categorical.head()
categorical['Total_Stops'].value_counts()
categorical['Total_Stops'].unique()
# Total_Stops is ordinal, so map each label explicitly to its numeric stop count
# (a plain dict mapping, named stops_map to avoid shadowing the builtin `dict`)
stops_map={'non-stop':0,'1 stop':1,'2 stops':2,'3 stops':3,'4 stops':4}
categorical['Total_Stops']=categorical['Total_Stops'].map(stops_map)
categorical.head()
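# .map turns any label missing from stops_map into NaN, so a quick guard (sketch)
# catches typos in the mapping:
assert categorical['Total_Stops'].isnull().sum()==0, 'unmapped Total_Stops labels'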
train_data[cont_col]
# Concatenate dataframe --> categorical + Airline + Source + Destination
data_train=pd.concat([categorical,Airline,Source,Destination,train_data[cont_col]],axis=1)
data_train.head()
drop_column(data_train,'Airline')
drop_column(data_train,'Source')
drop_column(data_train,'Destination')
data_train.head()
pd.set_option('display.max_columns',35)
data_train.head()
data_train.columns
def plot(df,col):
    # subplots() opens its own figure, so size it here; a separate plt.figure call would just leave an empty stray figure
    fig,(ax1,ax2)=plt.subplots(2,1,figsize=(12,8))
    sns.histplot(df[col],kde=True,ax=ax1)   # distplot is deprecated in recent seaborn
    sns.boxplot(x=df[col],ax=ax2)
plot(data_train,'Price')
data_train['Price']=np.where(data_train['Price']>=40000,data_train['Price'].median(),data_train['Price'])
plot(data_train,'Price')
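# The 40000 cutoff above is hand-picked from the boxplot. A rule-based alternative
# (sketch, not applied here) is the usual 1.5*IQR upper fence:
q1,q3=data_train['Price'].quantile([0.25,0.75])
print('IQR upper fence:',q3+1.5*(q3-q1))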
### Separate your independent & dependent data
X=data_train.drop('Price',axis=1)
X.head()
y=data_train['Price']
y
type(X)
type(y)
X.isnull().sum()
y.isnull().sum()
#### With no missing values left in the data, we can proceed to feature selection
np.array(X)
np.array(y)
from sklearn.feature_selection import mutual_info_regression   # Price is continuous, so the regression variant is appropriate
X.dtypes
imp=pd.DataFrame(mutual_info_regression(X,y),index=X.columns)
imp
imp.columns=['importance']
imp.sort_values(by='importance',ascending=False)
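# A horizontal bar chart makes the importance ranking easier to scan (sketch):
imp.sort_values(by='importance').plot(kind='barh',figsize=(10,8),legend=False)
plt.xlabel('mutual information')
plt.show()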
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)   # random_state pins the split for reproducibility
from sklearn import metrics
## The fitted model can be pickled so it can be re-used later without retraining
import pickle
def predict(ml_model,dump):
    model=ml_model.fit(X_train,y_train)
    print('Training score : {}'.format(model.score(X_train,y_train)))
    y_prediction=model.predict(X_test)
    print('Predictions are: \n {}'.format(y_prediction))
    print('\n')
    r2_score=metrics.r2_score(y_test,y_prediction)
    print('r2 score: {}'.format(r2_score))
    print('MAE:',metrics.mean_absolute_error(y_test,y_prediction))
    print('MSE:',metrics.mean_squared_error(y_test,y_prediction))
    print('RMSE:',np.sqrt(metrics.mean_squared_error(y_test,y_prediction)))
    sns.histplot(y_test-y_prediction,kde=True)   # residual distribution; distplot is deprecated
    if dump==1:
        # persist the fitted model; `with` guarantees the file handle is closed
        with open('/content/model.pkl','wb') as file:
            pickle.dump(model,file)
from sklearn.ensemble import RandomForestRegressor
predict(RandomForestRegressor(),1)
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
predict(DecisionTreeRegressor(),0)
predict(LinearRegression(),0)
predict(KNeighborsRegressor(),0)
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators=[int(x) for x in np.linspace(start=100,stop=1200,num=6)]
# Number of features to consider at every split
max_features=[1.0,'sqrt']   # 'auto' was removed in scikit-learn 1.3; 1.0 (all features) reproduces its old regressor meaning
# Maximum number of levels in tree
max_depth=[int(x) for x in np.linspace(5,30,num=4)]
# Minimum number of samples required to split a node
min_samples_split=[5,10,15,100]
# Create the random grid
random_grid={
    'n_estimators':n_estimators,
    'max_features':max_features,
    'max_depth':max_depth,
    'min_samples_split':min_samples_split
}
random_grid
### initialise your estimator
reg_rf=RandomForestRegressor()
# Randomized search over the grid with 3-fold cross-validation (n_iter defaults to 10 sampled configurations)
rf_random=RandomizedSearchCV(estimator=reg_rf,param_distributions=random_grid,cv=3,verbose=2,n_jobs=-1)
rf_random.fit(X_train,y_train)
rf_random.best_params_
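# RandomizedSearchCV refits the winning configuration on the whole training split
# by default (refit=True), so rf_random.predict below already uses the tuned model;
# the underlying forest is exposed as rf_random.best_estimator_.
best_rf=rf_random.best_estimator_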
prediction=rf_random.predict(X_test)
prediction
sns.histplot(y_test-prediction,kde=True)   # residuals of the tuned model; distplot is deprecated
metrics.r2_score(y_test,prediction)
print('MAE',metrics.mean_absolute_error(y_test,prediction))
print('MSE',metrics.mean_squared_error(y_test,prediction))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test,prediction)))
import pickle   # pickle ships with the standard library, so no pip install is needed
# Persist the tuned search object (its predict proxies to the refit best estimator)
with open('rf_random.pkl','wb') as file:
    pickle.dump(rf_random,file)
# Reload it to confirm the round trip works
with open('rf_random.pkl','rb') as model_file:
    forest=pickle.load(model_file)
y_prediction=forest.predict(X_test)
y_prediction
metrics.r2_score(y_test,y_prediction)
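# Serving sketch: one prediction just needs a one-row frame with the same columns
# (and order) as X. Reusing a row of X_test keeps this runnable as-is:
single=X_test.iloc[[0]]
print('predicted fare:',forest.predict(single)[0])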