#Importing the necessary libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
#Import the training and testing datasets
train_data = pd.read_csv('train_kOBLwZA.csv')
test_data = pd.read_csv('test_t02dQwI.csv')
train_data.columns
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Item_Identifier 8523 non-null object
1 Item_Weight 7060 non-null float64
2 Item_Fat_Content 8523 non-null object
3 Item_Visibility 8523 non-null float64
4 Item_Type 8523 non-null object
5 Item_MRP 8523 non-null float64
6 Outlet_Identifier 8523 non-null object
7 Outlet_Establishment_Year 8523 non-null int64
8 Outlet_Size 6113 non-null object
9 Outlet_Location_Type 8523 non-null object
10 Outlet_Type 8523 non-null object
11 Item_Outlet_Sales 8523 non-null float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB
train_data.describe()
Item_Weightfloat64
Item_Visibilityfloat64
count
7060
8523
mean
12.85764518
0.06613202878
std
4.643456499
0.05159782232
min
4.555
0
25%
8.77375
0.0269894775
50%
12.6
0.053930934
75%
16.85
0.0945852925
max
21.35
0.328390948
train_data['Item_Fat_Content'].unique()
sns.boxplot(x=train_data.Item_Fat_Content, y=train_data.Item_Outlet_Sales);
sns.boxplot(x=train_data.Outlet_Location_Type, y=train_data.Item_Outlet_Sales);
train_data['Item_Type'].unique()
plt.figure(figsize=(22,7))
sns.boxplot(x=train_data.Item_Type, y=train_data.Item_Outlet_Sales);
sns.boxplot(x=train_data.Outlet_Size, y=train_data.Item_Outlet_Sales);
plt.figure(figsize=(10,5))
sns.boxplot(x=train_data.Outlet_Type, y=train_data.Item_Outlet_Sales);
features= ['Outlet_Type', 'Item_Type']
fig= plt.subplots(figsize= (15,30))
for i,j in enumerate(features):
plt.subplot(4,2, i+1)
plt.subplots_adjust(hspace=1.0)
sns.countplot(x=j, data=train_data , hue= "Outlet_Location_Type")
plt.xticks(rotation= 90)
plt.title("No of Items")
content= {'Low Fat': 1, 'Regular': 0, 'low fat': 1,'LF': 1, 'reg':0 }
data = [train_data, test_data]
for dataset in data:
dataset['Item_Fat_Content'] = dataset['Item_Fat_Content'].map(content)
train_data['Outlet_Location_Type'].unique()
location= {'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3 }
data = [train_data, test_data]
for dataset in data:
dataset['Outlet_Location_Type'] = dataset['Outlet_Location_Type'].map(location)
from sklearn.preprocessing import OneHotEncoder
one_hot = pd.get_dummies(train_data['Item_Type'])
# Drop column B as it is now encoded
train_data = train_data.drop('Item_Type',axis = 1)
# Join the encoded df
train_data= train_data.join(one_hot)
from sklearn.preprocessing import OneHotEncoder
one_hot = pd.get_dummies(test_data['Item_Type'])
# Drop column B as it is now encoded
test_data = test_data.drop('Item_Type',axis = 1)
# Join the encoded df
test_data= test_data.join(one_hot)
train_data['Outlet_Size'].unique()
train_data["Outlet_Size"]= train_data["Outlet_Size"].fillna("Medium")
test_data["Outlet_Size"]= test_data["Outlet_Size"].fillna("Medium")
encode = LabelEncoder()
train_data.Outlet_Size = encode.fit_transform(train_data.Outlet_Size)
test_data.Outlet_Size = encode.fit_transform(test_data.Outlet_Size)
train_data["Item_Weight"]= train_data["Item_Weight"].fillna(12.6)
test_data["Item_Weight"]= test_data["Item_Weight"].fillna(12.6)
train_data['Outlet_Type'].unique()
storetype= {'Supermarket Type1': 3, 'Supermarket Type2': 2, 'Grocery Store': 1,'Supermarket Type3': 4}
data = [train_data, test_data]
for dataset in data:
dataset['Outlet_Type'] = dataset['Outlet_Type'].map(storetype)
print('\nShape of training data :',train_data.shape)
print('\nShape of testing data :',test_data.shape)
Shape of training data : (8523, 27)
Shape of testing data : (5681, 26)
train_x = train_data.drop(columns=["Item_Identifier","Outlet_Identifier",'Item_Outlet_Sales'],axis=1)
train_y = train_data['Item_Outlet_Sales']
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(train_x,train_y,test_size=0.25,random_state=42)
model = LinearRegression()
model.fit(train_x,train_y)
print('\nCoefficient of model :', model.coef_)
Coefficient of model : [-6.66609675e-01 -4.07285640e+01 -4.46778831e+02 1.55510829e+01
4.14450631e+00 1.43019762e+02 2.20297447e+02 9.97166104e+02
-8.56812005e+00 -4.47296288e+00 9.73966639e+00 2.33610991e+01
-4.41969673e+01 -3.36388742e+01 2.47047147e+01 -1.08067129e+01
-1.89605180e+01 -5.17002196e+01 1.79504601e+00 -2.23608986e+01
1.81882983e+02 -1.35557712e+01 -4.14547073e+01 8.23224273e+00]
print('\nIntercept of model',model.intercept_)
Intercept of model -11586.601344303795
predict_train = model.predict(train_x)
print('\nItem_Outlet_Sales on training data',predict_train)
Item_Outlet_Sales on training data [3839.59125992 234.58033182 2197.3083046 ... 1678.39189577 1058.54554048
1249.30684458]
predict_test = model.predict(x_test)
print('\nItem_Outlet_Sales on test data',predict_test)
Item_Outlet_Sales on test data [1440.49881512 755.22071397 702.17117875 ... 3480.42059035 1795.27472108
2527.6530732 ]
rmse_train = mean_squared_error(train_y,predict_train)**(0.5)
print('\nRMSE on train dataset : ', rmse_train)
RMSE on train dataset : 1141.6215017585453
rmse_test = mean_squared_error(y_test,predict_test)**(0.5)
print('\nRMSE on test dataset : ', rmse_test)
RMSE on test dataset : 1102.0805855509066