#Importing the necessary libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
# Read the training and testing splits from their CSV files (train first).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_kOBLwZA.csv', 'test_t02dQwI.csv')
)
# --- Exploratory look at the training data --------------------------------
# Bare expressions such as `train_data.columns` only display something in an
# interactive session; print them so the summaries also appear when this file
# is executed as a script.
print(train_data.columns)
train_data.info()  # info() writes its report to stdout itself
print(train_data.describe())

# Box plots of Item_Outlet_Sales against each categorical column.
# seaborn draws onto the current Axes, so every plot gets its own figure;
# otherwise consecutive plots are overlaid on one another.
print(train_data['Item_Fat_Content'].unique())
plt.figure()
sns.boxplot(x=train_data.Item_Fat_Content, y=train_data.Item_Outlet_Sales)

plt.figure()
sns.boxplot(x=train_data.Outlet_Location_Type, y=train_data.Item_Outlet_Sales)

print(train_data['Item_Type'].unique())
plt.figure(figsize=(22, 7))  # wider canvas for the Item_Type plot
sns.boxplot(x=train_data.Item_Type, y=train_data.Item_Outlet_Sales)

plt.figure()
sns.boxplot(x=train_data.Outlet_Size, y=train_data.Item_Outlet_Sales)

plt.figure(figsize=(10, 5))
sns.boxplot(x=train_data.Outlet_Type, y=train_data.Item_Outlet_Sales)
# Count plots of the rows per category for each listed feature, coloured by
# Outlet_Location_Type, laid out in a 4x2 subplot grid.
features = ['Outlet_Type', 'Item_Type']
# plt.figure() rather than plt.subplots(): the latter also creates a
# full-figure Axes that is never drawn on.
fig = plt.figure(figsize=(15, 30))
for i, j in enumerate(features):
    # Subplot indices are 1-based.
    plt.subplot(4, 2, i + 1)
    sns.countplot(x=j, data=train_data, hue="Outlet_Location_Type")
    plt.xticks(rotation=90)  # category labels are long; rotate to avoid overlap
    plt.title("No of Items")
# Vertical spacing is loop-invariant, so apply it once to all subplots.
plt.subplots_adjust(hspace=1.0)
# Collapse the spelling variants of Item_Fat_Content into a binary flag
# (1 for the low-fat spellings, 0 for the regular spellings).
content = {'Low Fat': 1, 'Regular': 0, 'low fat': 1, 'LF': 1, 'reg': 0}
data = [train_data, test_data]
for dataset in data:
    # Series.map yields NaN for any label that is not a key of `content`.
    dataset['Item_Fat_Content'] = dataset['Item_Fat_Content'].map(content)
# Show the raw location labels before they are replaced by integers.
print(train_data['Outlet_Location_Type'].unique())

# Encode the location tier as its tier number.
location = {'Tier 1': 1, 'Tier 2': 2, 'Tier 3': 3}
# Rebuilt here so this step does not rely on an earlier `data` list.
data = [train_data, test_data]
for dataset in data:
    # Series.map yields NaN for any label that is not a key of `location`.
    dataset['Outlet_Location_Type'] = dataset['Outlet_Location_Type'].map(location)
# NOTE: OneHotEncoder is not used below (pd.get_dummies does the encoding);
# the import is kept once so the name stays available to any later code.
from sklearn.preprocessing import OneHotEncoder

# One-hot encode Item_Type on the training split and swap the original
# column for the indicator columns.
train_item_type = pd.get_dummies(train_data['Item_Type'])
train_data = train_data.drop('Item_Type', axis=1).join(train_item_type)

# Encode the test split, then align its indicator columns to the training
# ones. Encoding each split independently can yield a different set or order
# of columns whenever the splits do not contain exactly the same categories,
# which would break a model fitted on the training columns. Categories absent
# from the test split become all-zero columns; categories unseen in training
# are dropped.
one_hot = pd.get_dummies(test_data['Item_Type']).reindex(
    columns=train_item_type.columns, fill_value=0
)
test_data = test_data.drop('Item_Type', axis=1).join(one_hot)
# Show the raw Outlet_Size labels (including missing values) before cleaning.
print(train_data['Outlet_Size'].unique())

# Missing outlet sizes are imputed as "Medium" in both splits.
train_data["Outlet_Size"] = train_data["Outlet_Size"].fillna("Medium")
test_data["Outlet_Size"] = test_data["Outlet_Size"].fillna("Medium")

# Fit the label encoder on the training split only and reuse that mapping for
# the test split. Re-fitting on the test split could assign different integer
# codes to the same label if the two splits do not share the same label set.
# transform() raises ValueError on a label that was not seen during fitting.
encode = LabelEncoder()
train_data["Outlet_Size"] = encode.fit_transform(train_data["Outlet_Size"])
test_data["Outlet_Size"] = encode.transform(test_data["Outlet_Size"])

# Missing item weights are imputed with a fixed value.
# NOTE(review): 12.6 is presumably a typical (mean/median) item weight --
# confirm against the data.
train_data["Item_Weight"] = train_data["Item_Weight"].fillna(12.6)
test_data["Item_Weight"] = test_data["Item_Weight"].fillna(12.6)
# Show the raw outlet-type labels before they are replaced by integers.
print(train_data['Outlet_Type'].unique())

# Integer code for each outlet type.
# NOTE(review): Type1 -> 3 and Type2 -> 2 looks like a deliberate ordering
# rather than a typo -- confirm.
storetype = {'Supermarket Type1': 3, 'Supermarket Type2': 2, 'Grocery Store': 1, 'Supermarket Type3': 4}
data = [train_data, test_data]
for dataset in data:
    # Series.map yields NaN for any label that is not a key of `storetype`.
    dataset['Outlet_Type'] = dataset['Outlet_Type'].map(storetype)
print('\nShape of training data :', train_data.shape)
print('\nShape of testing data :', test_data.shape)

# Features are every column except the two identifiers and the target.
train_x = train_data.drop(columns=["Item_Identifier", "Outlet_Identifier", 'Item_Outlet_Sales'])
train_y = train_data['Item_Outlet_Sales']

# Hold out 25% of the labelled rows for validation; the fixed seed keeps the
# split reproducible.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.25, random_state=42)

# Fit on the training portion only. Fitting on the full train_x would let the
# model see the hold-out rows, so the "test" RMSE below would not measure
# performance on unseen data.
model = LinearRegression()
model.fit(x_train, y_train)
print('\nCoefficient of model :', model.coef_)
print('\nIntercept of model', model.intercept_)

predict_train = model.predict(x_train)
print('\nItem_Outlet_Sales on training data', predict_train)
predict_test = model.predict(x_test)
print('\nItem_Outlet_Sales on test data', predict_test)

# RMSE is the square root of the mean squared error.
rmse_train = mean_squared_error(y_train, predict_train) ** 0.5
print('\nRMSE on train dataset : ', rmse_train)
rmse_test = mean_squared_error(y_test, predict_test) ** 0.5
print('\nRMSE on test dataset : ', rmse_test)