# Maggi Consumption Analysis - Naive Bayes Classifier

# Import relevant libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans

# Apply seaborn's default theme to every matplotlib figure below.
sns.set()

# Display the cover image of the analysis in a 10x10 inch figure.
import matplotlib.pyplot as plt
from matplotlib import image

COVER_IMAGE_PATH = "/work/question-mark-2492009_1920.jpg"

img = image.imread(COVER_IMAGE_PATH)
plt.figure(figsize=(10, 10))
plt.imshow(img)
plt.show()

# Load the raw survey responses from the CSV export.
SURVEY_CSV_PATH = "/work/Maggi-Survey .csv"
survey_data = pd.read_csv(SURVEY_CSV_PATH)

# Notebook-style preview of the frame (a bare expression has no effect when
# the file is run as a plain script).
survey_data

# Work on a copy so the raw survey frame stays untouched.
maggi_data = survey_data.copy()

# Inspect the original column names (notebook-style display).
maggi_data.columns.values

# The "Timestamp" column is not used in the analysis, so drop it.
maggi_data = maggi_data.drop(columns='Timestamp')

# Short replacement names for the survey columns. They are applied
# positionally: the first name replaces the first column of maggi_data, the
# second name the second column, and so on.
column_names = [
    'age',
    'number_of_times',
    'consumption(in pkt)',
    'competitors',
    'add_ingredients',
    'ingredients',
    'go_to_meal',
    'effect_of_lead',
    'nutritive_value',
    'rs15pack',
    'masala_magic',
    'inc_masala',
]

# A positional rename silently mislabels data if the frame has fewer columns
# than expected, so fail loudly instead of renaming only a prefix.
if len(maggi_data.columns) < len(column_names):
    raise ValueError(
        "maggi_data has %d columns but %d replacement names were supplied"
        % (len(maggi_data.columns), len(column_names))
    )

# Dictionary with the old column names as keys and the corresponding new
# names as values. zip() pairs the two sequences directly, so no manual index
# bookkeeping is needed; any columns beyond the listed names keep their
# original name.
names = dict(zip(maggi_data.columns.values, column_names))

# Rename the columns.
maggi_data = maggi_data.rename(columns=names)

# Encode the "number_of_times" answers on a 1-5 scale, going from the lowest
# to the highest consumption frequency. Answers that are not keys of the
# dictionary become NaN (standard Series.map behaviour).
times_scale = {
    "Less often": 1,
    "Once in a month": 2,
    "Once in two weeks": 3,
    "Once in a week": 4,
    "More than once in a week": 5,
}
maggi_data["number_of_times"] = maggi_data["number_of_times"].map(times_scale)

# Strip the "PKT" suffix from the "consumption(in pkt)" answers; the answer
# "More than 2 PKTS" is encoded as 3.
packet_scale = {"1 PKT": 1, "2 PKTS": 2, "More than 2 PKTS": 3}
maggi_data["consumption(in pkt)"] = maggi_data["consumption(in pkt)"].map(packet_scale)

# Bar chart showing the distribution of age groups.
age_counts = maggi_data['age'].value_counts()  # computed once, reused for x and y
plt.bar(
    age_counts.index,
    age_counts.values,
    color=['#ffed00', "#ed232a", '#023047'],
)
plt.xlabel('Age group', fontsize=13)
plt.ylabel('Frequency', fontsize=13)
plt.title('Age group of respondents', fontsize=16, fontweight="bold")
plt.show()

# Horizontal bar chart showing the top competitors of Maggi.
competitor_counts = maggi_data['competitors'].value_counts()
plt.barh(
    competitor_counts.index,
    competitor_counts.values,
    color=['#e85d04', '#023047', '#370617', '#008641', '#d00000'],
)
plt.xlabel('No. of people who prefer this brand', fontsize=13)
plt.ylabel('Brand name', fontsize=13)
plt.title('Close competitors of Maggi', fontsize=16, fontweight="bold")
plt.show()

# Count plot of the 'add_ingredients' answers, i.e. how many respondents
# gave each answer to whether they prefer Maggi with added ingredients.
add_ingredients_ax = sns.countplot(
    x='add_ingredients',
    data=maggi_data,
    palette=["#ffc922", '#ed232a'],
)
add_ingredients_ax.set_title(
    'Is Maggi prefered with added ingredients ?',
    fontdict={'fontsize': 16, 'fontweight': 'bold'},
)
# Finish the figure here, like the other charts in this file do; otherwise the
# next plotting call in a plain script would draw onto these same axes.
plt.show()

# Top 2 ingredients that people prefer with Maggi noodles.
# Each answer is a ";"-separated list of ingredients. Missing answers are
# dropped first so that they are not counted as a literal "nan" ingredient,
# and the vectorised split/explode does not depend on the frame having a
# default 0..n-1 index.
ingredients_series = (
    maggi_data["ingredients"]
    .dropna()
    .astype(str)
    .str.split(";")
    .explode()
    .reset_index(drop=True)
)

# Flat list of every ingredient mention, kept under its original name.
ingredients = ingredients_series.tolist()

print(ingredients_series.value_counts().head(2))

# Count plot showing the number of people whose consumption declined / did
# not decline after the 2015 lead controversy.
effect_of_lead_ax = sns.countplot(
    x='effect_of_lead',
    data=maggi_data,
    palette=["#ffc922", '#ed232a'],
)
effect_of_lead_ax.set_title(
    'Has consumption declined after the Lead Controversy with Maggi?',
    fontdict={'fontsize': 16, 'fontweight': 'bold'},
)
# Finish the figure here, like the other charts in this file do; otherwise the
# next plotting call in a plain script would draw onto these same axes.
plt.show()

# Build the table behind the stacked area chart: one row per distinct
# 'number_of_times' value, with the number of respondents answering 'Yes' and
# 'No' to the nutritive-value question at that consumption frequency.
how_often_levels = np.sort(maggi_data['number_of_times'].unique())
times_array = maggi_data['number_of_times'].to_numpy()
nutritive_array = maggi_data['nutritive_value'].to_numpy()

# Respondents who find nutritive value in Maggi, per frequency level.
yes_value = [
    np.count_nonzero((times_array == level) & (nutritive_array == 'Yes'))
    for level in how_often_levels
]

# Respondents who do not find nutritive value in Maggi, per frequency level.
no_value = [
    np.count_nonzero((times_array == level) & (nutritive_array == 'No'))
    for level in how_often_levels
]

# Assemble the frame with the levels and both count lists as columns.
nutritive_value = pd.DataFrame(
    {
        'How_Often': how_often_levels,
        'Yes': yes_value,
        'No': no_value,
    }
)

nutritive_value

# Stacked area chart: number of respondents at each consumption frequency,
# stacked by whether they find nutritive value in Maggi.
stack_colors = ["#e5383b", "#FFB703"]
plt.stackplot(
    nutritive_value['How_Often'],
    nutritive_value['Yes'],
    nutritive_value['No'],
    colors=stack_colors,
)
plt.legend(labels=['Yes', 'No'], loc="upper left")
plt.xlim((1, 5))  # restrict the x-axis to the 1-5 frequency scale
plt.title("Do people find Nutritive value in Maggi ?", fontsize=15, fontweight='bold')
plt.xlabel("How often does a person consume Maggi", fontsize=13)
plt.ylabel("Frequency", fontsize=13)
plt.show()

# Doughnut chart showing the percentage of people who consider Maggi their
# go-to meal.
go_to_meal_counts = maggi_data['go_to_meal'].value_counts()
colors = ["#ffed00", "#ed232a"]
plt.pie(
    go_to_meal_counts.values,
    labels=go_to_meal_counts.index,
    colors=colors,
    autopct='%1.1f%%',
)

# A white disc drawn over the centre turns the pie into a doughnut.
centre_circle = plt.Circle((0, 0), 0.50, fc="white")
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title(
    "PERCENTAGE OF PEOPLE WHO CONSIDER MAGGI AS THEIR GO TO MEAL",
    fontsize="15",
    fontweight="bold",
)
plt.show()

# Contingency table of 'inc_masala' (rows) against 'masala_magic' (columns).
data_crosstab = pd.crosstab(
    index=maggi_data['inc_masala'],
    columns=maggi_data['masala_magic'],
    margins=False,
)
print(data_crosstab)

# Chi-square test of independence between the two attributes.
from scipy.stats import chi2_contingency
from scipy.stats import chi2

stat, p, dof, expected = chi2_contingency(data_crosstab)
print(f'dof={dof:d}')
print(expected)

# Interpret via the test statistic: critical value at the 95% level.
prob = 0.95
critical = chi2.ppf(prob, dof)
print(f'probability={prob:.3f}, critical={critical:.3f}, stat={stat:.3f}')

# Interpret via the p-value: compare it against the significance level.
alpha = 1.0 - prob
print(f'significance={alpha:.3f}, p={p:.3f}')

# Count plot showing how many people want / do not want Maggi to increase its
# tastemaker size.
inc_masala_ax = sns.countplot(
    x='inc_masala',
    data=maggi_data,
    palette=["#ffed00", '#ed232a'],
)
# The title is written as one literal; the original relied on two adjacent
# string literals ('...it' 's ...') being concatenated to this same text.
inc_masala_ax.set_title(
    'Do people want Maggi to increase its tastemaker size at the cost of increased prices?',
    fontdict={'fontsize': 16, 'fontweight': 'bold'},
)
# Finish the figure here, like the other charts in this file do; otherwise the
# next plotting call in a plain script would draw onto these same axes.
plt.show()

# Import the KMeans class so we can perform k-means clustering with sklearn.
from sklearn.cluster import KMeans

maggi_data

# Data frame with the two clustering features: 'consumption(in pkt)' and
# 'number_of_times'.
cluster_df = maggi_data[['consumption(in pkt)', 'number_of_times']].copy()
cluster_df

# Scatter plot of the raw clustering input: frequency on the x-axis, packets
# per occasion on the y-axis.
x_feature = 'number_of_times'
y_feature = 'consumption(in pkt)'
plt.scatter(cluster_df[x_feature], cluster_df[y_feature])
plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.show()

# Elbow method: fit k-means for 1 to 6 clusters and record the within-cluster
# sum of squares (WCSS) of each solution.
# The candidate cluster counts are defined once so that the fitted solutions
# and the x-axis of the plot cannot drift apart.
number_clusters = range(1, 7)

# A fixed random_state makes the centroid initialisation, and therefore the
# WCSS curve, reproducible between runs.
wcss = [
    KMeans(n_clusters=k, random_state=1).fit(cluster_df).inertia_
    for k in number_clusters
]
wcss

# Plot the number of clusters vs WCSS.
plt.plot(number_clusters, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Within-cluster Sum of Squares')
plt.show()

# Cluster the data in cluster_df into 3 clusters.
# A fixed random_state makes the cluster assignment reproducible between runs.
kmeans = KMeans(n_clusters=3, random_state=1)

# Copy of 'cluster_df' with an extra column holding the predicted cluster of
# each observation. fit_predict fits the model exactly once and returns the
# labels of that same fit, so the plotted colours always belong to the model
# stored in `kmeans`.
clusters = cluster_df.copy()
clusters['cluster_pred'] = kmeans.fit_predict(cluster_df)

# Plot 'number_of_times' against 'consumption(in pkt)'.
# c takes the cluster labels (0, 1, 2), so all points of one cluster share a
# colour; cmap selects the colour map those labels are drawn from.
plt.scatter(
    clusters['number_of_times'],
    clusters['consumption(in pkt)'],
    c=clusters['cluster_pred'],
    cmap='rainbow',
    alpha=0.5,
)
plt.xlabel('number_of_times', fontsize=13)
plt.ylim(0, 4)
plt.ylabel('consumption(in pkt)', fontsize=13)
plt.title("Consumption Analysis", fontsize=16)
# Finish the figure here, like the other charts in this file do.
plt.show()

# Select the records of the lower-consumption cluster.
# k-means label numbers are arbitrary and can change between fits, so the
# cluster is identified from the data instead of a hard-coded label: it is the
# cluster whose mean 'number_of_times' plus mean 'consumption(in pkt)' is the
# smallest.
# NOTE(review): this "smallest centroid sum" rule is a heuristic for "lower
# consumption" - confirm it matches the cluster intended by the analysis.
cluster_features = ['number_of_times', 'consumption(in pkt)']
cluster_centres = clusters.groupby('cluster_pred')[cluster_features].mean()
low_cluster_label = cluster_centres.sum(axis=1).idxmin()

# .copy() gives an independent frame, so adding columns below does not write
# into a slice of `clusters`.
low_consumption = clusters[clusters['cluster_pred'] == low_cluster_label].copy()

# Attach the 'nutritive_value' and 'effect_of_lead' answers of the same
# respondents, matched by row label.
low_consumption['nutritive_value'] = maggi_data.loc[low_consumption.index, 'nutritive_value']
low_consumption['effect_of_lead'] = maggi_data.loc[low_consumption.index, 'effect_of_lead']
low_consumption

# Empirical probabilities within the low-consumption group.
low_total = low_consumption.shape[0]
declined_after_lead = low_consumption['effect_of_lead'].to_numpy() == 'Yes, It did'
no_nutritive_value = low_consumption['nutritive_value'].to_numpy() == 'No'

# P(consumption decreased after the lead controversy | consumption is low)
P_a = np.count_nonzero(declined_after_lead) / low_total
P_a

# P(finds no nutritive value in Maggi | consumption is low)
P_b = np.count_nonzero(no_nutritive_value) / low_total
P_b

# P(both of the above | consumption is low)
P_ab = np.count_nonzero(no_nutritive_value & declined_after_lead) / low_total
P_ab

# P(at least one of the above | consumption is low), by inclusion-exclusion.
P_a_union_b = P_a + P_b - P_ab
P_a_union_b

# Feature frame for the Naive Bayes classifier.
# Frequency codes 3, 4 and 5 (once in two weeks, once in a week, more than
# once in a week) are merged into the single value 3; other values are kept.
times_codes = maggi_data["number_of_times"]
merged_times = times_codes.mask(times_codes.isin([3, 4, 5]), 3)

naive_df = pd.DataFrame()
naive_df['number_of_times'] = merged_times.to_numpy()

# Add the 'consumption(in pkt)' column to the newly created frame.
naive_df["consumption(in pkt)"] = maggi_data['consumption(in pkt)']

naive_df

# Train and evaluate a Gaussian Naive Bayes classifier that predicts the
# 'rs15pack' answer from the two consumption features.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Feature matrix (X) and response (y).
X = naive_df
y = pd.DataFrame({"rs15pack": maggi_data["rs15pack"]})

# Split X and y into training and testing sets (80% / 20%, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the model on the training set. The single-column target frame is
# flattened to a 1-D array because fit() expects y of shape (n_samples,);
# passing the column vector triggers a conversion warning in scikit-learn.
gnb = GaussianNB()
gnb.fit(X_train, y_train.to_numpy().ravel())

# Predict on the testing set.
y_pred = gnb.predict(X_test)

# Compare the actual responses (y_test) with the predicted ones (y_pred).
print('Gaussian Naive Bayes model accuracy(in %):', metrics.accuracy_score(y_test, y_pred) * 100)

# Predict, with the Naive Bayes classifier, whether a person would prefer the
# Rs.15 pack of Maggi noodles given that he/she consumes 1 pack at a time
# (consumption code 1) and consumes it quite often (frequency code 3).
# The column names must match the ones the model was fitted on
# ('number_of_times', 'consumption(in pkt)'); the original used
# "number_times", which scikit-learn reports as a feature-name mismatch.
x = pd.DataFrame({"number_of_times": [3], "consumption(in pkt)": [1]})
print(gnb.predict(x))

# Profit or loss calculations (monthly).

# Frequency codes 1-5 are translated to 0, 1, 2, 4 and 8 occasions per month
# respectively.
occasions_per_month = {1: 0, 2: 1, 3: 2, 4: 4, 5: 8}
monthly_consumption = maggi_data["number_of_times"].map(occasions_per_month).to_numpy()

# Profit or loss per occasion that Maggi would bear from each category of
# consumers if it introduced the Rs.15 pack. The values equal the difference
# between the rs15 and rs12 revenue maps used below (3, -9 and -6).
per_occasion_change = {1: 3, 2: -9, 3: -6}
profit_loss = maggi_data['consumption(in pkt)'].map(per_occasion_change).to_numpy()

# The sum of the element-wise products gives the monthly profit/loss.
monthly_profit_loss = sum(monthly_consumption * profit_loss)
monthly_profit_loss

# Monthly revenue earned by Maggi with the Rs.12 pack of noodles.
rs12 = maggi_data['consumption(in pkt)'].map({1: 12, 2: 24, 3: 36}).to_numpy()
monthly_rs12_revenue = sum(monthly_consumption * rs12)
monthly_rs12_revenue

# Monthly revenue earned by Maggi with the Rs.15 pack of noodles.
rs15 = maggi_data['consumption(in pkt)'].map({1: 15, 2: 15, 3: 30}).to_numpy()
sum(monthly_consumption * rs15)

# Percentage loss relative to the Rs.12 revenue.
(monthly_profit_loss / monthly_rs12_revenue) * 100