#import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
from matplotlib import image
# Read the banner image from disk and display it on a 10x10-inch figure.
banner_path = "/work/question-mark-2492009_1920.jpg"
img = image.imread(banner_path)
plt.figure(figsize=(10, 10))
plt.imshow(img)
plt.show()
# Read the raw survey responses from the CSV file.
survey_data = pd.read_csv("/work/Maggi-Survey .csv")
survey_data
# Work on a copy so the raw responses in `survey_data` stay untouched.
maggi_data = survey_data.copy()
# Display the column names of the working copy.
maggi_data.columns.values
# Remove the "Timestamp" column from the working copy.
maggi_data = maggi_data.drop(columns='Timestamp')
# Short replacement names for the survey columns.
# NOTE(review): assumed to be listed in the same order as the columns of
# `maggi_data` -- confirm against the CSV header.
column_names = [
    'age',
    'number_of_times',
    'consumption(in pkt)',
    'competitors',
    'add_ingredients',
    'ingredients',
    'go_to_meal',
    'effect_of_lead',
    'nutritive_value',
    'rs15pack',
    'masala_magic',
    'inc_masala',
]
# Map each existing column name to its replacement by position. zip() stops at
# the shorter of the two sequences, so columns beyond the listed names keep
# their original names.
names = dict(zip(maggi_data.columns, column_names))
# Rename the columns.
maggi_data = maggi_data.rename(columns=names)
# Encode "number_of_times" on a 1-5 scale, from lowest to highest consumption
# frequency. Answers that are not listed in the mapping become NaN.
frequency_codes = {
    "Less often": 1,
    "Once in a month": 2,
    "Once in two weeks": 3,
    "Once in a week": 4,
    "More than once in a week": 5,
}
maggi_data["number_of_times"] = maggi_data["number_of_times"].map(frequency_codes)
# Encode "consumption(in pkt)" as a number of packets; "More than 2 PKTS" is
# recorded as 3.
packet_codes = {"1 PKT": 1, "2 PKTS": 2, "More than 2 PKTS": 3}
maggi_data["consumption(in pkt)"] = maggi_data["consumption(in pkt)"].map(packet_codes)
# Bar chart showing the distribution of age groups.
age_counts = maggi_data['age'].value_counts()
plt.bar(age_counts.index, age_counts.values, color=['#ffed00', "#ed232a", '#023047'])
plt.title('Age group of respondents', fontsize=16, fontweight="bold")
plt.xlabel('Age group', fontsize=13)
plt.ylabel('Frequency', fontsize=13)
plt.show()
# Horizontal bar chart showing the top competitors of Maggi.
competitor_counts = maggi_data['competitors'].value_counts()
competitor_colors = ['#e85d04', '#023047', '#370617', '#008641', '#d00000']
plt.barh(competitor_counts.index, competitor_counts.values, color=competitor_colors)
plt.title('Close competitors of Maggi', fontsize=16, fontweight="bold")
plt.xlabel('No. of people who prefer this brand', fontsize=13)
plt.ylabel('Brand name', fontsize=13)
plt.show()
# Count plot of the 'add_ingredients' responses (is Maggi preferred with added
# ingredients or not).
add_ingredients_ax = sns.countplot(
    x=maggi_data['add_ingredients'],
    data=maggi_data,
    palette=["#ffc922", '#ed232a'],
)
add_ingredients_ax.set_title(
    'Is Maggi prefered with added ingredients ?',
    fontdict={'fontsize': 16, 'fontweight': 'bold'},
)
# Show the figure now so that later plots are not drawn onto the same axes
# when this file is run as a plain script.
plt.show()
# Top 2 ingredients that people prefer with Maggi noodles.
# Each response lists its ingredients separated by ";". Missing responses are
# skipped (str() would otherwise turn them into a bogus ingredient named
# "nan"), surrounding whitespace is trimmed, and blank pieces left by stray
# separators are dropped. Iterating the values directly also avoids relying on
# the frame having a 0..n-1 index.
ingredients = [
    item.strip()
    for response in maggi_data["ingredients"].dropna()
    for item in str(response).split(";")
    if item.strip()
]
ingredients_series = pd.Series(ingredients)
print(ingredients_series.value_counts().head(2))
# Count plot of the 'effect_of_lead' responses: whether consumption declined
# after the 2015 lead controversy.
effect_of_lead_ax = sns.countplot(
    x=maggi_data['effect_of_lead'],
    data=maggi_data,
    palette=["#ffc922", '#ed232a'],
)
effect_of_lead_ax.set_title(
    'Has consumption declined after the Lead Controversy with Maggi?',
    fontdict={'fontsize': 16, 'fontweight': 'bold'},
)
# Show the figure now so that later plots are not drawn onto the same axes
# when this file is run as a plain script.
plt.show()
# Build the table behind the stacked area chart: one row per distinct
# 'number_of_times' value, with the number of respondents at that value who
# answered 'Yes' and 'No' to the nutritive-value question.
frequency_levels = np.sort(maggi_data['number_of_times'].unique())
frequency = maggi_data['number_of_times'].to_numpy()
nutrition_answer = maggi_data['nutritive_value'].to_numpy()

# Respondents who find nutritive value in Maggi, per frequency level.
yes_value = [
    int(np.count_nonzero((frequency == level) & (nutrition_answer == 'Yes')))
    for level in frequency_levels
]
# Respondents who do not find nutritive value in Maggi, per frequency level.
no_value = [
    int(np.count_nonzero((frequency == level) & (nutrition_answer == 'No')))
    for level in frequency_levels
]

# Assemble the three columns into the "nutritive_value" dataframe.
nutritive_value = pd.DataFrame(
    {'How_Often': frequency_levels, 'Yes': yes_value, 'No': no_value}
)
nutritive_value
# Stacked area chart: number of respondents at each consumption frequency,
# stacked by whether or not they find nutritive value in Maggi.
how_often = nutritive_value['How_Often']
yes_counts = nutritive_value['Yes']
no_counts = nutritive_value['No']
plt.stackplot(how_often, yes_counts, no_counts, colors=["#e5383b", "#FFB703"])
plt.legend(labels=['Yes', 'No'], loc="upper left")
plt.xlim(1, 5)
plt.xlabel("How often does a person consume Maggi", fontsize=13)
plt.ylabel("Frequency", fontsize=13)
plt.title("Do people find Nutritive value in Maggi ?", fontsize=15, fontweight='bold')
plt.show()
# Doughnut chart showing the percentage of each 'go_to_meal' answer.
colors = ["#ffed00", "#ed232a"]
go_to_meal_counts = maggi_data['go_to_meal'].value_counts()
plt.pie(
    go_to_meal_counts.values,
    labels=go_to_meal_counts.index,
    colors=colors,
    autopct='%1.1f%%',
)
# A white disc drawn over the centre of the pie turns it into a doughnut.
centre_circle = plt.Circle((0, 0), 0.50, fc="white")
fig = plt.gcf()
doughnut_axes = fig.gca()
doughnut_axes.add_artist(centre_circle)
plt.title("PERCENTAGE OF PEOPLE WHO CONSIDER MAGGI AS THEIR GO TO MEAL", fontsize="15", fontweight="bold")
plt.show()
# Chi-square test of independence between 'inc_masala' and 'masala_magic'.
from scipy.stats import chi2, chi2_contingency

# Confidence level used for the critical value, and the matching
# significance level used for the p-value.
prob = 0.95
alpha = 1.0 - prob

# Contingency table of the two attributes (without row/column totals).
data_crosstab = pd.crosstab(
    maggi_data['inc_masala'],
    maggi_data['masala_magic'],
    margins=False,
)
print(data_crosstab)

# Run the test and report the degrees of freedom and expected frequencies.
stat, p, dof, expected = chi2_contingency(data_crosstab)
print('dof=%d' % dof)
print(expected)

# Report the test statistic next to the critical value.
critical = chi2.ppf(prob, dof)
print('probability=%.3f, critical=%.3f, stat=%.3f' % (prob, critical, stat))

# Report the p-value next to the significance level.
print('significance=%.3f, p=%.3f' % (alpha, p))
# Count plot of the 'inc_masala' responses: whether people want Maggi to
# increase its tastemaker size at the cost of increased prices.
inc_masala_ax = sns.countplot(
    x=maggi_data['inc_masala'],
    data=maggi_data,
    palette=["#ffed00", '#ed232a'],
)
inc_masala_ax.set_title(
    'Do people want Maggi to increase its tastemaker size at the cost of increased prices?',
    fontdict={'fontsize': 16, 'fontweight': 'bold'},
)
# Show the figure now so that later plots are not drawn onto the same axes
# when this file is run as a plain script.
plt.show()
# Import the KMeans class so we can perform k-means clustering with sklearn.
from sklearn.cluster import KMeans
maggi_data
# Dataframe holding only the two features used for clustering:
# 'consumption(in pkt)' and 'number_of_times'.
cluster_features = ['consumption(in pkt)', 'number_of_times']
cluster_df = maggi_data[cluster_features].copy()
cluster_df
# Scatter plot of the two clustering features.
plt.scatter(cluster_df['number_of_times'], cluster_df['consumption(in pkt)'])
plt.ylabel('consumption(in pkt)')
plt.xlabel('number_of_times')
plt.show()
# Elbow method: fit k-means for 1 to 6 clusters and record the within-cluster
# sum of squares (the fitted model's inertia_) of each solution.
number_clusters = range(1, 7)
wcss = [KMeans(k).fit(cluster_df).inertia_ for k in number_clusters]
wcss
# Plot the number of clusters against WCSS.
plt.plot(number_clusters, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Within-cluster Sum of Squares')
plt.show()
# Cluster the observations in cluster_df into 3 clusters.
# NOTE(review): no random_state is passed, so which cluster receives label
# 0, 1 or 2 can differ from run to run.
kmeans = KMeans(3)
# Work on a copy so cluster_df itself keeps only the two feature columns.
clusters = cluster_df.copy()
# fit_predict() fits the model and returns the cluster label of every row in a
# single step; a separate fit() call beforehand would only repeat the fitting.
clusters['cluster_pred'] = kmeans.fit_predict(cluster_df)
# Scatter plot of 'number_of_times' against 'consumption(in pkt)'.
# c takes the predicted cluster labels (0, 1, 2), so all points of one cluster
# share a colour; cmap selects the colour map those labels are drawn from.
plt.scatter(
    clusters['number_of_times'],
    clusters['consumption(in pkt)'],
    c=clusters['cluster_pred'],
    cmap='rainbow',
    alpha=0.5,
)
plt.xlabel('number_of_times', fontsize=13)
plt.ylim(0, 4)
plt.ylabel('consumption(in pkt)', fontsize=13)
plt.title("Consumption Analysis", fontsize=16)
# Display the figure; without this call the plot is never shown when the file
# is run as a plain script.
plt.show()
# Select the records of the lower-consumption cluster.
# KMeans numbers its clusters arbitrarily, so a fixed label cannot be relied on
# to mean "low consumption". Instead, the cluster whose members have the
# smallest average 'number_of_times' + 'consumption(in pkt)' is used.
cluster_means = clusters.groupby('cluster_pred')[['number_of_times', 'consumption(in pkt)']].mean()
low_consumption_label = cluster_means.sum(axis=1).idxmin()
# .copy() makes low_consumption an independent frame, so adding columns below
# does not trigger pandas' SettingWithCopyWarning.
low_consumption = clusters[clusters['cluster_pred'] == low_consumption_label].copy()
# Attach the 'nutritive_value' and 'effect_of_lead' responses of the same
# respondents, matched on the row index.
low_consumption['nutritive_value'] = maggi_data.loc[low_consumption.index, 'nutritive_value']
low_consumption['effect_of_lead'] = maggi_data.loc[low_consumption.index, 'effect_of_lead']
low_consumption
# All probabilities below are conditional on the respondent belonging to the
# low-consumption group, i.e. they are shares of the rows of low_consumption.
low_count = low_consumption.shape[0]
declined_after_lead = low_consumption['effect_of_lead'].to_numpy() == 'Yes, It did'
no_nutritive_value = low_consumption['nutritive_value'].to_numpy() == 'No'

# Probability that consumption decreased after the lead controversy.
P_a = int(np.count_nonzero(declined_after_lead)) / low_count
P_a
# Probability that the person does not find any nutritive value in Maggi.
P_b = int(np.count_nonzero(no_nutritive_value)) / low_count
P_b
# Probability that consumption decreased after the lead controversy AND the
# person does not find any nutritive value in Maggi.
P_ab = int(np.count_nonzero(no_nutritive_value & declined_after_lead)) / low_count
P_ab
# Probability that consumption decreased after the lead controversy OR the
# person does not find any nutritive value in Maggi (inclusion-exclusion).
P_a_union_b = P_a + P_b - P_ab
P_a_union_b
# Feature table for the naive Bayes model.
# Frequency codes 3, 4 and 5 (once in two weeks, once in a week, more than once
# in a week) are merged into the single value 3; other values are kept as is.
frequency = maggi_data["number_of_times"]
merged_frequency = np.where(frequency.isin([3, 4, 5]), 3, frequency)
naive_df = pd.DataFrame({'number_of_times': merged_frequency})
# Add the 'consumption(in pkt)' column to the newly created dataframe.
naive_df["consumption(in pkt)"] = maggi_data['consumption(in pkt)']
naive_df
# Store the feature matrix (X) and the response (y).
X = naive_df
y = pd.DataFrame({"rs15pack": maggi_data["rs15pack"]})
# Split X and y into training and testing sets (20% held out for testing;
# random_state fixes the split so it is the same on every run).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Train the model on the training set.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# scikit-learn expects a 1-D target; passing the single column as a Series
# (rather than a one-column dataframe) avoids a DataConversionWarning.
gnb.fit(X_train, y_train["rs15pack"])
# Make predictions on the testing set.
y_pred = gnb.predict(X_test)
# Compare the actual response values (y_test) with the predicted ones (y_pred).
from sklearn import metrics
print('Gaussian Naive Bayes model accuracy(in %):', metrics.accuracy_score(y_test, y_pred)*100)
# Use the naive Bayes classifier to predict whether a person would prefer the
# Rs.15 pack of Maggi noodles, given that he/she consumes 1 pack at a time and
# consumes it quite often (frequency code 3).
# The column names must match the ones the model was fitted on
# ("number_of_times", "consumption(in pkt)"); scikit-learn validates feature
# names at predict time and rejects a frame whose names differ.
x = pd.DataFrame({"number_of_times": [3], "consumption(in pkt)": [1]})
print(gnb.predict(x))
# Profit or loss calculations (monthly).
# Each frequency code (1-5) is assigned 0, 1, 2, 4 or 8 occasions per month
# respectively.
occasions_per_month = {1: 0, 2: 1, 3: 2, 4: 4, 5: 8}
monthly_consumption = maggi_data["number_of_times"].map(occasions_per_month).to_numpy()
packets = maggi_data['consumption(in pkt)']

# Profit or loss Maggi would bear per occasion from each category of consumers
# if it introduced the Rs.15 pack.
profit_loss = packets.map({1: 3, 2: -9, 3: -6}).to_numpy()
# The sum of the element-wise products of the two arrays is the monthly
# profit/loss.
monthly_profit_loss = sum(monthly_consumption * profit_loss)
monthly_profit_loss

# Monthly revenue earned by Maggi from the Rs.12 pack of noodles.
rs12 = packets.map({1: 12, 2: 24, 3: 36}).to_numpy()
monthly_revenue_rs12 = sum(monthly_consumption * rs12)
monthly_revenue_rs12

# Monthly revenue earned by Maggi from the Rs.15 pack of noodles.
rs15 = packets.map({1: 15, 2: 15, 3: 30}).to_numpy()
sum(monthly_consumption * rs15)

# Percentage loss: monthly profit/loss relative to the Rs.12 monthly revenue.
(monthly_profit_loss / monthly_revenue_rs12) * 100