Analysis of the 25 largest retailers in the United States
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
df = pd.read_csv('/work/largest_us_retailers_9b00dc73-a938-46cd-af17-fcb2bd67301f.csv')
df.sort_values('Sales',ascending=0);
df = df.replace(to_replace ="Warehouse Club",value ="Warehouse Clubs");
df
I. Business Questions
dfnw = df.drop(df.index[0])
mean_sales = dfnw['Sales'].mean()
plt.figure(figsize=(19,8));
plt.xticks(rotation=90);
plt.title('sales by company')
ax = plt.subplot()
sn.barplot(data=dfnw,y='Sales', x ='Company' )
ax.axhline(mean_sales,color='red',label='average sales '+ str(round(mean_sales, 3)))
plt.legend();
plt.figure(figsize=(19,8));
plt.xticks(rotation=90);
plt.title('sales by company')
ax = plt.subplot()
sn.barplot(data=df,y='Sales', x ='Company' )
ax.axhline(mean_sales,color='red',label='Average sales excluding Walmart '+ str(round(mean_sales, 3)))
plt.legend();
plt.figure(figsize=(19,8));
sn.histplot(data=df,x='Sales' ,binrange = [0,700000],binwidth=50000)
plt.title("Distribution of company sales")
plt.xlabel('Sales')
plt.ylabel('frequency')
plt.figure(figsize=(19,8));
sn.histplot(data=df,x='Stores' ,binrange = [0,15000],binwidth=2000)
plt.title("Distribution of stores by companies")
plt.xlabel('stores')
plt.ylabel('frequency')
dfs = df[df.Stores < 2000]
mean = dfs['Stores'].mean()
ax = plt.subplot()
ax.axvline(mean,color='red',label='Average Stores of the first 13 companies '+ str(round(mean, 0)) + ' tiendas')
plt.legend();
plt.figure(figsize=(10,8));
sn.scatterplot(data=df,x='Stores',y='Sales',hue='Company')
plt.title('Number of physical stores vs Sales')
plt.grid()
plt.figure(figsize=(19,5));
sn.boxplot(df['Sales'])
sales_range = df['Sales'].max()-df['Sales'].min()
print(sales_range)
plt.figure(figsize=(19,5));
sn.boxplot(dfnw['Sales'],color='red')
sales_range = dfnw['Sales'].max()-dfnw['Sales'].min()
print(sales_range)
dfstores = df.sort_values('Stores',ascending=0)
dfstores= dfstores[0:5]
dfsales = df.sort_values('Sales',ascending=0)
dfsales = dfsales[0:5]
fig, (ax,bx)= plt.subplots(2,1, figsize=(15,9))
sn.barplot(data=dfstores,x='Company',y='Stores',ax=ax)
sn.barplot(data=dfsales,x='Company',y='Sales',ax=bx)
fig, (ax,bx)= plt.subplots(2,1, figsize=(15,9));
sn.barplot(data=df,x='Sales',y='Category',ax=ax);
sn.barplot(data=dfnw,x='Sales',y='Category',ax=bx);
plt.figure(figsize=(19,9));
dfsast = df.sort_values('Sales/Avg. Store')
sn.barplot(data= dfsast,x='Sales/Avg. Store',y='Company');
fig, (ax,bx)= plt.subplots(1,2, figsize=(14,7))
dfnw['company type']=['online' if i else 'física' for i in dfnw.Stores.isna()]
df['company type']=['online' if i else 'física' for i in df.Stores.isna()]
sn.barplot(data= dfnw,x='company type',y='Sales',ax=ax);
sn.barplot(data= df,x='company type',y='Sales',ax=bx);
ax.set(title='Excluding Walmart data')
bx.set(title='Including Walmart data')
print('Companies that sell exclusively online are')
print(df[df.Stores.isna()]['Company'])
plt.figure (figsize = (16,8))
plt.title('company growth', fontsize = 15)
plt.xticks(rotation = 90)
sn.barplot (x= 'Company', y= 'Store Count Growth', data = df.sort_values('Store Count Growth'))
df['Store Count Growth'].mean()
plt.figure (figsize = (16,8))
plt.xticks(rotation = 45)
sn.barplot(data=df,y='Stores',x='Category')
plt.figure (figsize = (16,8))
plt.xticks(rotation = 45)
sn.histplot(data=df,x='Category')
dfg = df[df['Category'] == 'Grocery']
plt.figure(figsize=(10,8))
w = 100* dfg['Sales']/dfg['Sales'].sum()
patches, _ =plt.pie(dfg['Sales'], labels=dfg['Company'],)
labels = ['{0} - {1:1.2f} %'.format(i,j) for i,j in zip(dfg['Company'], w)]
plt.legend(patches, labels, loc='best', bbox_to_anchor=(0, 1))
plt.show()