The Most Important 25 Retailers In The United State
This project was developed as a practice to improve my skills as a Data Analyst
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Theses are the libraries needed to develop this project
I proceed uploading the data
df_retail = pd.read_csv('largest_us_retailers.csv')
df_retail
Companyobject
Walmart US4%
Kroger4%
23 others92%
Salesint64
16592 - 658119
0
Walmart US
658119
1
Kroger
115037
2
Costco
90048
3
Home Depot
83976
4
Walgreen Boots
78924
5
CVS incl. Target
77792
6
Amazon
71687
7
Target
71208
8
Lowe's
60311
9
Albertsons
56829
10
Sam's Club
56828
11
Apple incl. Online
37664
12
Best Buy
34980
13
Publix
34408
14
Rite Aid
27486
15
Ahold
26903
16
Macy's
26028
17
TJX
25012
18
Aldi
24402
19
Dollar General
22234
20
Dollar Tree
21464
21
HEB
21384
22
Kohl's
19060
23
Delhaize
18201
24
Meijer
16592
df_retail_sorted = df_retail.sort_values('Sales',ascending=False)
df_retail_mean = df_retail_sorted['Sales'].mean()
df_retail_mean = df_retail_mean.round(2)
print(f'The mean sales for all retailer is U$D {df_retail_mean:,.2f}.')
print('All values are expressed in Millions Dollar.')
x_01 = df_retail_sorted['Company']
y_01 = df_retail_sorted['Sales']
plt.figure(figsize=(14,8))
sns.barplot(data=df_retail , x=x_01, y=y_01, palette='Spectral')
plt.title('Average sales of the Main Retailers in the U.S.', fontsize=18)
plt.grid()
plt.axhline(df_retail_mean, color ="r", linestyle = ':', linewidth = 2)
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation = 60)
plt.show()
The mean sales for all retailer is U$D 71,063.08.
All values are expressed in Millions Dollar.
Discovering that the main company (Walmart US) has sales far from the others, the mean does not tell me a realistic value. Therefore, I will analyze the data excluding the best selling company.
df_retail_mean = df_retail_sorted['Sales'][1:25].mean()
df_retail_mean = df_retail_mean.round(2)
print(f'The mean sales for all retailers excluding Walmart is U$D {df_retail_mean:,.2f}.')
print('All values are expressed in Millions Dollar.')
x_02 = df_retail_sorted['Company'][1:25]
y_02 = df_retail_sorted['Sales'][1:25]
plt.figure(figsize=(12,6))
plt.title('Average Sales of the Main Retailers (excluding Walmart) in the U.S.', fontsize=16)
sns.barplot(data=df_retail , x=x_02, y=y_02, palette='Spectral')
plt.grid()
plt.axhline(df_retail_mean, color ="r", linestyle = ':', linewidth = 2)
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation = 60)
plt.show()
The mean sales for all retailers excluding Walmart is U$D 46,602.42.
All values are expressed in Millions Dollar.
The new average looks more realistic
df_retail_median = df_retail['Sales'].median().round(0)
print(f'Most companies made at least U$D {df_retail_median:,.0f}')
print('This value is expressed in Millions Dollar.')
x_03 = df_retail['Company']
y_03 = df_retail['Sales']
plt.figure(figsize=(14,8))
plt.title('Median Sales of the Main Retailers in the U.S.', fontsize=18)
sns.barplot(data=df_retail , x=x_03, y=y_03, palette='Spectral')
plt.grid()
plt.axhline(df_retail_median, color ="r", linestyle = ':', linewidth = 2)
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation = 60)
plt.show()
Most companies made at least U$D 34,980
This value is expressed in Millions Dollar.
df_retail_sorted_stores=df_retail.sort_values('Stores',ascending=False)
median_stores = df_retail_sorted_stores['Stores'].median()
mean_stores = df_retail_sorted_stores['Stores'].mean().round(2)
print(f'Most companies have at least {median_stores:,.0f} stores and the mean of all is {mean_stores:,.2f}.')
x_04 = df_retail_sorted_stores['Company']
y_04 = df_retail_sorted_stores['Stores']
plt.figure(figsize=(14,6))
plt.title('Number of Stores of the Main Retailers in the U.S.', fontsize=18)
sns.barplot(data=df_retail , x=x_04, y=y_04, palette='Spectral')
plt.grid()
plt.axhline(median_stores, color ="r", linestyle = ':', linewidth = 2)
plt.xlabel('Company')
plt.ylabel('Stores')
plt.xticks(rotation=50)
plt.show()
Most companies have at least 1,828 stores and the mean of all is 3,466.57.
plt.figure(figsize=(12,8))
sns.scatterplot(data=df_retail, x='Stores', y='Sales', marker='*', hue='Company', s=550)
plt.title('Correlation Between Sales and Stores', fontsize=18)
plt.grid()
plt.legend(loc='upper right', bbox_to_anchor=(1.23,1))
print('As we can see in the scatter diagram, we do not see any trend.')
print('Also in the chart, the value for Sales/Stores is very close to zero, which means no correlations.')
plt.show()
df_corr = df_retail[['Sales', 'Stores']]
df_corr.corr()
As we can see in the scatter diagram, we do not see any trend.
Also in the chart, the value for Sales/Stores is very close to zero, which means no correlations.
Salesfloat64
Storesfloat64
Sales
1
0.06330142261
Stores
0.06330142261
1
df_sales = df_retail['Sales']
max_sales = df_sales.max()
min_sales = df_sales.min()
range_sales = max_sales-min_sales
print(f'The range of sales is U$D {range_sales:,.0f} millons dolars')
The range of sales is U$D 641,527 millons dolars
df_stores5 = df_retail.sort_values('Stores', ascending=False)[0:5]
x_05 = df_stores5['Company']
y_05 = df_stores5['Stores']
plt.figure(figsize=(12,6))
plt.title('Top 5 Retailers with more stores', fontsize=18)
sns.barplot(data=df_retail , x=x_05, y=y_05, palette='Spectral')
plt.grid()
plt.xlabel('Company')
plt.ylabel('Stores')
plt.xticks(rotation=30)
plt.show()
df_sales5 = df_retail.sort_values('Sales', ascending=False)[0:5]
x_06 = df_sales5['Company']
y_06 = df_sales5['Sales']
plt.figure(figsize=(12,6))
plt.title('Top 5 Retailers With More Sales', fontsize=18)
sns.barplot(data=df_retail , x=x_06, y=y_06, palette='Spectral')
plt.grid()
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation=30)
plt.show()
combination = pd.merge(left=df_stores5, right=df_sales5)
x_07 = combination['Company']
y_07 = combination['Sales']
print('In conclusion, among the Retailers with more number of stores only Walgreen Boots and Walmart US belong to the Retailers with higher sales')
print('This support the finding that there is not correlation between sales and number of stores.')
plt.figure(figsize=(7,6))
plt.title('Joining Between Top 5 in Numbers of Stores and Top 5 in Sales', fontsize=12)
sns.barplot(data=df_retail , x=x_07, y=y_07, palette='Spectral')
plt.grid()
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation='horizontal')
plt.show()
In conclusion, among the Retailers with more number of stores only Walgreen Boots and Walmart US belong to the Retailers with higher sales
This support the finding that there is not correlation between sales and number of stores.
plt.figure(figsize=(12,6))
x_08 = df_retail.groupby('Category')['Sales'].sum().sort_values(ascending=False)
plt.plot(x_08, marker='s')
plt.title('Sales per Category', fontsize=18)
plt.xlabel('Category')
plt.ylabel('Sales')
plt.xticks(rotation=60)
plt.grid()
top_sale = df_retail.groupby('Category')['Sales'].sum().max()
print(f'The category with higher sales is Supercenters with USD {top_sale:,.0f} Million Dollars')
plt.show()
The category with higher sales is Supercenters with USD 745,919 Million Dollars
plt.figure(figsize=(10,8))
sns.barplot(data=df_retail.sort_values('Sales/Avg. Store', ascending=False) ,x='Sales/Avg. Store',y='Company')
print('Costco is the company which has more sales per every store')
plt.title('Ratio Sales Per Physical Stores', fontsize=18)
plt.show()
Costco is the company which has more sales per every store
df_online_stores = df_retail[df_retail.Stores.isna()]
df_retail['Type of Store']=['Online' if i else 'Physical' for i in df_retail.Stores.isna()]
sales_type = df_retail.groupby('Type of Store').sum()
df_retail[df_retail['Type of Store']=='Online'][['Company']]
Companyobject
6
Amazon
11
Apple incl. Online
plt.figure(figsize=(5,5))
df_online_stores = df_retail[df_retail.Stores.isna()]
df_retail['Type of Store']=['Online' if i else 'Physical' for i in df_retail.Stores.isna()]
sales_by_type = df_retail.groupby('Type of Store').sum()
sales_by_type['Sales'].plot.bar()
plt.title('Types of Stores', fontsize=14)
plt.ylabel('Sales')
plt.xticks(rotation='horizontal')
current_values = plt.gca().get_yticks()
plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in current_values])
plt.show()
print('Observation:')
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:10: UserWarning: FixedFormatter should only be used together with FixedLocator
# Remove the CWD from sys.path while we load stuff.
Observation:
plt.figure (figsize = (12,6))
x_09 = df_retail.groupby('Category')['Stores'].sum().sort_values(ascending=False)
plt.plot(x_09, marker='s')
plt.title('Numbers of Stores per Category', fontsize=18)
plt.xticks(rotation = 60)
max_store = df_retail['Stores'].sum()
print(f'The categry with more stores is "Dollar Stores" with: USD {max_store:,.0f}')
The categry with more stores is "Dollar Stores" with: USD 79,731