The Most Important 25 Retailers In The United State
This project was developed as a practice to improve my skills as a Data Analyst
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#Theses are the libraries needed to develop this project
I proceed uploading the data
df_retail = pd.read_csv('largest_us_retailers.csv')
df_retail
df_retail_sorted = df_retail.sort_values('Sales',ascending=False)
df_retail_mean = df_retail_sorted['Sales'].mean()
df_retail_mean = df_retail_mean.round(2)
print(f'The mean sales for all retailer is U$D {df_retail_mean:,.2f}.')
print('All values are expressed in Millions Dollar.')
x_01 = df_retail_sorted['Company']
y_01 = df_retail_sorted['Sales']
plt.figure(figsize=(14,8))
sns.barplot(data=df_retail , x=x_01, y=y_01, palette='Spectral')
plt.title('Average sales of the Main Retailers in the U.S.', fontsize=18)
plt.grid()
plt.axhline(df_retail_mean, color ="r", linestyle = ':', linewidth = 2)
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation = 60)
plt.show()
Discovering that the main company (Walmart US) has sales far from the others, the mean does not tell me a realistic value. Therefore, I will analyze the data excluding the best selling company.
df_retail_mean = df_retail_sorted['Sales'][1:25].mean()
df_retail_mean = df_retail_mean.round(2)
print(f'The mean sales for all retailers excluding Walmart is U$D {df_retail_mean:,.2f}.')
print('All values are expressed in Millions Dollar.')
x_02 = df_retail_sorted['Company'][1:25]
y_02 = df_retail_sorted['Sales'][1:25]
plt.figure(figsize=(12,6))
plt.title('Average Sales of the Main Retailers (excluding Walmart) in the U.S.', fontsize=16)
sns.barplot(data=df_retail , x=x_02, y=y_02, palette='Spectral')
plt.grid()
plt.axhline(df_retail_mean, color ="r", linestyle = ':', linewidth = 2)
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation = 60)
plt.show()
The new average looks more realistic
df_retail_median = df_retail['Sales'].median().round(0)
print(f'Most companies made at least U$D {df_retail_median:,.0f}')
print('This value is expressed in Millions Dollar.')
x_03 = df_retail['Company']
y_03 = df_retail['Sales']
plt.figure(figsize=(14,8))
plt.title('Median Sales of the Main Retailers in the U.S.', fontsize=18)
sns.barplot(data=df_retail , x=x_03, y=y_03, palette='Spectral')
plt.grid()
plt.axhline(df_retail_median, color ="r", linestyle = ':', linewidth = 2)
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation = 60)
plt.show()
df_retail_sorted_stores=df_retail.sort_values('Stores',ascending=False)
median_stores = df_retail_sorted_stores['Stores'].median()
mean_stores = df_retail_sorted_stores['Stores'].mean().round(2)
print(f'Most companies have at least {median_stores:,.0f} stores and the mean of all is {mean_stores:,.2f}.')
x_04 = df_retail_sorted_stores['Company']
y_04 = df_retail_sorted_stores['Stores']
plt.figure(figsize=(14,6))
plt.title('Number of Stores of the Main Retailers in the U.S.', fontsize=18)
sns.barplot(data=df_retail , x=x_04, y=y_04, palette='Spectral')
plt.grid()
plt.axhline(median_stores, color ="r", linestyle = ':', linewidth = 2)
plt.xlabel('Company')
plt.ylabel('Stores')
plt.xticks(rotation=50)
plt.show()
plt.figure(figsize=(12,8))
sns.scatterplot(data=df_retail, x='Stores', y='Sales', marker='*', hue='Company', s=550)
plt.title('Correlation Between Sales and Stores', fontsize=18)
plt.grid()
plt.legend(loc='upper right', bbox_to_anchor=(1.23,1))
print('As we can see in the scatter diagram, we do not see any trend.')
print('Also in the chart, the value for Sales/Stores is very close to zero, which means no correlations.')
plt.show()
df_corr = df_retail[['Sales', 'Stores']]
df_corr.corr()
df_sales = df_retail['Sales']
max_sales = df_sales.max()
min_sales = df_sales.min()
range_sales = max_sales-min_sales
print(f'The range of sales is U$D {range_sales:,.0f} millons dolars')
df_stores5 = df_retail.sort_values('Stores', ascending=False)[0:5]
x_05 = df_stores5['Company']
y_05 = df_stores5['Stores']
plt.figure(figsize=(12,6))
plt.title('Top 5 Retailers with more stores', fontsize=18)
sns.barplot(data=df_retail , x=x_05, y=y_05, palette='Spectral')
plt.grid()
plt.xlabel('Company')
plt.ylabel('Stores')
plt.xticks(rotation=30)
plt.show()
df_sales5 = df_retail.sort_values('Sales', ascending=False)[0:5]
x_06 = df_sales5['Company']
y_06 = df_sales5['Sales']
plt.figure(figsize=(12,6))
plt.title('Top 5 Retailers With More Sales', fontsize=18)
sns.barplot(data=df_retail , x=x_06, y=y_06, palette='Spectral')
plt.grid()
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation=30)
plt.show()
combination = pd.merge(left=df_stores5, right=df_sales5)
x_07 = combination['Company']
y_07 = combination['Sales']
print('In conclusion, among the Retailers with more number of stores only Walgreen Boots and Walmart US belong to the Retailers with higher sales')
print('This support the finding that there is not correlation between sales and number of stores.')
plt.figure(figsize=(7,6))
plt.title('Joining Between Top 5 in Numbers of Stores and Top 5 in Sales', fontsize=12)
sns.barplot(data=df_retail , x=x_07, y=y_07, palette='Spectral')
plt.grid()
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation='horizontal')
plt.show()
plt.figure(figsize=(12,6))
x_08 = df_retail.groupby('Category')['Sales'].sum().sort_values(ascending=False)
plt.plot(x_08, marker='s')
plt.title('Sales per Category', fontsize=18)
plt.xlabel('Category')
plt.ylabel('Sales')
plt.xticks(rotation=60)
plt.grid()
top_sale = df_retail.groupby('Category')['Sales'].sum().max()
print(f'The category with higher sales is Supercenters with USD {top_sale:,.0f} Million Dollars')
plt.show()
plt.figure(figsize=(10,8))
sns.barplot(data=df_retail.sort_values('Sales/Avg. Store', ascending=False) ,x='Sales/Avg. Store',y='Company')
print('Costco is the company which has more sales per every store')
plt.title('Ratio Sales Per Physical Stores', fontsize=18)
plt.show()
df_online_stores = df_retail[df_retail.Stores.isna()]
df_retail['Type of Store']=['Online' if i else 'Physical' for i in df_retail.Stores.isna()]
sales_type = df_retail.groupby('Type of Store').sum()
df_retail[df_retail['Type of Store']=='Online'][['Company']]
plt.figure(figsize=(5,5))
df_online_stores = df_retail[df_retail.Stores.isna()]
df_retail['Type of Store']=['Online' if i else 'Physical' for i in df_retail.Stores.isna()]
sales_by_type = df_retail.groupby('Type of Store').sum()
sales_by_type['Sales'].plot.bar()
plt.title('Types of Stores', fontsize=14)
plt.ylabel('Sales')
plt.xticks(rotation='horizontal')
current_values = plt.gca().get_yticks()
plt.gca().set_yticklabels(['{:,.0f}'.format(x) for x in current_values])
plt.show()
print('Observation:')
plt.figure (figsize = (12,6))
x_09 = df_retail.groupby('Category')['Stores'].sum().sort_values(ascending=False)
plt.plot(x_09, marker='s')
plt.title('Numbers of Stores per Category', fontsize=18)
plt.xticks(rotation = 60)
max_store = df_retail['Stores'].sum()
print(f'The categry with more stores is "Dollar Stores" with: USD {max_store:,.0f}')