Análisis de los 25 retailers más grandes de Estados Unidos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the retailer dataset and derive a copy ordered from highest to lowest sales.
df = pd.read_csv('/work/largest_us_retailers.csv')
df_sales = df.sort_values(by='Sales', ascending=False)
# Preview the top of the sales-ordered frame (notebook cell output).
df_sales.head()
Company object
Sales int64
0
Walmart US
658119
1
Kroger
115037
2
Costco
90048
3
Home Depot
83976
4
Walgreen Boots
78924
I. Preguntas del negocio
# Average sales without the dominant company.
# The first row of the sales-ordered frame is skipped before averaging.
sales_without_top = df_sales['Sales'].iloc[1:]
avg = sales_without_top.mean()
print(avg)
46602.416666666664
# Average sales with the dominant company, charted next to the average without it.
# Both bars are computed from the data instead of being typed in as rounded
# numbers, so the chart cannot drift from the dataset.
avg_with_dominant = df['Sales'].mean()
# Order by sales (highest first) and skip the first row to leave out the top seller.
avg_without_dominant = df['Sales'].sort_values(ascending=False).iloc[1:].mean()
x = ['Con Dominante', 'Sin Dominante']
y = [avg_with_dominant, avg_without_dominant]
plt.bar(x, y)
plt.show()
# Highest, average and lowest sales once the top-selling company is set aside.
# The style is selected before plotting so that it applies to this figure.
plt.style.use('ggplot')
# Values are derived from the data: the previously typed-in 115000 and 16572
# did not match the maximum (115037) and minimum (16592) printed elsewhere
# in this notebook.
sales_without_dominant = df['Sales'].sort_values(ascending=False).iloc[1:]
x = ['Más Alto', 'Promedio', 'Más Bajo']
y = [
    sales_without_dominant.max(),
    sales_without_dominant.mean(),
    sales_without_dominant.min(),
]
plt.bar(x, y)
plt.show()
# Histogram of sales, leaving out the first row of the sales-ordered frame.
# The style is selected before plotting so that it applies to this figure.
plt.style.use('ggplot')
x = df_sales['Sales'].iloc[1:]
plt.hist(x)
plt.show()
# Histogram of the number of stores per company.
# The style is selected before plotting so that it applies to this figure.
plt.style.use('ggplot')
df_stores = df.sort_values('Stores', ascending=True)
x = df_stores['Stores']
plt.hist(x)
plt.show()
# Histogram of store counts restricted to 0-2000, in steps of 100.
# `x` is the 'Stores' column assigned in an earlier cell.
# The style is selected before plotting so that it applies to this figure.
plt.style.use('ggplot')
bins = list(range(0, 2001, 100))
plt.hist(x, bins=bins)
plt.show()
# Histogram of store counts restricted to 2000-4000, in steps of 100.
# `x` is the 'Stores' column assigned in an earlier cell.
# The style is selected before plotting so that it applies to this figure.
plt.style.use('ggplot')
bins = list(range(2000, 4001, 100))
plt.hist(x, bins=bins)
plt.show()
# Scatter plot of sales against number of stores, one point per row of df_sales.
stores_axis = df_sales['Stores']
sales_axis = df_sales['Sales']
plt.scatter(stores_axis, sales_axis)
plt.xlabel('Stores')
plt.ylabel('Sales')
plt.title('Sales & Stores')
plt.show()
# Largest sales value after skipping the first row of the sales-ordered frame,
# and the smallest sales value over all rows.
sales_column = df_sales['Sales']
max_sales = sales_column.iloc[1:].max()
min_sales = sales_column.min()
for value in (max_sales, min_sales):
    print(value)
x = ['Max', 'Min']
y = [max_sales, min_sales]
plt.bar(x, y)
115037
16592
# Top 5 companies with the most physical stores.
df_stores = df.sort_values(by='Stores', ascending=False)
top_by_stores = df_stores.head(5)
x = top_by_stores['Company']
y = top_by_stores['Stores']
print(x)
plt.bar(x, y)
plt.xticks(rotation=45)
plt.show()
20 Dollar Tree
19 Dollar General
5 CVS incl. Target
4 Walgreen Boots
0 Walmart US
Name: Company, dtype: object
# Top 5 companies with the highest sales.
top_by_sales = df_sales.head(5)
x = top_by_sales['Company']
y = top_by_sales['Sales']
print(x)
plt.bar(x, y)
plt.xticks(rotation=10)
plt.show()
0 Walmart US
1 Kroger
2 Costco
3 Home Depot
4 Walgreen Boots
Name: Company, dtype: object
# Total sales per category, printed from largest to smallest.
# NOTE(review): the printed result lists both 'Warehouse Club' and
# 'Warehouse Clubs'; these look like two spellings of one category — confirm
# against the source data.
sales_by_category = df.groupby('Category')['Sales']
category = sales_by_category.sum()
print(category.sort_values(ascending=False))
Category
Supercenters 745919
Grocery 297164
Drug Stores 184202
Home Improvement 144287
Warehouse Club 90048
Electronic/Mail Order 71687
Warehouse Clubs 56828
Department Stores 45088
Dollar Stores 43698
Electronics 34980
Apparel 25012
Name: Sales, dtype: int64
# Bar chart of total sales per category, largest first.
# Labels are taken from the sorted Series itself rather than a hand-typed list:
# the typed list contained the misspelling 'Home Improvment' and would silently
# mislabel bars if the ordering of the totals ever changed.
y = category.sort_values(ascending=False)
x = list(y.index)
plt.bar(x, y)
plt.xticks(rotation=90)
plt.show()
# Order companies by their 'Sales/Avg. Store' value, highest first,
# and preview the top of the result (notebook cell output).
df_sales_avg = df.sort_values(by='Sales/Avg. Store', ascending=False)
df_sales_avg.head(n=5)
Company object
Sales int64
2
Costco
90048
10
Sam's Club
56828
24
Meijer
16592
21
HEB
21384
0
Walmart US
658119
# Bar chart of the 15 companies with the highest 'Sales/Avg. Store' value.
top_15 = df_sales_avg.head(15)
x = top_15['Company']
y = top_15['Sales/Avg. Store']
plt.bar(x, y)
plt.xticks(rotation=90)
plt.show()
# Show the rows whose category is 'Electronic/Mail Order' (notebook cell output).
df.loc[df['Category'] == 'Electronic/Mail Order']
Company object
Sales int64
6
Amazon
71687
# Show the full table ordered alphabetically by company name (notebook cell output).
df.sort_values(by='Company')
Company object
Ahold 4%
Albertsons 4%
23 others 92%
Sales int64
16592 - 658119
15
Ahold
26903
9
Albertsons
56829
18
Aldi
24402
6
Amazon
71687
11
Apple incl. Online
37664
12
Best Buy
34980
5
CVS incl. Target
77792
2
Costco
90048
23
Delhaize
18201
19
Dollar General
22234
# Average sales of companies with physical stores only.
# Rows with index labels 6, 11 and 0 are removed in one call; in the outputs
# shown in this notebook those labels correspond to Amazon,
# Apple incl. Online and Walmart US.
df_company = df.sort_values(by='Company').drop(index=[6, 11, 0])
df_company['Sales'].mean()
# Compare the sales of Amazon and Apple incl. Online with the average sales of
# the remaining companies. Walmart US is also left out of the average, matching
# the rows dropped when df_company is built.
# Values are looked up in the data instead of being typed in by hand, so the
# chart stays consistent with the dataset; a missing company name raises KeyError.
online_names = ['Amazon', 'Apple incl. Online']
sales_by_company = df.set_index('Company')['Sales']
remaining_sales = sales_by_company.drop(online_names + ['Walmart US'])
x = online_names + ['Average']
y = [sales_by_company[name] for name in online_names] + [remaining_sales.mean()]
plt.bar(x, y)
plt.ylabel('Sales')
plt.show()
# Total number of stores per category: printed largest first, charted in the
# order produced by groupby.
y = df.groupby('Category')['Stores'].sum()
# Labels come from the grouped result itself, so every bar is guaranteed to
# carry its own category name; the previous version rebuilt the label list
# separately and truncated it with a hard-coded [:11].
x = list(y.index)
print(y.sort_values(ascending=False))
plt.bar(x, y)
plt.xticks(rotation=90)
plt.show()
Category
Dollar Stores 27600.0
Drug Stores 22368.0
Grocery 12026.0
Supercenters 6577.0
Home Improvement 3793.0
Apparel 2770.0
Department Stores 2058.0
Electronics 1389.0
Warehouse Clubs 655.0
Warehouse Club 495.0
Electronic/Mail Order 0.0
Name: Stores, dtype: float64
# Histogram of the per-category store totals; the totals are printed smallest first.
stores_per_category = df.groupby('Category')['Stores'].sum()
# NOTE(review): bins is assigned here but not passed to plt.hist below — confirm intent.
bins = list(range(0, 25, 10))
print(stores_per_category.sort_values())
plt.hist(stores_per_category)
plt.xlabel('Stores')
plt.show()
Category
Electronic/Mail Order 0.0
Warehouse Club 495.0
Warehouse Clubs 655.0
Electronics 1389.0
Department Stores 2058.0
Apparel 2770.0
Home Improvement 3793.0
Supercenters 6577.0
Grocery 12026.0
Drug Stores 22368.0
Dollar Stores 27600.0
Name: Stores, dtype: float64
This chart is empty
Chart was probably not set up properly in the notebook
# Mean 'Sales/Avg. Store' per category: printed smallest first, charted in the
# order produced by groupby.
x = df.groupby('Category')['Sales/Avg. Store'].mean()
# Labels come from the grouped result's index. The previous version stored them
# in a variable named `list`, which shadowed the builtin for the rest of the
# session, and truncated them with a hard-coded [:11].
category_labels = x.index.tolist()
print(x.sort_values(ascending=True))
plt.bar(category_labels, x)
plt.xticks(rotation=90)
plt.show()
Category
Dollar Stores 1.666402
Drug Stores 8.220845
Apparel 9.369545
Department Stores 23.397801
Electronics 24.685956
Grocery 29.668370
Home Improvement 38.023835
Supercenters 60.123499
Warehouse Clubs 87.293395
Warehouse Club 187.795620
Electronic/Mail Order NaN
Name: Sales/Avg. Store, dtype: float64