Análisis de los 25 retailers más grandes de Estados Unidos
# Importar librerías aquí
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Importar datos aquí
df = pd.read_csv('largest_us_retailers.csv')
df.head()
Companyobject
Salesint64
0
Walmart US
658119
1
Kroger
115037
2
Costco
90048
3
Home Depot
83976
4
Walgreen Boots
78924
I. Preguntas del negocio
mean = df['Sales'][1:].mean()
mean = mean.round(2)
print("El promedio solicitado es de " + "$ " + str(mean))
El promedio solicitado es de $ 46602.42
df_sorted1 = df.sort_values('Sales', ascending=0)
df_sorted1
sns.histplot(data=df_sorted1,x='Sales',y='Company')
df_sorted2 = df.sort_values('Stores', ascending=0)
df_sorted2
sns.histplot(data=df_sorted2,x='Stores',y='Company')
#Scatterplot
sns.scatterplot(data=df_sorted2,x='Stores',y='Sales',hue='Company')
#Legend
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0)
median = df['Sales'].median()
Q1 = df['Sales'].quantile(q=0.25)
Q3 = df['Sales'].quantile(q=0.75)
min_val = df['Sales'].quantile(q=0)
max_val= df['Sales'].quantile(q=1.0)
print(min_val,Q1,median,Q3,max_val)
16592.0 24402.0 34980.0 71687.0 658119.0
IQR = Q3-Q1
IQR
rango = max_val - min_val
rango
sns.boxplot(df['Sales'])
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/_decorators.py:43: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
FutureWarning
minlimit = Q1 - 1.5*IQR
maxlimit = Q1 + 1.5*IQR
print('Rango para detección de outliers: {}, {}'.format(minlimit,maxlimit))
Rango para detección de outliers: -46525.5, 95329.5
x = df_sorted1['Company'][0:5]
y = df_sorted1['Sales'][0:5]
plt.bar(x,y)
plt.title('Top 5 compañías con más ventas')
plt.xlabel('Company')
plt.ylabel('Sales')
plt.xticks(rotation='vertical')
plt.show()
x = df_sorted2['Company'][0:5]
y = df_sorted2['Stores'][0:5]
plt.bar(x,y)
plt.title('Top 5 compañías con más tiendas')
plt.xlabel('Company')
plt.ylabel('Stores')
plt.xticks(rotation='vertical')
plt.show()
x = df_sorted1['Category'][0:5]
y = df_sorted1['Sales'][0:5]
plt.bar(x,y)
plt.title('Top 5 categorías con más ventas')
plt.xlabel('Category')
plt.ylabel('Sales')
plt.xticks(rotation='vertical')
plt.show()
df_sorted1.groupby('Category').sum()
Salesint64
25012 - 745919
Storesfloat64
0.0 - 27600.0
Supercenters
745919
6577
Grocery
297164
12026
Drug Stores
184202
22368
Home Improvement
144287
3793
Warehouse Club
90048
495
Electronic/Mail Order
71687
0
Warehouse Clubs
56828
655
Department Stores
45088
2058
Dollar Stores
43698
27600
Electronics
34980
1389
df_sorted1['Sales/Avg. Store_2'] = df_sorted1['Sales'] / df_sorted1['Stores']
df_sorted_by_sales_stores = df_sorted1.sort_values('Sales/Avg. Store_2',ascending=0)
df_sorted_by_sales_stores
Companyobject
Costco4%
Walmart US4%
23 others92%
Salesint64
16592 - 658119
2
Costco
90048
0
Walmart US
658119
10
Sam's Club
56828
24
Meijer
16592
21
HEB
21384
3
Home Depot
83976
7
Target
71208
15
Ahold
26903
8
Lowe's
60311
16
Macy's
26028
df_online = df_sorted1[df_sorted1['Stores'].isna()]
df_online
Companyobject
Salesint64
6
Amazon
71687
11
Apple incl. Online
37664
df_grocery = df_sorted1[(df['Category']=='Grocery')]
df_grocery.head()
Companyobject
Salesint64
1
Kroger
115037
9
Albertsons
56829
13
Publix
34408
15
Ahold
26903
18
Aldi
24402
df_drugstores = df_sorted2[(df['Category']=='Drug Stores')]
df_drugstores.head()
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
"""Entry point for launching an IPython kernel.
Companyobject
Salesint64
5
CVS incl. Target
77792
4
Walgreen Boots
78924
14
Rite Aid
27486
df_sorted1.tail()
Companyobject
Salesint64
20
Dollar Tree
21464
21
HEB
21384
22
Kohl's
19060
23
Delhaize
18201
24
Meijer
16592