Análisis de los 25 retailers más grandes de Estados Unidos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
import squarify # pip install squarify (algorithm for treemap)
# Load the retailer table from the CSV expected in the working directory.
data = pd.read_csv('largest_us_retailers.csv')
data  # notebook-style display of the raw table (no effect when run as a script)

# Distribution of the Sales column over the full dataset, using 30 bins.
sns.displot(data['Sales'], bins=30)

# Work on an independent deep copy ordered from highest to lowest sales, so
# the original `data` frame is never modified.
df = data.copy(deep=True).sort_values('Sales', ascending=False)

# Remove the single top-selling retailer (the first row after sorting).
# NOTE(review): presumably dropped as an outlier -- confirm the intent.
df = df.drop(index=df.index[0])
df  # notebook-style display of the working frame
I. Preguntas del negocio
# Mean of the Sales column over the rows kept in `df`, printed with two decimals.
prom = df['Sales'].mean()
total = round(prom, 2)
print(total)

# Histogram of sales (10 bins) drawn on a dedicated axes, with the mean
# marked as a dashed black vertical line.
f, ax = plt.subplots()
sns.histplot(df['Sales'], bins=10, edgecolor="black", color="#69b3a2", ax=ax)
ax.axvline(prom, color='black', ls='--', label="Mean")
ax.legend()
# Smoothed density of the Sales column.
# `fill` and `bw_method` are the current spellings of the deprecated `shade`
# and `bw` keywords; the old names warn on seaborn 0.11/0.12 and fail on
# later releases.
sns.kdeplot(df['Sales'], fill=True, bw_method=0.05, color='olive')

# Rows whose sales are at or below 40000, and the total sales they add up to.
rangSal = df[df['Sales'] <= 40000]
rangSal['Sales'].sum()
# Smoothed density of the Stores column.
# `fill` / `bw_method` replace the deprecated `shade` / `bw` keywords, and the
# deprecated `vertical=False` is dropped because it only restated the default
# orientation.
sns.kdeplot(df['Stores'], fill=True, bw_method=0.05, color='skyblue')

# Rows with at most 3800 stores, and the total number of stores they add up to.
rangStor = df[df['Stores'] <= 3800]
rangStor['Stores'].sum()
# Scatter of store count (x) against sales (y) with a fitted regression line
# drawn as a thick, semi-transparent red stroke.
sns.regplot(
    x='Stores',
    y='Sales',
    data=df,
    line_kws={"color": "r", "alpha": 0.5, "lw": 5},
)
# Largest and smallest sales figures in `df`, and the spread between them.
maxim = df['Sales'].max()
minim = df['Sales'].min()
rango = maxim - minim
rango  # notebook-style display of the range

# Pie inputs: one slice each for the maximum, the range and the minimum.
# NOTE(review): because range = max - min, the three values always sum to
# 2 * max, so the 'Maxim' slice is 50% whenever the values are positive.
expenses = [maxim, rango, minim]
labels = ['Maxim', 'Range', 'Minim']
colors = ['#B7C3F3', '#DD7596', '#8EB897']


def func(pct):
    """Format a pie-slice percentage with one decimal place, e.g. ``12.3%``."""
    return f"{pct:1.1f}%"


# The middle slice ('Range') is pulled out of the pie for emphasis.
plt.pie(
    expenses,
    labels=labels,
    autopct=func,
    explode=[0, 0.2, 0],
    shadow=True,
    colors=colors,
)
plt.title('Sales Range')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()
# The five rows with the highest store counts.
five = df.sort_values('Stores', ascending=False).iloc[:5]
five  # notebook-style display of the selection

# Slice sizes are the store counts; slice labels are read from the same rows
# so they always match the data (the previous hard-coded abbreviations were
# not tied to the sorted rows).
expenses = five['Stores']
labels = five['Company'].tolist()


def func(pct):
    """Format a pie-slice percentage with one decimal place, e.g. ``12.3%``."""
    return "{:1.1f}%".format(pct)


# The last two of the five slices are pulled out of the pie for emphasis.
plt.pie(
    expenses,
    labels=labels,
    autopct=func,
    explode=[0, 0, 0, 0.2, 0.2],
    shadow=True,
)
plt.title('Five')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()
# Total sales per category.
group = df.groupby('Category')['Sales'].sum()
group.sort_values(ascending=False)  # notebook-style display, largest first
sizes = group

# Scale each category total between the smallest and largest total, then map
# that scale onto the Blues colormap so larger totals get darker tiles.
norm = matplotlib.colors.Normalize(vmin=min(group), vmax=max(group))
colors = [matplotlib.cm.Blues(norm(value)) for value in group]

# Create the plot and resize it.
fig = plt.gcf()
ax = fig.add_subplot()
fig.set_size_inches(16, 4.5)

# Draw the treemap. Labels come from the grouped index so every rectangle is
# labelled with the category whose total it represents; the previous
# `df['Category']` was a per-row column and did not line up with `sizes`.
# The alpha layer keeps the black labels readable on dark tiles.
squarify.plot(label=group.index, sizes=sizes, color=colors, alpha=.6)
# NOTE(review): the title "Five" looks carried over from the previous chart --
# confirm the intended title.
plt.title("Five", fontsize=23, fontweight="bold")

# Remove the axes and display the plot.
plt.axis('off')
plt.show()
# Sales divided by store count for each retailer; rows with a missing store
# count yield NaN.
df['Sales/Avg. Store_2'] = df['Sales'] / df['Stores']
org = df['Sales/Avg. Store_2'].sort_values(ascending=False)
org  # notebook-style display of the ranking

# Horizontal bars, one per company. x/y are passed by keyword because current
# seaborn no longer accepts them positionally, and the company names are
# re-ordered with `org.index` so each name is paired with its own ratio and
# the bars follow the ranking.
sns.barplot(x=org, y=df.loc[org.index, 'Company'])
plt.xticks(rotation='vertical')
# Rows with no store count.
# NOTE(review): these are treated as online retailers -- confirm that a
# missing 'Stores' value really means "online only".
isOnline = df[df['Stores'].isna()]
isOnline  # notebook-style display of the selection
isOnline2 = isOnline.Sales.sum()

# Rows that do have a store count, and their combined sales.
notOnline = df[df['Stores'].notna()]
notOnline2 = notOnline.Sales.sum()

# Difference in total sales between the two groups.
notOnline2 - isOnline2

x = np.arange(0, 1)
label = ['Is Online', 'Not Online']
fig, ax = plt.subplots(figsize=(10, 10))

# Bar width; also used as the horizontal offset of the second bar.
bar_width = 0.02
b1 = ax.bar(x, isOnline2, width=bar_width)
b2 = ax.bar(x + bar_width, notOnline2, width=bar_width)

# Name each bar on the x axis; `label` was previously defined but never
# applied, leaving the two bars unidentified.
ax.set_xticks([x[0], x[0] + bar_width])
ax.set_xticklabels(label)
# Sum of the 'Store Count Growth' column within each category
# (notebook-style display; the result is not stored).
df['Store Count Growth'].groupby(df['Category']).sum()