# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install openpyxl==3.0.10
Collecting openpyxl==3.0.10
Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.1/242.1 KB 9.9 MB/s eta 0:00:00
Collecting et-xmlfile
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# Levanto los datos en 3 diferentes dataframes
con = sq3.connect('/work/data/articles.db')
sql_query = pd.read_sql_query('SELECT * FROM articles', con)
df_articles = pd.DataFrame(sql_query, columns=['article_id', 'article_name', 'unit_price'])
df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0)
df_orders =pd.read_csv('/work/data/orders.csv')
print(df_orders)
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
[1000 rows x 6 columns]
#Cambio de indice del df_articles
df_articles.set_index('article_id', inplace=True)
print(df_articles.head())
article_name unit_price
article_id
20015 Smartphone 525.00
20016 Full Pc 2127.81
20017 Monitor 230.00
20018 Tablet 130.00
20019 Desk 130.10
#Creo una copia de df_orders y le agrego las columnas que necesito
my_df = df_orders.copy()
my_df = my_df.assign(article_name = my_df['article_id'])
my_df = my_df.assign(total_amount = my_df['article_id'])
my_df = my_df.assign(seller_name = my_df['seller_id'])
print(my_df.head())
order_id week article_id quantity seller_id country_name article_name \
0 15024 1 20039 10 10 Peru 20039
1 15025 1 20029 15 5 Peru 20029
2 15026 1 20024 5 14 Bolivia 20024
3 15027 1 20018 9 14 Brazil 20018
4 15028 1 20035 6 15 Mexico 20035
total_amount seller_name
0 20039 10
1 20029 5
2 20024 14
3 20018 14
4 20035 15
#Reemplazar los valores reales en el df
for i in range (len(my_df.index)):
#len(my_df.index) devuelva la cantidad de filas (registros)
#reemplazo el nombre del articulo usando el id guardado en my_df
article = df_articles.loc[my_df.loc[i]['article_id']]['article_name']
my_df.loc[i, 'article_name'] = article
#reemplazo el nombre del vendedor usando el id guardado en my_df
my_df.loc[i, 'seller_name'] = df_sellers.loc[my_df.loc[i]['seller_id']]['seller_name']
#busco el pecio unitario y lo multiplico por la cantidad de unidades vendidas.
my_df.loc[i,'total_amount'] = df_articles.loc[my_df.loc[i]['article_id']]['unit_price']*my_df.loc[i,'quantity']
#elimino las columnas que no utilizo
my_df.drop(['article_id', 'seller_id', 'order_id'], axis='columns', inplace=True)
print(my_df.head())
week quantity country_name article_name total_amount seller_name
0 1 10 Peru Water Cooling 675.0 Cirilo Grandham
1 1 15 Peru Mouse 454.5 Ewell Peres
2 1 5 Bolivia Netbook 725.0 Janel O'Curran
3 1 9 Brazil Tablet 1170.0 Janel O'Curran
4 1 6 Mexico Case 227.4 Daisie Slograve
# RESOLUCIÓN ANALÍTICA
my_df2 = my_df.groupby('article_name').sum()
por_cantidad= my_df2.sort_values('quantity', ascending=False)
print(por_cantidad['quantity'].head(1))
'''
# Otra RESOLUCIÓN ANALÍTICA
my_df2 = my_df.groupby('article_name').sum()
print(my_df2['quantity'].max())
De esta otra manera, sólo nos devuelve el número (413)
'''
article_name
HDD 413
Name: quantity, dtype: int64
# RESOLUCIÓN GRÁFICA
sns.barplot(x=my_df2.index, y=my_df2['quantity'], data=my_df2, order= my_df2.sort_values('quantity', ascending=False).index)
plt.xticks(rotation=90)
plt.show()
# OTRA RESOLUCIÓN GRÁFICA
plt.figure(figsize=(5,8))
plt.barh(por_cantidad.index, por_cantidad['quantity'],color= 'green', alpha= 0.8)
plt.xlabel('Cantidad vendida')
plt.ylabel('Producto')
plt.title('Unidades vendidas por producto')
plt.show()
#RESOLUCIÓN ANALÍTICA
my_df2 = my_df.groupby('article_name').sum()
por_precio = my_df2.sort_values('total_amount', ascending=False).head()
print(por_precio['total_amount'].head())
article_name
Full Pc 538335.93
Notebook 251000.00
Smartphone 152250.00
Chair 69477.48
Tablet 48620.00
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
# OJO: Solo graficamos los 5 primeros productos, si ponemos el porcentaje de cada uno en el grafico,
# no sería el numero real.
plt.pie(x=por_precio['total_amount'], labels=por_precio.index)
plt.show()
# RESOLUCIÓN ANALÍTICA
df3= my_df.groupby('seller_name').sum().sort_values('total_amount', ascending=False).head()
print(df3[['quantity']+['total_amount']])
quantity total_amount
seller_name
Janel O'Curran 703 192832.47
Brockie Patience 441 142709.88
Oliviero Charkham 555 141329.76
Vasily Danilyuk 521 129157.55
Daisie Slograve 554 120520.11
# RESOLUCIÓN GRÁFICA
df3= my_df.groupby('seller_name').sum().sort_values('total_amount', ascending=False).head()
plt.bar(df3.index,df3['total_amount'])
plt.xticks(rotation=25)
plt.show()
# RESOLUCIÓN ANALÍTICA
df4= my_df.groupby('week').sum()
print(df4)
quantity total_amount
week
1 2449 507458.81
2 2444 415364.44
3 2114 329140.03
4 1058 223844.56
# RESOLUCIÓN GRÁFICA
plt.bar(df4.index, df4['total_amount'])
plt.show()
# RESOLUCIÓN
df5 = my_df.groupby('country_name').sum()
por_cantidad = df5.sort_values('quantity', ascending=False).head(1)
por_cantidad['quantity']
df6=my_df[(my_df['country_name'])== 'Brazil']
df11=df6.groupby(['week','seller_name'],as_index=False).sum()
print(df11[['week']+['seller_name'] + ['total_amount']].sort_values('total_amount', ascending=False))
sns.set(rc={'figure.figsize':(15,8)})
sns.barplot(x='week', y='total_amount',data=df11, hue='seller_name').set(title='Cantidad de unidades despachadas por vendedor')
plt.legend(bbox_to_anchor= (1.2,1))
plt.show()
week seller_name total_amount
16 2 Aveline Swanwick 33183.54
5 1 Daisie Slograve 32346.44
0 1 Arnold Kilkenny 28572.80
7 1 Janel O'Curran 23278.97
29 2 Vasily Danilyuk 19116.78
8 1 Jase Doy 18720.47
49 4 Daisie Slograve 15252.77
31 3 Aveline Swanwick 14898.65
57 4 Tobin Roselli 14855.21
53 4 Kati Innot 14241.93
20 2 Daisie Slograve 13785.22
42 3 Onida Cosely 13725.92
30 3 Arnold Kilkenny 10333.06
9 1 Kati Innot 9442.00
10 1 Milly Christoffe 8778.44
3 1 Cirilo Grandham 8492.02
15 2 Arnold Kilkenny 8337.68
50 4 Ewell Peres 8333.00
24 2 Kati Innot 7897.14
13 1 Tobin Roselli 7511.00
54 4 Milly Christoffe 7075.00
38 3 Jase Doy 6811.48
43 3 Tobin Roselli 6159.97
45 4 Aveline Swanwick 5756.35
41 3 Oliviero Charkham 5736.11
12 1 Onida Cosely 5501.19
56 4 Onida Cosely 5468.82
47 4 Cirilo Grandham 5034.60
1 1 Aveline Swanwick 4932.94
27 2 Onida Cosely 4658.38
33 3 Cirilo Grandham 4544.10
55 4 Oliviero Charkham 4515.12
2 1 Brockie Patience 4384.94
14 1 Vasily Danilyuk 4243.27
6 1 Ewell Peres 4040.00
35 3 Daisie Slograve 3898.85
28 2 Tobin Roselli 3471.75
37 3 Janel O'Curran 3306.13
51 4 Janel O'Curran 3236.32
18 2 Cirilo Grandham 2991.02
25 2 Milly Christoffe 2971.33
11 1 Oliviero Charkham 2939.54
23 2 Jase Doy 2900.58
32 3 Brockie Patience 2561.50
40 3 Milly Christoffe 2422.61
46 4 Brockie Patience 2366.24
58 4 Vasily Danilyuk 2290.54
4 1 Cornie Wynrehame 2105.88
21 2 Ewell Peres 1890.52
44 3 Vasily Danilyuk 1845.18
26 2 Oliviero Charkham 1760.95
22 2 Janel O'Curran 1741.44
34 3 Cornie Wynrehame 1601.99
17 2 Brockie Patience 1444.28
48 4 Cornie Wynrehame 1349.98
36 3 Ewell Peres 989.70
19 2 Cornie Wynrehame 653.24
39 3 Kati Innot 506.25
52 4 Jase Doy 60.72
# RESOLUCIÓN
df7=my_df.groupby('seller_name').agg(promedio=('total_amount',np.mean)).round(2)
#df7= my_df.groupby('seller_name')[['total_amount']].mean().round(2).sort_values('total_amount', ascending=False)
print(df7)
promedio
seller_name
Arnold Kilkenny 1390.47
Aveline Swanwick 1584.99
Brockie Patience 2503.68
Cirilo Grandham 725.96
Cornie Wynrehame 746.48
Daisie Slograve 1673.89
Ewell Peres 1184.00
Janel O'Curran 2440.92
Jase Doy 1104.50
Kati Innot 1328.64
Milly Christoffe 1143.22
Oliviero Charkham 1990.56
Onida Cosely 1154.83
Tobin Roselli 949.74
Vasily Danilyuk 2050.12