# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install openpyxl==3.0.10
# imports
import numpy as np
import pandas as pd
import sqlite3 as sql3
import openpyxl
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data into three separate dataframes.

# Articles: read the whole `articles` table from the SQLite database.
conn = sql3.connect('/work/data/articles.db')
try:
    sql_query = pd.read_sql_query('SELECT * FROM articles', conn)
finally:
    # Release the database handle even if the query fails; the result of
    # read_sql_query is already an in-memory dataframe at this point.
    conn.close()
df_articles = pd.DataFrame(sql_query, columns=['article_id', 'article_name', 'unit_price'])

# Sellers: the first spreadsheet column becomes the dataframe index.
df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0)

# Orders
df_orders = pd.read_csv('/work/data/orders.csv')
# Quick look at the articles dataframe: sample rows, shape, null counts
# and column dtypes, each printed under its own heading.
articles_overview = (
    ('Muestra de datos', df_articles.head()),  # head() returns 5 rows by default
    ('\nFormato del dataframe', df_articles.shape),
    ('\nBúsqueda de valores null por columna', df_articles.isnull().sum()),
    ('\nFormato de los datos por columna', df_articles.dtypes),
)
for articles_heading, articles_value in articles_overview:
    # sep='\n' puts the heading and its value on separate lines.
    print(articles_heading, articles_value, sep='\n')
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del dataframe
(31, 3)
Búsqueda de valores null por columna
article_id 0
article_name 0
unit_price 0
dtype: int64
Formato de los datos por columna
article_id int64
article_name object
unit_price object
dtype: object
# Quick look at the sellers dataframe. The whole frame is printed here
# (no head() call), followed by shape, null counts and column dtypes.
sellers_overview = (
    ('Muestra de datos', df_sellers),
    ('\nFormato del dataframe', df_sellers.shape),
    ('\nBúsqueda de valores null por columna', df_sellers.isnull().sum()),
    ('\nFormato de los datos por columna', df_sellers.dtypes),
)
for sellers_heading, sellers_value in sellers_overview:
    # sep='\n' puts the heading and its value on separate lines.
    print(sellers_heading, sellers_value, sep='\n')
Muestra de datos
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
6 Milly Christoffe
7 Kati Innot
8 Tobin Roselli
9 Onida Cosely
10 Cirilo Grandham
11 Vasily Danilyuk
12 Brockie Patience
13 Arnold Kilkenny
14 Janel O'Curran
15 Daisie Slograve
Formato del dataframe
(15, 1)
Búsqueda de valores null por columna
seller_name 0
dtype: int64
Formato de los datos por columna
seller_name object
dtype: object
# Quick look at the orders dataframe: sample rows, shape, null counts
# and column dtypes, each printed under its own heading.
orders_overview = (
    ('Muestra de datos', df_orders.head()),  # head() returns 5 rows by default
    ('\nFormato del dataframe', df_orders.shape),
    ('\nBúsqueda de valores null por columna', df_orders.isnull().sum()),
    ('\nFormato de los datos por columna', df_orders.dtypes),
)
for orders_heading, orders_value in orders_overview:
    # sep='\n' puts the heading and its value on separate lines.
    print(orders_heading, orders_value, sep='\n')
Muestra de datos
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
Formato del dataframe
(1000, 6)
Búsqueda de valores null por columna
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
Formato de los datos por columna
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
# Convert unit_price to float so it can be used in arithmetic, then show
# the resulting column dtypes to confirm the change.
unit_price_as_float = df_articles['unit_price'].astype(float)
df_articles['unit_price'] = unit_price_as_float
print(df_articles.dtypes)
article_id int64
article_name object
unit_price float64
dtype: object
# References on pandas indexing:
# https://pandas.pydata.org/docs/user_guide/indexing.html
# https://towardsdatascience.com/how-to-use-loc-and-iloc-for-selecting-data-in-pandas-bd09cb4c3d79
# https://stackoverflow.com/questions/28754603/indexing-pandas-data-frames-integer-rows-named-columns

# Build the working dataframe: one row per order, with the article and
# seller ids resolved to names and the order total computed.
# DataFrame.copy() is a deep copy by default, so df_orders is left untouched.
my_df = df_orders.copy()

# Index the articles by article_id so the frame works as a lookup table.
# Guarded so that re-running this cell does not fail once article_id has
# already been moved from the columns into the index.
if df_articles.index.name != 'article_id':
    df_articles.set_index('article_id', inplace=True)

article_ids = my_df['article_id']
seller_ids = my_df['seller_id']

# Series.map would silently produce NaN for an id that is missing from the
# lookup table; fail loudly instead, like a direct .loc lookup does.
unknown_articles = article_ids[~article_ids.isin(df_articles.index)].unique()
if len(unknown_articles):
    raise KeyError(f'article_id not found in df_articles: {list(unknown_articles)}')
unknown_sellers = seller_ids[~seller_ids.isin(df_sellers.index)].unique()
if len(unknown_sellers):
    raise KeyError(f'seller_id not found in df_sellers: {list(unknown_sellers)}')

# Vectorised lookups: each id column is mapped through the matching
# id -> value Series instead of being resolved row by row.
my_df = my_df.assign(
    article_name=article_ids.map(df_articles['article_name']),
    total_amount=my_df['quantity'] * article_ids.map(df_articles['unit_price']),
    seller_name=seller_ids.map(df_sellers['seller_name']),
)

# Drop the id columns that are no longer needed.
my_df.drop(['order_id', 'article_id', 'seller_id'], axis='columns', inplace=True)
print(my_df)
week quantity country_name article_name total_amount seller_name
0 1 10 Peru Water Cooling 675.0 Cirilo Grandham
1 1 15 Peru Mouse 454.5 Ewell Peres
2 1 5 Bolivia Netbook 725.0 Janel O'Curran
3 1 9 Brazil Tablet 1170.0 Janel O'Curran
4 1 6 Mexico Case 227.4 Daisie Slograve
.. ... ... ... ... ... ...
995 4 1 Brazil Modem 67.5 Kati Innot
996 4 15 Brazil Heatsink 150.0 Daisie Slograve
997 4 2 Colombia Heatsink 20.0 Vasily Danilyuk
998 4 14 Brazil Tablet 1820.0 Vasily Danilyuk
999 4 12 Brazil SDD 264.0 Onida Cosely
[1000 rows x 6 columns]
# Label-based selection alternatives with .loc (kept for reference):
# my_df.loc[:, 'quantity']
# my_df.loc[:, ['quantity','country_name']]
# my_df.loc[0:5, ['quantity','country_name']]

# Position-based selection with .iloc: every row of the column at position 1.
my_df.iloc[:, 1]
# Other position-based variants (kept for reference):
# my_df.iloc[:, [1,2]]
# my_df.iloc[0:6, [1,2]]

# First five rows of the columns at positions 1 and 2; being the last
# expression of the cell, this is the value the notebook displays.
my_df.iloc[:5, [1, 2]]
quantityint64
country_nameobject
0
10
Peru
1
15
Peru
2
5
Bolivia
3
9
Brazil
4
6
Mexico
# RESOLUCIÓN ANALÍTICA
# Si no filtro por 'quantity' toma las series numéricas
# df7 = my_df.groupby(by='article_name').sum().sort_values('quantity', ascending=False)
# print(df7.head())
# print(df7[['quantity']].head())
# print()
# Tomamos article_name como una columna más
# df7 = my_df.groupby(by='article_name').sum().sort_values('quantity', ascending=False).reset_index()
# print(df7.head())
# df7_2 = df7[['article_name', 'quantity']].groupby('article_name').sum('quantity').sort_values('quantity', ascending=False)
# print(df7_2.head())
# print()
# print(df7_2.head().index)
# print(df7_2.index[0]) # toma correctamente el index
# Vista tipo Serie
# df7 = my_df.groupby(by='article_name')['quantity'].sum().sort_values(ascending=False).head()
# df7 = my_df.groupby(by='article_name')[['quantity'] + ['total_amount']].sum().sort_values('quantity',ascending=False).head()
# print(df7)
# print(df7.head().index)
# Vista tipo Serie
# df7 = my_df.groupby(by='article_name')['quantity'].sum().sort_values(ascending=False)
# print(df7.iloc[0:5])
# Vista tipo Serie (reset index)
# df7 = my_df.groupby(by='article_name')['quantity'].sum().sort_values(ascending=False).reset_index()
# print(df7.iloc[0:5])
# Vista tipo Serie
# df7 = my_df.groupby(by='article_name')['quantity'].sum().sort_values(ascending=False)
# df7 = my_df.groupby(by='article_name').sum().sort_values('quantity', ascending=False)
# print(df7.loc['HDD':'Netbook'])
# print(df7.loc['HDD':'Netbook']['quantity'])
# print(df7.head())
# df7 = my_df.groupby(by='article_name').agg({'quantity':'sum'}).sort_values('quantity',ascending=False)
# print(df7.head())
# df7 = my_df.groupby(by='article_name').agg({'quantity':'sum'}).rename(columns={'quantity':'Cantidad'}).sort_values('Cantidad',ascending=False)
# print(df7.head())
# df7 = my_df.groupby(by='article_name').agg({'quantity':'sum'}).reset_index().rename(columns={'quantity':'Cantidad'}).sort_values('Cantidad',ascending=False)
# print(df7.head())
# Top 5 articles by units sold.
# numeric_only=True restricts the sum to the numeric columns explicitly:
# my_df also holds text columns (country_name, seller_name), and recent
# pandas versions no longer drop those silently in groupby().sum().
df7 = (
    my_df.groupby(by='article_name')
    .sum(numeric_only=True)
    .sort_values('quantity', ascending=False)
    .head()
)
# Global display option: every float printed from here on is shown as money.
pd.options.display.float_format = '$ {:,.2f}'.format
print(df7[['quantity', 'total_amount']])
quantity total_amount
article_name
HDD 413 $ 22,558.06
Tablet 374 $ 48,620.00
SDD 372 $ 8,184.00
Mouse 322 $ 9,756.60
Netbook 320 $ 46,400.00
# GRAPHICAL SOLUTION
# One bar per article in df7 (its index), bar height = units sold.
top_quantity_articles = df7.index
sns.barplot(data=df7, x=top_quantity_articles, y='quantity')
# Tilt the article names so they do not overlap.
plt.xticks(rotation=45)
plt.show()
# ANALYTICAL SOLUTION
# Top 5 articles by income.
# numeric_only=True restricts the sum to the numeric columns explicitly:
# my_df also holds text columns (country_name, seller_name), and recent
# pandas versions no longer drop those silently in groupby().sum().
df2 = (
    my_df.groupby(by='article_name')
    .sum(numeric_only=True)
    .sort_values('total_amount', ascending=False)
    .head(5)
)
# Global display option: every float printed from here on is shown as money.
pd.options.display.float_format = '$ {:,.2f}'.format
print(df2['total_amount'])
# 'article_name' es el index
# print()
# print(df2.index[0])
# print()
# print(df2.reset_index().iloc[0])
# print()
# for i in range(5):
# # print(df2.index[i])
# print(df2.reset_index().iloc[i])
# print()
# print()
# print(df2.reset_index())
article_name
Full Pc $ 538,335.93
Notebook $ 251,000.00
Smartphone $ 152,250.00
Chair $ 69,477.48
Tablet $ 48,620.00
Name: total_amount, dtype: float64
# GRAPHICAL SOLUTION
# The same df2 data drawn twice: first as bars, then as a pie chart.
income_article_labels = df2.index

# Bar chart: one bar per article, bar height = income.
sns.barplot(data=df2, x=income_article_labels, y='total_amount')
plt.xticks(rotation=45)
plt.show()

# Blank line in the cell output between the two figures.
print()

# Pie chart: each article's share of the income of the rows in df2,
# labelled with a two-decimal percentage.
income_by_article = df2['total_amount']
plt.pie(x=income_by_article, labels=income_article_labels, autopct='%1.2f%%')
plt.show()
# ANALYTICAL SOLUTION
# Totals per seller, highest income first.
# numeric_only=True restricts the sum to the numeric columns explicitly:
# my_df also holds text columns (country_name, article_name), and recent
# pandas versions no longer drop those silently in groupby().sum().
df4 = (
    my_df.groupby(by='seller_name')
    .sum(numeric_only=True)
    .sort_values('total_amount', ascending=False)
)
# Global display option: every float printed from here on is shown as money.
pd.options.display.float_format = '$ {:,.2f}'.format
# Only the five best sellers are printed; df4 itself keeps every seller.
print(df4[['quantity', 'total_amount']].head(5))
quantity total_amount
seller_name
Janel O'Curran 703 $ 192,832.47
Brockie Patience 441 $ 142,709.88
Oliviero Charkham 555 $ 141,329.76
Vasily Danilyuk 521 $ 129,157.55
Daisie Slograve 554 $ 120,520.11
# GRAPHICAL SOLUTION
# One bar per seller in df4 (its index), bar height = income.
seller_labels = df4.index
seller_income = df4['total_amount']
plt.bar(seller_labels, seller_income)
# Vertical labels so the seller names stay readable.
plt.xticks(rotation=90)
plt.show()
# ANALYTICAL SOLUTION
# Totals per week, highest income first.
# numeric_only=True restricts the sum to the numeric columns explicitly:
# my_df also holds text columns (country_name, article_name, seller_name),
# and recent pandas versions no longer drop those silently in groupby().sum().
df5 = (
    my_df.groupby(by='week')
    .sum(numeric_only=True)
    .sort_values('total_amount', ascending=False)
)
# Global display option: every float printed from here on is shown as money.
pd.options.display.float_format = '$ {:,.2f}'.format
print(df5[['quantity', 'total_amount']])
quantity total_amount
week
1 2449 $ 507,458.81
2 2444 $ 415,364.44
3 2114 $ 329,140.03
4 1058 $ 223,844.56
# GRAPHICAL SOLUTION
# One bar per week in df5 (its index), bar height = income.
week_numbers = df5.index
weekly_income = df5['total_amount']
plt.bar(week_numbers, weekly_income)
plt.show()
# SOLUTION
# Weekly units sold in Mexico and in Peru, printed and drawn as two lines.
df_mexico = my_df[my_df['country_name'] == 'Mexico']
df_peru = my_df[my_df['country_name'] == 'Peru']

# Take the weeks from the data instead of hard-coding them, and give both
# countries a row for every week (0 when there were no sales). Otherwise a
# country with no orders in some week would yield a shorter series than the
# x axis and the plot call would fail.
weeks = sorted(my_df['week'].unique().tolist())
all_weeks = pd.Index(weeks, name='week')

# numeric_only=True restricts the sum to the numeric columns explicitly:
# my_df also holds text columns, and recent pandas versions no longer drop
# those silently in groupby().sum().
df = df_mexico.groupby(by='week').sum(numeric_only=True).reindex(all_weeks, fill_value=0)
df2 = df_peru.groupby(by='week').sum(numeric_only=True).reindex(all_weeks, fill_value=0)

print("Ventas en unidades Mexico")
print(df[['quantity']])
print()
print("Ventas en unidades Peru")
print(df2[['quantity']])

fig, ax = plt.subplots()
ax.plot(weeks, df['quantity'], label='Mexico')
ax.plot(weeks, df2['quantity'], label='Peru')
ax.legend(loc = 'upper right')
plt.show()
Ventas en unidades Mexico
quantity
week
1 275
2 202
3 192
4 177
Ventas en unidades Peru
quantity
week
1 393
2 293
3 247
4 94
# SOLUTION
# Top 5 countries by income, printed and drawn as a pie chart.
# numeric_only=True restricts the sum to the numeric columns explicitly:
# my_df also holds text columns (article_name, seller_name), and recent
# pandas versions no longer drop those silently in groupby().sum().
df = (
    my_df.groupby(by='country_name')
    .sum(numeric_only=True)
    .sort_values('total_amount', ascending=False)
    .head()
)
# Global display option: every float printed from here on is shown as money.
pd.options.display.float_format = '$ {:,.2f}'.format
print(df[['total_amount']])
print()
# head() kept only five countries, so the percentages in the pie are shares
# of those five countries' combined income, not of the overall total.
plt.pie(x=df['total_amount'], labels=df.index, autopct='%1.2f%%')
plt.show()
total_amount
country_name
Brazil $ 441,271.85
Argentina $ 205,832.78
Colombia $ 177,514.29
Peru $ 161,421.12
Mexico $ 138,619.99
# SOLUTION
# Top 5 (seller, article) combinations by income, plus the top 5 sellers.
# numeric_only=True restricts the sums to the numeric columns explicitly:
# my_df also holds text columns, and recent pandas versions no longer drop
# those silently in groupby().sum().
df2 = (
    my_df.groupby(['seller_name', 'article_name'])
    .sum(numeric_only=True)
    .sort_values('total_amount', ascending=False)
    .head()
)
df1 = (
    my_df.groupby(['seller_name'])
    .sum(numeric_only=True)
    .sort_values('total_amount', ascending=False)
    .head()
)
# Global display option: every float printed from here on is shown as money.
pd.options.display.float_format = '$ {:,.2f}'.format

# Combinations whose income reaches 100,000.
print(df2.loc[(df2['total_amount'] >= 100000), ['total_amount']])

# The bar heights come from df2, so the labels must come from df2 as well.
# Labelling them with df1.index (the top sellers overall) would attach each
# bar to whichever seller happens to share its rank, not to the seller and
# article the amount belongs to.
bar_labels = [f'{seller} - {article}' for seller, article in df2.index]
plt.bar(bar_labels, df2['total_amount'])
plt.xticks(rotation=90)
plt.show()
total_amount
seller_name article_name
Janel O'Curran Full Pc $ 119,157.36
# Orders placed from Brazil only.
df6 = my_df[my_df['country_name'] == 'Brazil']
# From those, rank the sellers by total sales.
# numeric_only=True restricts the sum to the numeric columns explicitly:
# df6 also holds text columns (country_name, article_name), and recent
# pandas versions no longer drop those silently in groupby().sum(), which
# would add them, summed as text, to the table displayed below.
vendedores_pais = (
    df6.groupby('seller_name')
    .sum(numeric_only=True)
    .sort_values('total_amount', ascending=False)
)
# Last expression of the cell: the notebook displays the first five rows.
vendedores_pais.head()
weekint64
quantityint64
Daisie Slograve
60
226
Aveline Swanwick
66
227
Arnold Kilkenny
47
184
Kati Innot
52
151
Tobin Roselli
54
211
# Bar chart of every seller in vendedores_pais; the first five rows are
# drawn in green and the rest in blue.
best_five = vendedores_pais.index[0:5]
bar_colors = [
    'blue' if seller not in best_five else 'green'
    for seller in vendedores_pais.index
]
plt.bar(vendedores_pais.index, vendedores_pais['total_amount'], color=bar_colors)
plt.xticks(rotation=90)
plt.title("Desempeño vendedores del país que más compra")
plt.xlabel("Vendedores")
plt.ylabel("Total de Ventas")
plt.show()
# Sellers in Brazil: income as bars and units sold as a line on a second
# y axis, preceded by the three tables the chart is built from.
df_brazil = my_df[my_df['country_name'] == 'Brazil']
print(df_brazil[['seller_name','total_amount', 'quantity']].groupby('seller_name').sum().sort_values('total_amount', ascending=False))
print()

# The columns are selected before grouping, so a plain sum() is enough.
# (sum('total_amount') would hand the string to sum()'s first parameter,
# numeric_only, rather than choose a column.)
df_brazil_bars = df_brazil[['seller_name','total_amount']].groupby('seller_name').sum().sort_values('total_amount', ascending=False)
print(df_brazil_bars)
print()
df_brazil_line = df_brazil[['seller_name','quantity']].groupby('seller_name').sum().sort_values('quantity', ascending=False).reset_index()
print(df_brazil_line)

# The bars are ordered by income while df_brazil_line is ordered by
# quantity. Put the quantities in the bar order and draw them at the bar
# positions so each point of the line sits on its own seller's bar.
quantity_in_bar_order = df_brazil_line.set_index('seller_name')['quantity'].reindex(df_brazil_bars.index)
bar_positions = range(len(df_brazil_bars))

fig, ax1 = plt.subplots()
# ax1: left y axis, income bars (seaborn draws categories at 0, 1, ..., n-1).
sns.barplot(data=df_brazil_bars, x=df_brazil_bars.index, y='total_amount', ax=ax1, color='C3')
ax1.tick_params(axis='x', rotation=90)

# ax2: right y axis sharing the same x axis, units sold.
ax2 = ax1.twinx()
ax2.plot(bar_positions, quantity_in_bar_order.to_numpy(), color='C11')

plt.title('Top sellers in Brazil')
ax1.set_xlabel("Seller Name")
ax1.set_ylabel("Income ($)")
ax2.set_ylabel("Quantity (line)")
plt.show()
total_amount quantity
seller_name
Daisie Slograve $ 65,283.28 226
Aveline Swanwick $ 58,771.48 227
Arnold Kilkenny $ 47,243.54 184
Kati Innot $ 32,087.32 151
Tobin Roselli $ 31,997.93 211
Janel O'Curran $ 31,562.86 182
Onida Cosely $ 29,354.31 206
Jase Doy $ 28,493.25 149
Vasily Danilyuk $ 27,495.77 124
Milly Christoffe $ 21,247.38 161
Cirilo Grandham $ 21,061.74 158
Ewell Peres $ 15,253.22 91
Oliviero Charkham $ 14,951.72 195
Brockie Patience $ 10,756.96 125
Cornie Wynrehame $ 5,711.09 125
total_amount
seller_name
Daisie Slograve $ 65,283.28
Aveline Swanwick $ 58,771.48
Arnold Kilkenny $ 47,243.54
Kati Innot $ 32,087.32
Tobin Roselli $ 31,997.93
Janel O'Curran $ 31,562.86
Onida Cosely $ 29,354.31
Jase Doy $ 28,493.25
Vasily Danilyuk $ 27,495.77
Milly Christoffe $ 21,247.38
Cirilo Grandham $ 21,061.74
Ewell Peres $ 15,253.22
Oliviero Charkham $ 14,951.72
Brockie Patience $ 10,756.96
Cornie Wynrehame $ 5,711.09
seller_name quantity
0 Aveline Swanwick 227
1 Daisie Slograve 226
2 Tobin Roselli 211
3 Onida Cosely 206
4 Oliviero Charkham 195
5 Arnold Kilkenny 184
6 Janel O'Curran 182
7 Milly Christoffe 161
8 Cirilo Grandham 158
9 Kati Innot 151
10 Jase Doy 149
11 Brockie Patience 125
12 Cornie Wynrehame 125
13 Vasily Danilyuk 124
14 Ewell Peres 91