# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install openpyxl==3.0.10
Requirement already satisfied: openpyxl==3.0.10 in /root/venv/lib/python3.9/site-packages (3.0.10)
Requirement already satisfied: et-xmlfile in /root/venv/lib/python3.9/site-packages (from openpyxl==3.0.10) (1.1.0)
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# imports
import numpy as np
import pandas as pd
import sqlite3 as sql3
import openpyxl
import matplotlib.pyplot as plt
import seaborn as sns
# Levanto los datos en 3 diferentes dataframes
#Articulos
conn = sql3.connect('/work/data/articles.db')
sql_query = pd.read_sql_query('SELECT * FROM articles', conn)
df_articles = pd.DataFrame(sql_query, columns=['article_id', 'article_name', 'unit_price'])
# Vendedores
df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0)
# Ordenes
df_orders = pd.read_csv('/work/data/orders.csv')
# Exploración del df de artículos
print('Muestra de datos')
print(df_articles.head()) # head() 5 Filas por defecto
print('\nFormato del dataframe')
print(df_articles.shape)
print('\nBusqueda de valores nulls por columna')
print(df_articles.isnull().sum())
print('\nFormato de los datos por columna')
print(df_articles.dtypes)
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del dataframe
(31, 3)
Busqueda de valores nulls por columna
article_id 0
article_name 0
unit_price 0
dtype: int64
Formato de los datos por columna
article_id int64
article_name object
unit_price object
dtype: object
# Exploración del df de vendedores
print('Muestra de datos')
print(df_sellers.head()) # head() 5 Filas por defecto
print('\nFormato del dataframe')
print(df_sellers.shape)
print('\nBusqueda de valores nuls por columna')
print(df_sellers.isnull().sum())
print('\nFormato de los datos por columna')
print(df_sellers.dtypes)
Muestra de datos
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
Formato del dataframe
(15, 1)
Busqueda de valores nuls por columna
seller_name 0
dtype: int64
Formato de los datos por columna
seller_name object
dtype: object
# Exploración del df de órdenes
print('Muestra de datos')
print(df_orders.head()) # head() 5 Filas por defecto
print('\nFormato del dataframe')
print(df_orders.shape)
print('\nBusqueda de valores nuls por columna')
print(df_orders.isnull().sum())
print('\nFormato de los datos por columna')
print(df_orders.dtypes)
Muestra de datos
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
Formato del dataframe
(1000, 6)
Busqueda de valores nuls por columna
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
Formato de los datos por columna
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
df_articles['unit_price'] = df_articles['unit_price'].astype(float)
print(df_articles.dtypes)
article_id int64
article_name object
unit_price float64
dtype: object
my_df = df_orders.copy() # shallow copy o copia superficial (otro Objeto alojado en la memoria)
# Cambio el indice del df de artículos
df_articles.set_index('article_id', inplace=True)
my_df = my_df.assign(article_name = my_df['article_id'])
my_df = my_df.assign(total_amount = my_df['article_id'])
my_df = my_df.assign(seller_name = my_df['seller_id'])
# df_articles
# print()
my_df
for i in range(max(my_df.count())):
# SINTAXIS: df_articles.loc[indice][columna]
# [indice] va a ser el dato que obtengo de my_df.loc[i]['article_id']
# [indice] -> [my_df.loc[i]['article_id']]
# o sea, tomo registro a registro el article_id y lo uso para extraer el nombre del artículo (article_name)
# print(df_articles.loc[my_df.loc[i]['article_id']]['article_name'])
article = df_articles.loc[my_df.loc[i]['article_id']]['article_name']
# print(article)
# Asignar a cada valor id de la columna 'article_name' (my_df) el nombre del artículo
my_df.loc[i, 'article_name'] = article
# my_df
# hacemos lo mismo con total amount
my_df.loc[i, 'total_amount'] = my_df.loc[i, 'quantity']*df_articles.loc[my_df.loc[i]['article_id']]['unit_price']
# columna de seller_name
my_df.loc[i, 'seller_name'] = df_sellers.loc[my_df.loc[i]['seller_id']]['seller_name']
# elimino las columnas que no necesito del df
my_df.drop(['order_id', 'article_id', 'seller_id'], axis='columns', inplace=True)
print(my_df)
week quantity country_name article_name total_amount seller_name
0 1 10 Peru Water Cooling 675.0 Cirilo Grandham
1 1 15 Peru Mouse 454.5 Ewell Peres
2 1 5 Bolivia Netbook 725.0 Janel O'Curran
3 1 9 Brazil Tablet 1170.0 Janel O'Curran
4 1 6 Mexico Case 227.4 Daisie Slograve
.. ... ... ... ... ... ...
995 4 1 Brazil Modem 67.5 Kati Innot
996 4 15 Brazil Heatsink 150.0 Daisie Slograve
997 4 2 Colombia Heatsink 20.0 Vasily Danilyuk
998 4 14 Brazil Tablet 1820.0 Vasily Danilyuk
999 4 12 Brazil SDD 264.0 Onida Cosely
[1000 rows x 6 columns]
# RESOLUCIÓN ANALÍTICA
print(my_df['article_name'].value_counts()) # cuenta valores únicos
HDD 47
Netbook 45
SDD 45
Tablet 40
Usb Cable 39
Sata Cable 38
Pci Express Port 37
Range Extender 36
Smartphone 35
Heatsink 34
Power Supply 34
Mouse 34
Full Pc 34
Headphones 34
Motherboard 33
Water Cooling 32
Video Card 31
Ram Memory 31
Notebook 30
Modem 29
CPU 29
Mesh Wi-Fi X 2 28
Webcam 28
Desk 28
Case 26
Monitor 26
Fan Cooler 25
Scanner 24
Chair 24
Wi-Fi Card 22
Keyboard 22
Name: article_name, dtype: int64
# RESOLUCIÓN GRÁFICA
sns.displot(my_df, x='article_name')
plt.xticks(rotation=90) # 'vertical'
plt.show()
# RESOLUCIÓN ANALÍTICA
df2 = (my_df.groupby(by='article_name').sum()).sort_values('total_amount', ascending=False).head(5)
pd.options.display.float_format='$ {:,.2f}'.format
print(df2['total_amount'])
# 'article_name' es el index
# print(df2) # agrupa (y suma) sólo variables numéricas
# SINTAXIS ALTERNATIVA
# pd.options.display.float_format='$ {:,.2f}'.format
# print(my_df[['article_name'] + ['total_amount']].groupby(['article_name']).sum().sort_values('total_amount',ascending=False).head(5))
article_name
Full Pc $ 538,335.93
Notebook $ 251,000.00
Smartphone $ 152,250.00
Chair $ 69,477.48
Tablet $ 48,620.00
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
plt.pie(x=df2['total_amount'],labels=df2.index, autopct='%1.2f%%')
plt.show()
# index es el 'article_name' del df2
# RESOLUCIÓN ANALÍTICA
df4 = (my_df.groupby(by='seller_name').sum()).sort_values('total_amount', ascending=False)
pd.options.display.float_format='$ {:,.2f}'.format
# print(df4[['quantity'] + ['total_amount']])
print(df4[['quantity'] + ['total_amount']].head(5))
# 'seller_name' es el index
quantity total_amount
seller_name
Janel O'Curran 703 $ 192,832.47
Brockie Patience 441 $ 142,709.88
Oliviero Charkham 555 $ 141,329.76
Vasily Danilyuk 521 $ 129,157.55
Daisie Slograve 554 $ 120,520.11
# RESOLUCIÓN GRÁFICA
plt.bar(df4.index, df4['total_amount'])
plt.xticks(rotation=60)
plt.show()
# RESOLUCIÓN ANALÍTICA
df5 = (my_df.groupby(by='week').sum()).sort_values('total_amount', ascending=False)
pd.options.display.float_format='$ {:,.2f}'.format
print(df5[['quantity'] + ['total_amount']])
# 'week' es el index
quantity total_amount
week
1 2449 $ 507,458.81
2 2444 $ 415,364.44
3 2114 $ 329,140.03
4 1058 $ 223,844.56
# RESOLUCIÓN GRÁFICA
plt.bar(df5.index, df5['total_amount'])
plt.show()
# Resolución Analítica y Gráfica
#RESOLUCION ANALITICA
#Eleccion de paises: 2 de America del Norte/Central y 2 de America del Sur
print(my_df['country_name'].value_counts())
#Comparacion
print()
print('VARIACION DE VENTAS POR SEMANA DE BRASIL')
df_Brazil = my_df.loc[(my_df['country_name'] == 'Brazil')]
print(df_Brazil.groupby(by='week').agg({'total_amount':'sum'}).sort_values('week', ascending=True))
df_Brazil_Ventas = df_Brazil.groupby(by='week').agg({'total_amount':'sum'}).sort_values('week', ascending=True)
print()
print('VARIACION DE VENTAS POR SEMANA DE URUGUAY')
df_Uruguay = my_df.loc[(my_df['country_name'] == 'Uruguay')]
print(df_Uruguay.groupby(by='week').agg({'total_amount':'sum'}).sort_values('week', ascending=True))
df_Uruguay_Ventas = df_Uruguay.groupby(by='week').agg({'total_amount':'sum'}).sort_values('week', ascending=True)
print()
print('VARIACION DE VENTAS POR SEMANA DE MEXICO')
df_Mexico = my_df.loc[(my_df['country_name'] == 'Mexico')]
print(df_Mexico.groupby(by='week').agg({'total_amount':'sum'}).sort_values('week', ascending=True))
df_Mexico_Ventas = df_Mexico.groupby(by='week').agg({'total_amount':'sum'}).sort_values('week', ascending=True)
print()
print('VARIACION DE VENTAS POR SEMANA DE HONDURAS')
df_Honduras = my_df.loc[(my_df['country_name'] == 'Honduras')]
print(df_Honduras.groupby(by='week').agg({'total_amount':'sum'}).sort_values('week', ascending=True))
df_Honduras_Ventas = df_Honduras.groupby(by='week').agg({'total_amount':'sum'}).sort_values('week', ascending=True)
Brazil 315
Peru 125
Argentina 113
Colombia 111
Mexico 101
Honduras 38
Venezuela 37
Guatemala 29
Chile 28
Bolivia 19
Costa Rica 19
Ecuador 18
Paraguay 17
El Salvador 16
Uruguay 12
Puerto Rico 2
Name: country_name, dtype: int64
VARIACION DE VENTAS POR SEMANA DE BRASIL
total_amount
week
1 $ 165,289.90
2 $ 106,803.85
3 $ 79,341.50
4 $ 89,836.60
VARIACION DE VENTAS POR SEMANA DE URUGUAY
total_amount
week
1 $ 303.20
2 $ 14,562.58
3 $ 2,812.33
4 $ 164.98
VARIACION DE VENTAS POR SEMANA DE MEXICO
total_amount
week
1 $ 58,549.25
2 $ 20,959.09
3 $ 19,577.33
4 $ 39,534.32
VARIACION DE VENTAS POR SEMANA DE HONDURAS
total_amount
week
1 $ 24,953.09
2 $ 2,744.10
3 $ 7,285.07
4 $ 1,781.30
#RESOLUCIÓN GRÁFICA
sns.displot(my_df, x='country_name', color='peru')
plt.xticks(rotation=90)
plt.title("Frecuencia de Venta por País")
plt.xlabel("País")
plt.ylabel("Frecuencia")
plt.show()
print("COMPARACION DE VENTAS POR PAIS - CONO SUR")
#Plot 1
plt.subplot(1,2,1)
sns.barplot(df_Brazil_Ventas.index, df_Brazil_Ventas['total_amount'],palette='pastel')
plt.xticks(rotation=0)
plt.yticks(rotation=60)
plt.title("Brasil")
plt.xlabel("Semana")
plt.ylabel("Monto")
#Plot 2
plt.subplot(1,2,2)
sns.barplot(df_Uruguay_Ventas.index, df_Uruguay_Ventas['total_amount'],palette='pastel')
plt.xticks(rotation=0)
plt.yticks(rotation=60)
plt.title("Uruguay")
plt.xlabel("Semana")
plt.ylabel("")
plt.show()
print("COMPARACION DE VENTAS POR PAIS - CONO NORTE")
#Plot 3
plt.subplot(1,2,1)
sns.barplot(df_Mexico_Ventas.index, df_Mexico_Ventas['total_amount'],palette='pastel')
plt.xticks(rotation=0)
plt.yticks(rotation=60)
plt.title("Mexico")
plt.xlabel("Semana")
plt.ylabel("Monto")
#Plot 4
plt.subplot(1,2,2)
sns.barplot(df_Honduras_Ventas.index, df_Honduras_Ventas['total_amount'],palette='pastel')
plt.xticks(rotation=0)
plt.yticks(rotation=60)
plt.title("Honduras")
plt.xlabel("Semana")
plt.ylabel("")
plt.show()
COMPARACION DE VENTAS POR PAIS - CONO SUR
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
COMPARACION DE VENTAS POR PAIS - CONO NORTE
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# RESOLUCIÓN ANALITICA
df_Brazil_Top3Sup = (df_Brazil[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).head(3))
print('ARTICULOS MAS VENDIDOS EN BRASIL')
print(df_Brazil_Top3Sup['quantity'])
#otra alternativa
#print(df_Brazil[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).head(3))
print()
df_Uruguay_Top3Sup = (df_Uruguay[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).head(3))
print('ARTICULOS MAS VENDIDOS EN URUGUAY')
print(df_Uruguay_Top3Sup['quantity'])
print()
df_Mexico_Top3Sup = (df_Mexico[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).head(3))
print('ARTICULOS MAS VENDIDOS EN MEXICO')
print(df_Mexico_Top3Sup['quantity'])
print()
df_Honduras_Top3Sup = (df_Honduras[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).head(3))
print('ARTICULOS MAS VENDIDOS EN HONDURAS')
print(df_Honduras_Top3Sup['quantity'])
ARTICULOS MAS VENDIDOS EN BRASIL
article_name
Tablet 156
HDD 119
SDD 117
Name: quantity, dtype: int64
ARTICULOS MAS VENDIDOS EN URUGUAY
article_name
Motherboard 15
Pci Express Port 14
Headphones 13
Name: quantity, dtype: int64
ARTICULOS MAS VENDIDOS EN MEXICO
article_name
Water Cooling 63
HDD 63
SDD 58
Name: quantity, dtype: int64
ARTICULOS MAS VENDIDOS EN HONDURAS
article_name
Mesh Wi-Fi X 2 28
Usb Cable 28
Mouse 27
Name: quantity, dtype: int64
#RESOLUCION GRAFICA
print("TOP 3 ARTICULOS MÁS VENDIDOS POR PAIS - CONO SUR")
#Plot 1
plt.subplot(1,2,1)
plt.barh(df_Brazil_Top3Sup.index, df_Brazil_Top3Sup['quantity'], color='paleturquoise')
plt.xticks(rotation=0)
plt.yticks(rotation=90)
plt.title("Brasil")
plt.xlabel("Cantidad de Ventas")
plt.ylabel("Artículos")
#Plot 2
plt.subplot(1,2,2)
plt.barh(df_Uruguay_Top3Sup.index, df_Uruguay_Top3Sup['quantity'], color='paleturquoise')
plt.xticks(rotation=0)
plt.yticks(rotation=90)
plt.title("Uruguay")
plt.xlabel("Cantidad de Ventas")
plt.ylabel("")
plt.show()
print("TOP 3 ARTICULOS MÁS VENDIDOS POR PAIS - CONO NORTE")
#Plot 3
plt.subplot(1,2,1)
plt.barh(df_Mexico_Top3Sup.index, df_Mexico_Top3Sup['quantity'], color='mediumturquoise')
plt.xticks(rotation=0)
plt.yticks(rotation=90)
plt.title("Mexico")
plt.xlabel("Cantidad de Ventas")
plt.ylabel("Artículos")
#Plot 4
plt.subplot(1,2,2)
plt.barh(df_Honduras_Top3Sup.index, df_Honduras_Top3Sup['quantity'], color='mediumturquoise')
plt.xticks(rotation=0)
plt.yticks(rotation=90)
plt.title("Honduras")
plt.xlabel("Cantidad de Ventas")
plt.show()
TOP 3 ARTICULOS MÁS VENDIDOS POR PAIS - CONO SUR
TOP 3 ARTICULOS MÁS VENDIDOS POR PAIS - CONO NORTE
# RESOLUCIÓN
# Mejor vendedor/producto. Mostrar importe/cantidad. Comparar 2 países (variable categórica)
# Resolución Analítica y Gráfica
df_Brazil_Top3Inf = (df_Brazil[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).tail(3))
print('ARTICULOS MENOS VENDIDOS EN BRASIL')
print(df_Brazil_Top3Inf['quantity'])
print()
df_Uruguay_Top3Inf = (df_Uruguay[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).tail(3))
print('ARTICULOS MENOS VENDIDOS EN URUGUAY')
print(df_Uruguay_Top3Inf['quantity'])
print()
df_Mexico_Top3Inf = (df_Mexico[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).tail(3))
print('ARTICULOS MENOS VENDIDOS EN MEXICO')
print(df_Mexico_Top3Inf['quantity'])
print()
df_Honduras_Top3Inf = (df_Honduras[['article_name'] + ['quantity']].groupby(['article_name']).sum().sort_values('quantity',ascending=False).tail(3))
print('ARTICULOS MENOS VENDIDOS EN HONDURAS')
print(df_Honduras_Top3Inf['quantity'])
ARTICULOS MENOS VENDIDOS EN BRASIL
article_name
Keyboard 37
Wi-Fi Card 37
Monitor 19
Name: quantity, dtype: int64
ARTICULOS MENOS VENDIDOS EN URUGUAY
article_name
Mesh Wi-Fi X 2 6
Full Pc 4
Webcam 1
Name: quantity, dtype: int64
ARTICULOS MENOS VENDIDOS EN MEXICO
article_name
Sata Cable 8
Power Supply 2
Chair 1
Name: quantity, dtype: int64
ARTICULOS MENOS VENDIDOS EN HONDURAS
article_name
Range Extender 6
Webcam 5
Keyboard 2
Name: quantity, dtype: int64
#RESOLUCION GRAFICA
print("ARTICULOS MENOS VENDIDOS POR PAIS - CONO SUR")
#Plot 1
plt.subplot(1,2,1)
e=(0.05,0.05,0.05)
a=['goldenrod','gold', 'wheat']
plt.pie(df_Brazil_Top3Inf['quantity'],labels=df_Brazil_Top3Inf.index,autopct='%1.0f%%',explode=e, colors=a,pctdistance=0.65)
plt.title("Brasil")
#Plot 2
plt.subplot(1,2,2)
e=(0.05,0.05,0.05)
a=['goldenrod','gold', 'wheat']
plt.pie(df_Uruguay_Top3Inf['quantity'],labels=df_Uruguay_Top3Inf.index,autopct='%1.0f%%',explode=e, colors=a,pctdistance=0.65)
plt.title("Uruguay")
plt.show()
print("ARTICULOS MENOS VENDIDOS POR PAIS - CONO NORTE")
#Plot 3
plt.subplot(1,2,1)
e=(0.05,0.05,0.05)
a=['coral','orange', 'navajowhite']
plt.pie(df_Mexico_Top3Inf['quantity'],labels=df_Mexico_Top3Inf.index,autopct='%1.0f%%',explode=e, colors=a,pctdistance=0.65)
plt.title("Mexico")
#Plot 4
plt.subplot(1,2,2)
e=(0.05,0.05,0.05)
a=['coral','orange', 'navajowhite']
plt.pie(df_Honduras_Top3Inf['quantity'],labels=df_Honduras_Top3Inf.index,autopct='%1.0f%%',explode=e, colors=a,pctdistance=0.65)
plt.title("Honduras")
plt.show()
ARTICULOS MENOS VENDIDOS POR PAIS - CONO SUR
ARTICULOS MENOS VENDIDOS POR PAIS - CONO NORTE