# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install openpyxl==3.0.10
Requirement already satisfied: openpyxl==3.0.10 in /root/venv/lib/python3.9/site-packages (3.0.10)
Requirement already satisfied: et-xmlfile in /root/venv/lib/python3.9/site-packages (from openpyxl==3.0.10) (1.1.0)
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# imports # importacion de librerias
import numpy as np
import pandas as pd
import sqlite3 as sql3
import matplotlib.pyplot as plt
import seaborn as sns
import openpyxl # libreria para leer/escribir archivos excel 2010 xlsx/xlsm/xltx/xltm
# Levanto los datos en 3 diferentes dataframes: df_articles, df_sellers, df_orders
# Articulos
conn = sql3.connect('/work/data/articles.db')
sql_query = pd.read_sql_query('SELECT * FROM articles', conn)
df_articles = pd.DataFrame(sql_query, columns=['article_id', 'article_name', 'unit_price'])
print(df_articles)
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
5 20020 Chair 335.64
6 20021 Modem 67.50
7 20022 Range Extender 20.45
8 20023 Notebook 1000.00
9 20024 Netbook 145.00
10 20025 HDD 54.62
11 20026 SDD 22.00
12 20027 Ram Memory 35.95
13 20028 Motherboard 138.38
14 20029 Mouse 30.30
15 20030 Fan Cooler 4.25
16 20031 Webcam 20.07
17 20032 Keyboard 22.60
18 20033 Headphones 23.30
19 20034 Scanner 185.00
20 20035 Case 37.90
21 20036 Video Card 131.50
22 20037 CPU 139.62
23 20038 Power Supply 43.95
24 20039 Water Cooling 67.50
25 20040 Heatsink 10.00
26 20041 Usb Cable 2.95
27 20042 Sata Cable 2.14
28 20043 Pci Express Port 10.12
29 20044 Wi-Fi Card 59.61
30 20045 Mesh Wi-Fi X 2 32.50
# Vendedores
df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0)
print(df_sellers)
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
6 Milly Christoffe
7 Kati Innot
8 Tobin Roselli
9 Onida Cosely
10 Cirilo Grandham
11 Vasily Danilyuk
12 Brockie Patience
13 Arnold Kilkenny
14 Janel O'Curran
15 Daisie Slograve
# Ordenes
df_orders = pd.read_csv('/work/data/orders.csv')
print(df_articles)
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
5 20020 Chair 335.64
6 20021 Modem 67.50
7 20022 Range Extender 20.45
8 20023 Notebook 1000.00
9 20024 Netbook 145.00
10 20025 HDD 54.62
11 20026 SDD 22.00
12 20027 Ram Memory 35.95
13 20028 Motherboard 138.38
14 20029 Mouse 30.30
15 20030 Fan Cooler 4.25
16 20031 Webcam 20.07
17 20032 Keyboard 22.60
18 20033 Headphones 23.30
19 20034 Scanner 185.00
20 20035 Case 37.90
21 20036 Video Card 131.50
22 20037 CPU 139.62
23 20038 Power Supply 43.95
24 20039 Water Cooling 67.50
25 20040 Heatsink 10.00
26 20041 Usb Cable 2.95
27 20042 Sata Cable 2.14
28 20043 Pci Express Port 10.12
29 20044 Wi-Fi Card 59.61
30 20045 Mesh Wi-Fi X 2 32.50
# Exploración del df de artículos
print('Muestra de datos:')
print(df_articles.head())
print('\nFormato del dataframe:')
print(df_articles.shape)
print('\nBusqueda y conteo de valores nulos:')
print(df_articles.isnull().sum())
print('\nTipo de datos del dataframe:')
print(df_articles.dtypes)
Muestra de datos:
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del dataframe:
(31, 3)
Busqueda y conteo de valores nulos:
article_id 0
article_name 0
unit_price 0
dtype: int64
Tipo de datos del dataframe:
article_id int64
article_name object
unit_price object
dtype: object
# Exploración del df de vendedores
print('Muestra de datos:')
print(df_sellers.head())
print('\nFormato del dataframe:')
print(df_sellers.shape)
print('\nBusqueda y conteo de valores nulos:')
print(df_sellers.isnull().sum())
print('\nTipos de datos del dataframe:')
print(df_sellers.dtypes)
Muestra de datos:
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
Formato del dataframe:
(15, 1)
Busqueda y conteo de valores nulos:
seller_name 0
dtype: int64
Tipos de datos del dataframe:
seller_name object
dtype: object
# Exploración del df de órdenes
print('Muestra de datos:')
print(df_orders.head())
print('\nFormato del dataframe:')
print(df_orders.shape)
print('\nBusqueda y conteo de valores nulos:')
print(df_orders.isnull().sum())
print('\nTipo de datos del dataframe:')
print(df_orders.dtypes)
Muestra de datos:
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
Formato del dataframe:
(1000, 6)
Busqueda y conteo de valores nulos:
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
Tipo de datos del dataframe:
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
# Cambiamos el formato de precios unitarios a tipo float y comprobamos
df_articles['unit_price'] = df_articles['unit_price'].astype(float)
print(df_articles.dtypes)
article_id int64
article_name object
unit_price float64
dtype: object
# Teniendo como referencia el modelo de datos del paso 3
# Creo una copia de los indices y datos del df_oders
my_df = df_orders.copy()
print(my_df)
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
[1000 rows x 6 columns]
# Cambio los indices (etiquetas de fila existente) del df_articles, fijando la columna article_id como los nuevos indices del df
df_articles.set_index('article_id', inplace=True) # debemos ejecutar esta linea de codigo una vez, de lo contrario veremos un mensaje de error
print(df_articles.head(10))
article_name unit_price
article_id
20015 Smartphone 525.00
20016 Full Pc 2127.81
20017 Monitor 230.00
20018 Tablet 130.00
20019 Desk 130.10
20020 Chair 335.64
20021 Modem 67.50
20022 Range Extender 20.45
20023 Notebook 1000.00
20024 Netbook 145.00
# Agrego 3 columnas o campos que me va a servir de "ancla" para buscar la informacion real de manera indexada
my_df = my_df.assign(article_name = my_df['article_id'])
my_df = my_df.assign(seller_name = my_df['seller_id'])
my_df = my_df.assign(total_amount = my_df['article_id'])
print(my_df.head(10))
order_id week article_id quantity seller_id country_name article_name \
0 15024 1 20039 10 10 Peru 20039
1 15025 1 20029 15 5 Peru 20029
2 15026 1 20024 5 14 Bolivia 20024
3 15027 1 20018 9 14 Brazil 20018
4 15028 1 20035 6 15 Mexico 20035
5 15029 1 20024 6 11 Peru 20024
6 15030 1 20025 13 13 Colombia 20025
7 15031 1 20029 14 14 Peru 20029
8 15032 1 20026 7 3 Mexico 20026
9 15033 1 20027 4 13 Honduras 20027
seller_name total_amount
0 10 20039
1 5 20029
2 14 20024
3 14 20018
4 15 20035
5 11 20024
6 13 20025
7 14 20029
8 3 20026
9 13 20027
# Reemplazamos los id por los vlores reales en el df
for i in range (len(my_df.index)): # la funcion len(my_df.index) devuelve la cantidad de filas (registros) de my_df
# localizo el nombre del articulo usando el id guardado en my_df y lo asigno a la variable auxiliar article
article = df_articles.loc[my_df.loc[i]['article_name']]['article_name']
# reemplazo el id actual por el nombre del articulo encontrado
my_df.loc[i,'article_name']= article
# reemplazo el id actual de my_df por el nombre del vendedor
my_df.loc[i,'seller_name'] = df_sellers.loc[my_df.loc[i]['seller_name']]['seller_name']
# busco el precio unitario y lo multiplico por la cantidad de unidades vendidas, los resultados se guardan en la columna total_amount
my_df.loc[i,'total_amount'] = my_df.loc[i,'quantity'] * df_articles.loc[my_df.loc[i]['article_id']]['unit_price']
print(my_df.head())
order_id week article_id quantity seller_id country_name \
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
article_name seller_name total_amount
0 Water Cooling Cirilo Grandham 675.0
1 Mouse Ewell Peres 454.5
2 Netbook Janel O'Curran 725.0
3 Tablet Janel O'Curran 1170.0
4 Case Daisie Slograve 227.4
# Elimino las columnas que no necesito
my_df.drop(['order_id', 'article_id', 'seller_id'], axis='columns', inplace=True)
print(my_df.head(10))
print(my_df.shape)
week quantity country_name article_name seller_name total_amount
0 1 10 Peru Water Cooling Cirilo Grandham 675.00
1 1 15 Peru Mouse Ewell Peres 454.50
2 1 5 Bolivia Netbook Janel O'Curran 725.00
3 1 9 Brazil Tablet Janel O'Curran 1170.00
4 1 6 Mexico Case Daisie Slograve 227.40
5 1 6 Peru Netbook Vasily Danilyuk 870.00
6 1 13 Colombia HDD Arnold Kilkenny 710.06
7 1 14 Peru Mouse Janel O'Curran 424.20
8 1 7 Mexico SDD Oliviero Charkham 154.00
9 1 4 Honduras Ram Memory Arnold Kilkenny 143.80
(1000, 6)
# RESOLUCIÓN ANALÍTICA
my_df2 = my_df.groupby('article_name').count()
por_cantidad = my_df2.sort_values('quantity', ascending=False)
print(por_cantidad['quantity'].head(5))
article_name
HDD 47
Netbook 45
SDD 45
Tablet 40
Usb Cable 39
Name: quantity, dtype: int64
# RESOLUCIÓN GRÁFICA
plt.figure(figsize = (15,5))
sns.barplot(x=por_cantidad.index, y=por_cantidad['quantity'], data = por_cantidad)
plt.xticks(rotation=75)
plt.show()
# RESOLUCIÓN ANALÍTICA
my_df2 = my_df.groupby('article_name').sum()
mas_ingresos = my_df2.sort_values('total_amount', ascending=False).head()
print(mas_ingresos['total_amount'])
article_name
Full Pc 538335.93
Notebook 251000.00
Smartphone 152250.00
Chair 69477.48
Tablet 48620.00
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
# Solo graficamos los 5 primeros productos, para no distorcionar el grafico, entonces no estan representados el 100% de los productos por un tema grafico
plt.pie(x=mas_ingresos['total_amount'], labels=mas_ingresos.index, autopct='%.2f%%', radius=2)
plt.show()
# RESOLUCIÓN ANALÍTICA
df3 = my_df.groupby('seller_name').sum().sort_values('total_amount', ascending =False)
print(df3[['quantity']+['total_amount']].head(1))
quantity total_amount
seller_name
Janel O'Curran 703 192832.47
# RESOLUCIÓN ANALÍTICA
df4 = my_df.groupby('week').sum()
print(df4)
quantity total_amount
week
1 2449 507458.81
2 2444 415364.44
3 2114 329140.03
4 1058 223844.56
Cantidad Monto_Total
Pais
Brazil 2515 $ 441,271.85
Argentina 947 $ 205,832.78
Colombia 881 $ 177,514.29
Peru 1027 $ 161,421.12
Mexico 846 $ 138,619.99
Articulo Cantidad
Pais
Brazil Tablet 156
Peru Mouse 125
Argentina CPU 104
Colombia SDD 74
Mexico HDD 63
Bolivia Pci Express Port 34
Costa Rica Power Supply 32
Venezuela Smartphone 32
Honduras Mesh Wi-Fi X 2 28
Chile Mouse 27
Guatemala Heatsink 25
Ecuador Power Supply 24
Paraguay Headphones 24
El Salvador Smartphone 22
Uruguay Motherboard 15
Puerto Rico CPU 9
Semana Cantidad Monto_Total
Pais
El Salvador 2 6 $ 3,586.95
Venezuela 2 8 $ 2,099.58
Argentina 2 8 $ 1,821.53
Costa Rica 2 7 $ 1,821.39
Guatemala 2 6 $ 1,813.08
Colombia 2 7 $ 1,599.23
Uruguay 2 7 $ 1,486.92
Brazil 2 7 $ 1,400.86
Mexico 2 8 $ 1,372.48
Peru 2 8 $ 1,291.37
Bolivia 2 9 $ 1,193.83
Ecuador 2 7 $ 970.85
Honduras 2 7 $ 967.46
Chile 2 8 $ 880.75
Puerto Rico 1 6 $ 632.71
Paraguay 2 7 $ 482.07