# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install openpyxl==3.0.10
Requirement already satisfied: openpyxl==3.0.10 in /root/venv/lib/python3.9/site-packages (3.0.10)
Requirement already satisfied: et-xmlfile in /root/venv/lib/python3.9/site-packages (from openpyxl==3.0.10) (1.1.0)
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# imports
import numpy as np
import pandas as pd
import sqlite3 as sql3
import openpyxl # para que levante bien el excel
import matplotlib.pyplot as plt
import seaborn as sns
# Levanto los datos en 3 diferentes dataframes
# Artículos
conn = sql3.connect('/work/data/articles.db')
sql_query = pd.read_sql_query('SELECT * FROM articles', conn)
df_articulos = pd.DataFrame(sql_query, columns = ['article_id', 'article_name', 'unit_price'])
# Vendedores
df_vendedores = pd.read_excel('/work/data/sellers.xlsx', index_col=0)
# Órdenes
df_ordenes = pd.read_csv('/work/data/orders.csv')
# Exploración del df de artículos
print('Muestra de datos')
print(df_articulos.head(10))
print('\nShape del dataframe: ', df_articulos.shape)
print('\nCantidad de valores nulos por columna: ', df_articulos.isnull().sum())
print('\nFormato de los datos por columna')
print(df_articulos.dtypes)
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
5 20020 Chair 335.64
6 20021 Modem 67.50
7 20022 Range Extender 20.45
8 20023 Notebook 1000.00
9 20024 Netbook 145.00
Shape del dataframe: (31, 3)
Cantidad de valores nulos por columna: article_id 0
article_name 0
unit_price 0
dtype: int64
Formato de los datos por columna
article_id int64
article_name object
unit_price object
dtype: object
# Exploración del df de vendedores
print('Muestra de datos')
print(df_vendedores.head(10))
print('\nShape del dataframe: ', df_vendedores.shape)
print('\nCantidad de valores nulos por columna', df_vendedores.isnull().sum())
print('\nFormato de los datos por columna')
print(df_vendedores.dtypes)
Muestra de datos
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
6 Milly Christoffe
7 Kati Innot
8 Tobin Roselli
9 Onida Cosely
10 Cirilo Grandham
Shape del dataframe: (15, 1)
Cantidad de valores nulos por columna seller_name 0
dtype: int64
Formato de los datos por columna
seller_name object
dtype: object
# Exploración del df de órdenes
print('Muestra de datos')
print(df_ordenes.head(10))
print('\nShape del dataframe: ',df_ordenes.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_ordenes.isnull().sum())
print('\nFormato de los datos por columna')
print(df_ordenes.dtypes)
Muestra de datos
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
5 15029 1 20024 6 11 Peru
6 15030 1 20025 13 13 Colombia
7 15031 1 20029 14 14 Peru
8 15032 1 20026 7 3 Mexico
9 15033 1 20027 4 13 Honduras
Shape del dataframe: (1000, 6)
Búsqueda de valores nulos por columna
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
Formato de los datos por columna
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
df_articulos['unit_price'] = df_articulos['unit_price'].astype(float)
print(df_articulos.dtypes)
article_id int64
article_name object
unit_price float64
dtype: object
my_df = df_ordenes.merge(df_vendedores, left_on='seller_id', right_on='seller_id', how= 'left').merge(df_articulos, left_on='article_id', right_on='article_id', how= 'left')
total = lambda row: float(row['quantity']) * float(row['unit_price'])
my_df['total_amount'] = my_df.apply(total, axis=1)
my_df = my_df[['week', 'quantity', 'article_name', 'total_amount', 'seller_name', 'country_name']]
print(my_df.head(10))
week quantity article_name total_amount seller_name country_name
0 1 10 Water Cooling 675.00 Cirilo Grandham Peru
1 1 15 Mouse 454.50 Ewell Peres Peru
2 1 5 Netbook 725.00 Janel O'Curran Bolivia
3 1 9 Tablet 1170.00 Janel O'Curran Brazil
4 1 6 Case 227.40 Daisie Slograve Mexico
5 1 6 Netbook 870.00 Vasily Danilyuk Peru
6 1 13 HDD 710.06 Arnold Kilkenny Colombia
7 1 14 Mouse 424.20 Janel O'Curran Peru
8 1 7 SDD 154.00 Oliviero Charkham Mexico
9 1 4 Ram Memory 143.80 Arnold Kilkenny Honduras
# RESOLUCIÓN ANALÍTICA
df_cant_art = my_df.groupby('article_name').sum()
df_cant_art_ord = df_cant_art.sort_values('quantity', ascending=False)
print(df_cant_art['quantity'].head(1) )
article_name
CPU 266
Name: quantity, dtype: int64
# RESOLUCIÓN GRÁFICA
sns.barplot(y = df_cant_art_ord["quantity"], x=df_cant_art_ord.index, palette="rocket")
plt.xticks(rotation=90)
plt.show()
# RESOLUCIÓN ANALÍTICA
df_monto_articulo_ord = my_df.groupby('article_name').sum().sort_values('total_amount', ascending=False).head(5)
print(df_monto_articulo_ord)
week quantity total_amount
article_name
Full Pc 70 253 538335.93
Notebook 69 251 251000.00
Smartphone 74 290 152250.00
Chair 56 207 69477.48
Tablet 90 374 48620.00
# RESOLUCIÓN GRÁFICA
# plt.pie(x=df_monto_articulo_ord['total_amount'], labels=df_monto_articulo_ord.index)
# plt.show()
plt.barh(df_monto_articulo_ord.index, df_monto_articulo_ord['total_amount'] , height=0.7)
plt.show()
# RESOLUCIÓN ANALÍTICA
df_monto_vendedor_ord = my_df.groupby('seller_name').sum().sort_values('total_amount', ascending=False)
print(df_monto_vendedor_ord[['quantity']+['total_amount']])
quantity total_amount
seller_name
Janel O'Curran 703 192832.47
Brockie Patience 441 142709.88
Oliviero Charkham 555 141329.76
Vasily Danilyuk 521 129157.55
Daisie Slograve 554 120520.11
Aveline Swanwick 629 118874.33
Arnold Kilkenny 583 94552.04
Kati Innot 512 83704.62
Jase Doy 582 80628.31
Ewell Peres 496 78144.32
Onida Cosely 535 77373.37
Milly Christoffe 442 61733.69
Tobin Roselli 519 56984.42
Cornie Wynrehame 523 52253.57
Cirilo Grandham 470 45009.40
# RESOLUCIÓN GRÁFICA
sns.barplot(df_monto_vendedor_ord.index, df_monto_vendedor_ord['total_amount'], palette="rocket")
plt.xticks(rotation=45, ha="right")
plt.show()
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# RESOLUCIÓN ANALÍTICA
df_venta_semanal = my_df.groupby(by='week').sum().sort_values('total_amount',ascending=False)
print(df_venta_semanal['total_amount'])
week
1 507458.81
2 415364.44
3 329140.03
4 223844.56
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
sns.barplot(df_venta_semanal.index,df_venta_semanal['total_amount'], color= 'blue')
plt.show()
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
#mas caros
df_articulos['unit_price'] = df_articulos['unit_price'].astype(float)
mas_caros = df_articulos.sort_values('unit_price',ascending=False)
print(mas_caros.head(10))
mas_caros = mas_caros.head(10)
sns.barplot(mas_caros['article_name'],df_articulos['unit_price'], color= 'blue')
plt.xticks(rotation=90, ha="right")
plt.show()
#mas baratos
df_articulos['unit_price'] = df_articulos['unit_price'].astype(float)
mas_baratos = df_articulos.sort_values('unit_price',ascending=True)
print(mas_baratos.head(10))
mas_baratos = mas_baratos.head(10)
sns.barplot(mas_baratos['article_name'],df_articulos['unit_price'], color= 'blue')
plt.xticks(rotation=90, ha="right")
plt.show()
article_id article_name unit_price
1 20016 Full Pc 2127.81
8 20023 Notebook 1000.00
0 20015 Smartphone 525.00
5 20020 Chair 335.64
2 20017 Monitor 230.00
19 20034 Scanner 185.00
9 20024 Netbook 145.00
22 20037 CPU 139.62
13 20028 Motherboard 138.38
21 20036 Video Card 131.50
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
article_id article_name unit_price
27 20042 Sata Cable 2.14
26 20041 Usb Cable 2.95
15 20030 Fan Cooler 4.25
25 20040 Heatsink 10.00
28 20043 Pci Express Port 10.12
16 20031 Webcam 20.07
7 20022 Range Extender 20.45
11 20026 SDD 22.00
17 20032 Keyboard 22.60
18 20033 Headphones 23.30
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# RESOLUCIÓN
# ¿Cual es la semana de mayor venta para cada pais?
# Previsualizar el DF final
# Pais1 4 monto
# Pais2 3 monto
df_pais_semana = my_df.groupby(['country_name','week']).sum()
paises = my_df.groupby('country_name').sum().index.values
#print(type(paises))
df_p_s = pd.DataFrame(columns=['Semana', 'Ventas'], index=paises)
for p in paises:
df = df_pais_semana.loc[p]
m = df.max()['total_amount']
w = df[(df['total_amount'] == m)].index[0]
#print('Pais:',p,' Semana:',w,' Total ventas:', m)
df_p_s.loc[p]= {'Semana': w, 'Ventas': m }
df_p_s['Semana'] = df_p_s['Semana'].astype(int)
#print(df_p_s)
g=sns.barplot(data=df_p_s, x=df_p_s.index ,y='Ventas', hue='Semana')
g.set_xticklabels(labels=df_p_s.index,rotation=90)
plt.show()
# RESOLUCImy_df = df_ordenes.merge(df_vendedores, left_on='seller_id', right_on='seller_id', how= 'left').merge(df_articulos, left_on='article_id', right_on='article_id', how= 'left')
my_df = df_ordenes.merge(df_vendedores, left_on='seller_id', right_on='seller_id', how= 'left').merge(df_articulos, left_on='article_id', right_on='article_id', how= 'left')
total = lambda row: float(row['quantity']) * float(row['unit_price'])
my_df['total_amount'] = my_df.apply(total, axis=1)
my_df = my_df[['order_id', 'quantity', 'article_name', 'total_amount', 'seller_name', 'country_name']]
ordernes_mas_costosas = my_df.sort_values('total_amount',ascending=False)
ordernes_mas_costosas['total_amount'] = ordernes_mas_costosas['total_amount'].astype(float)
ordernes_mas_costosas = ordernes_mas_costosas.head(5)
print(ordernes_mas_costosas)
sns.barplot(ordernes_mas_costosas['seller_name'],ordernes_mas_costosas['total_amount'], color= 'blue')
plt.xticks(rotation=90, ha="right")
plt.show()
order_id quantity article_name total_amount seller_name \
299 15323 15 Full Pc 31917.15 Brockie Patience
610 15634 15 Full Pc 31917.15 Janel O'Curran
275 15299 13 Full Pc 27661.53 Jase Doy
203 15227 13 Full Pc 27661.53 Oliviero Charkham
960 15984 13 Full Pc 27661.53 Vasily Danilyuk
country_name
299 Argentina
610 El Salvador
275 Mexico
203 Venezuela
960 Guatemala
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(