# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install xlrd
!pip install openpyxl==3.0.10
Requirement already satisfied: xlrd in /root/venv/lib/python3.9/site-packages (2.0.1)
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
Requirement already satisfied: openpyxl==3.0.10 in /root/venv/lib/python3.9/site-packages (3.0.10)
Requirement already satisfied: et-xmlfile in /root/venv/lib/python3.9/site-packages (from openpyxl==3.0.10) (1.1.0)
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# imports
import pandas as pd
import numpy as np
import sqlite3 as sql3
import matplotlib.pyplot as plt
import seaborn as sns
# Levanto los datos en 3 diferentes dataframes
#ARTÍCULOS
conn = sql3.connect('/work/data/articles.db')
query1 = pd.read_sql_query('SELECT * FROM articles',conn)
df_articles=pd.DataFrame(query1,columns=['article_id','article_name','unit_price'])
#VENDEDORES
df_sellers=pd.read_excel('/work/data/sellers.xlsx',index_col=0)
#ORDENES
df_orders=pd.read_csv('/work/data/orders.csv')
print(df_orders)
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
[1000 rows x 6 columns]
# Exploración del df de artículos
print('Muestra de datos')
print(df_articles.head())
print('\nFormato del dataframe')
print(df_articles.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_articles.isnull().sum())
print('\nFormato de los datos')
print(df_articles.dtypes)
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del dataframe
(31, 3)
Búsqueda de valores nulos por columna
article_id 0
article_name 0
unit_price 0
dtype: int64
Formato de los datos
article_id int64
article_name object
unit_price object
dtype: object
# Exploración del df de vendedores
print('Muestra de datos')
print(df_sellers.head())
print('\nFormato del dataframe')
print(df_sellers.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_sellers.isnull().sum())
print('\nFormato de los datos')
print(df_sellers.dtypes)
Muestra de datos
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
Formato del dataframe
(15, 1)
Búsqueda de valores nulos por columna
seller_name 0
dtype: int64
Formato de los datos
seller_name object
dtype: object
# Exploración del df de órdenes
print('Muestra de datos')
print(df_orders.head())
print('\nFormato del dataframe')
print(df_orders.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_orders.isnull().sum())
print('\nFormato de los datos')
print(df_orders.dtypes)
Muestra de datos
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
Formato del dataframe
(1000, 6)
Búsqueda de valores nulos por columna
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
Formato de los datos
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
df_articles['unit_price']=df_articles['unit_price'].astype(float)
# Creo una copia del df base
my_df = df_orders.copy()
# Cambio el índice del df artículos
df_articles.set_index('article_id', inplace=True)
print(my_df)
#print(df_articles)
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
[1000 rows x 6 columns]
# Agrego algunas columnas y pongo el campo que me va a servir de "ancla"
my_df = my_df.assign(article_name = my_df['article_id'])
my_df = my_df.assign(total_amount = my_df['article_id']) # voy a necesitar el unit_price
my_df = my_df.assign(seller_name = my_df['seller_id'])
# reeplazar los valores en el nuevo df
# df_articles[?]['article_name']
for i in range(len(my_df.index)):
# len... devuelve la cantidad de registros
article = df_articles.loc[my_df.loc[i]['article_name']]['article_name']
# reemplazo en la columna 'article_name'
my_df.loc[i,'article_name'] = article
# modificar la columna total_amount
my_df.loc[i,'total_amount'] = my_df.loc[i,'quantity'] * df_articles.loc[my_df.loc[i]['total_amount']]['unit_price']
seller=df_sellers.loc[my_df.loc[i]['seller_name']]['seller_name']
my_df.loc[i,'seller_name']=seller
# elimino las columnas que no necesito
my_df.drop(['order_id','article_id','seller_id'], axis='columns', inplace=True)
print(my_df)
week quantity country_name article_name total_amount seller_name
0 1 10 Peru Water Cooling 675.0 Cirilo Grandham
1 1 15 Peru Mouse 454.5 Ewell Peres
2 1 5 Bolivia Netbook 725.0 Janel O'Curran
3 1 9 Brazil Tablet 1170.0 Janel O'Curran
4 1 6 Mexico Case 227.4 Daisie Slograve
.. ... ... ... ... ... ...
995 4 1 Brazil Modem 67.5 Kati Innot
996 4 15 Brazil Heatsink 150.0 Daisie Slograve
997 4 2 Colombia Heatsink 20.0 Vasily Danilyuk
998 4 14 Brazil Tablet 1820.0 Vasily Danilyuk
999 4 12 Brazil SDD 264.0 Onida Cosely
[1000 rows x 6 columns]
# RESOLUCIÓN ANALÍTICA
my_df2=my_df[['article_name','quantity']]
my_df2=my_df2.groupby('article_name').sum('quantity')
my_df2=my_df2.sort_values('quantity',ascending=False)
print(my_df2.head(1))
quantity
article_name
HDD 413
# RESOLUCIÓN GRÁFICA
my_df2.plot(kind='bar',title='Unidades por artículo')
# RESOLUCIÓN ANALÍTICA
my_df3=my_df[['article_name','total_amount']]
my_df3=my_df3.groupby('article_name').sum('total_amount')
my_df3=my_df3.sort_values('total_amount',ascending=False)
print(my_df3.head(1))
total_amount
article_name
Full Pc 538335.93
# RESOLUCIÓN GRÁFICA
plt.bar(my_df3.index,my_df3['total_amount'])
plt.xticks(rotation=90)
plt.title('Monto por artículo')
plt.show()
# RESOLUCIÓN ANALÍTICA
my_df4=my_df[['seller_name','quantity','total_amount']]
my_df4=my_df4.groupby('seller_name').sum().sort_values('quantity',ascending=False)
print(my_df4.head(1))
quantity total_amount
seller_name
Janel O'Curran 703 192832.47
# RESOLUCIÓN GRÁFICA
plt.bar(my_df4.index,my_df4['quantity'])
plt.xticks(rotation=90)
plt.title('Unidades por vendedor')
plt.show()
# RESOLUCIÓN ANALÍTICA
my_df5=(my_df.groupby('week').sum()).sort_values('total_amount',ascending=False)
print(my_df5)
quantity total_amount
week
1 2449 507458.81
2 2444 415364.44
3 2114 329140.03
4 1058 223844.56
# RESOLUCIÓN GRÁFICA
plt.scatter(my_df5.index,my_df5['total_amount'])
plt.plot(my_df5.index,my_df5['total_amount'])
plt.title('Monto por semana')
plt.show()
# RESOLUCIÓN
my_df6=my_df[['country_name','quantity','total_amount']]
my_df6=my_df6.groupby('country_name').sum().sort_values('quantity',ascending=False)
print(my_df6.head(1))
quantity total_amount
country_name
Brazil 2515 441271.85
# RESOLUCIÓN
countries=my_df6.index.to_list()
countries=countries[:3]
my_df7=my_df[['week','country_name','quantity']]
my_df7=my_df[my_df['country_name'].isin(countries)]
my_df7=my_df7.groupby(['week','country_name']).sum().sort_values('quantity',ascending=False)
my_df7=my_df7.unstack('week')
my_df7=my_df7.stack()
print(my_df7)
sns.relplot(x='week',y='total_amount',col='country_name',data=my_df7)
quantity total_amount
country_name week
Argentina 1 275 63760.48
2 413 96789.13
3 160 26601.97
4 99 18681.20
Brazil 1 765 165289.90
2 664 106803.85
3 646 79341.50
4 440 89836.60
Peru 1 393 48807.97
2 293 56591.80
3 247 51855.31
4 94 4166.04
# RESOLUCIÓN
my_df8=my_df[my_df['week'] == 4]
my_df8=my_df8.groupby('seller_name').sum().sort_values('quantity',ascending=False)
my_df8=my_df8[['quantity','total_amount']]
print(my_df8.head(1))
quantity total_amount
seller_name
Aveline Swanwick 130 18878.9