# A veces necesitamos instalar nuevas librerías en nuestros proyectos
# En este caso, instalamos openpyxl para trabajar con el DF sellers.xlsx
!pip install openpyxl==3.0.10
Collecting openpyxl==3.0.10
Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.1/242.1 KB 5.3 MB/s eta 0:00:00
Collecting et-xmlfile
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# imports
# Importamos los módulos estadísticos y graficadores, más los requeridos para trabajar los DF
import numpy as np
import pandas as pd
import sqlite3 as sql3
import openpyxl
import matplotlib.pyplot as plt
import seaborn as sns
# Levantamos los datos en 3 diferentes dataframes:
# Artículos:
conn = sql3.connect('/work/data/articles.db')
sql_query = pd.read_sql_query('SELECT * FROM articles', conn)
df_articles = pd.DataFrame(sql_query, columns = ['article_id', 'article_name', 'unit_price'])
# Vendedores:
df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0)
# Órdenes:
df_orders = pd.read_csv('/work/data/orders.csv')
df_orders
order_idint64
15024 - 16023
weekint64
1 - 4
10
15034
1
11
15035
1
12
15036
1
13
15037
1
14
15038
1
15
15039
1
16
15040
1
17
15041
1
18
15042
1
19
15043
1
# Exploración del df de artículos:
print('Muestra de datos')
print(df_articles.head())
print('\nFormato del dataframe')
print(df_articles.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_articles.isnull().sum())
print('\nFormato de los datos por columna')
print(df_articles.dtypes)
print('\nLos IDs de producto no son unicos?')
print(df_articles['article_id'].is_unique)
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del dataframe
(31, 3)
Búsqueda de valores nulos por columna
article_id 0
article_name 0
unit_price 0
dtype: int64
Formato de los datos por columna
article_id int64
article_name object
unit_price object
dtype: object
Los IDs de producto no son unicos?
True
# Exploración del df de vendedores:
print('Muestra de datos')
print(df_sellers.head())
print('\nFormato del dataframe')
print(df_sellers.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_sellers.isnull().sum())
print('\nFormato de los datos por columna')
print(df_sellers.dtypes)
Muestra de datos
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
Formato del dataframe
(15, 1)
Búsqueda de valores nulos por columna
seller_name 0
dtype: int64
Formato de los datos por columna
seller_name object
dtype: object
# Exploración del df de órdenes:
print('Muestra de datos')
print(df_orders.head())
print('\nFormato del dataframe')
print(df_orders.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_orders.isnull().sum())
print('\nFormato de los datos por columna')
print(df_orders.dtypes)
print('\nLos IDs de ordenes no son unicos?')
print(df_orders['order_id'].is_unique)
Muestra de datos
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
Formato del dataframe
(1000, 6)
Búsqueda de valores nulos por columna
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
Formato de los datos por columna
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
Los IDs de ordenes no son unicos?
True
df_articles['unit_price'] = df_articles['unit_price'].astype(float)
print(df_articles.dtypes)
article_id int64
article_name object
unit_price float64
dtype: object
# Creamos una copia del df_orders:
my_df = df_orders.copy()
# Cambiamos el índice del df de artículos:
df_articles.set_index('article_id',inplace = True)
# Agregamos columnas nuevas:
my_df = my_df.assign(article_name = '')
my_df = my_df.assign(total_amount = 1)
my_df = my_df.assign(seller_name = '')
for i in range(len(my_df.index)):
# Recuperamos el nombre del articulo:
current_article_id = my_df.loc[i]['article_id']
current_article_name = df_articles.loc[current_article_id]['article_name']
my_df.loc[i,'article_name'] = current_article_name
# Calculamos el monto total:
current_article_unit_price = df_articles.loc[current_article_id]['unit_price']
current_quantity = my_df.loc[i, 'quantity']
my_df.loc[i,'total_amount'] = current_article_unit_price * current_quantity
# Recuperemos el nombre del vendedor:
current_seller_id = my_df.loc[i]['seller_id']
current_seller_name = df_sellers.loc[current_seller_id]['seller_name']
my_df.loc[i,'seller_name'] = current_seller_name
my_df.drop(['order_id', 'article_id', 'seller_id'], axis = 'columns', inplace=True)
print(my_df)
week quantity country_name article_name total_amount seller_name
0 1 10 Peru Water Cooling 675.0 Cirilo Grandham
1 1 15 Peru Mouse 454.5 Ewell Peres
2 1 5 Bolivia Netbook 725.0 Janel O'Curran
3 1 9 Brazil Tablet 1170.0 Janel O'Curran
4 1 6 Mexico Case 227.4 Daisie Slograve
.. ... ... ... ... ... ...
995 4 1 Brazil Modem 67.5 Kati Innot
996 4 15 Brazil Heatsink 150.0 Daisie Slograve
997 4 2 Colombia Heatsink 20.0 Vasily Danilyuk
998 4 14 Brazil Tablet 1820.0 Vasily Danilyuk
999 4 12 Brazil SDD 264.0 Onida Cosely
[1000 rows x 6 columns]
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html
d1=pd.DataFrame({'mes': ['ene','feb','mar','abr'], 'ventas':[10,20,30,15]})
d2=pd.DataFrame({'mes': ['ene','feb','mar','abr'], 'costos':[7,16,25,12]})
print(pd.merge(d1,d2))
mes ventas costos
0 ene 10 7
1 feb 20 16
2 mar 30 25
3 abr 15 12
# RESOLUCIÓN ANALÍTICA
sumatory_article_df = my_df.groupby('article_name').sum()
top_ten_articles_by_unit = sumatory_article_df.sort_values('quantity', ascending=False).head(10)
print(top_ten_articles_by_unit['quantity'].head(5))
article_name
HDD 413
Tablet 374
SDD 372
Mouse 322
Netbook 320
Name: quantity, dtype: int64
# RESOLUCIÓN GRÁFICA
sns.barplot(y=top_ten_articles_by_unit["quantity"], x=top_ten_articles_by_unit.index)
plt.title('TOP TEN Articles by Units')
plt.xticks(rotation=60)
plt.xlabel('Articles')
plt.ylabel('Quantity')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_article_df = my_df.groupby(by='article_name').sum()
top_ten_articles_by_amount = sumatory_article_df.sort_values('total_amount', ascending=False).head(15)
print(top_ten_articles_by_amount['total_amount'].head(15))
article_name
Full Pc 538335.93
Notebook 251000.00
Smartphone 152250.00
Chair 69477.48
Tablet 48620.00
Monitor 47840.00
Netbook 46400.00
Scanner 40885.00
Motherboard 40268.58
CPU 37138.92
Desk 29012.30
Video Card 27483.50
HDD 22558.06
Water Cooling 17010.00
Modem 16470.00
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
# Tomo sólo los 15 artículos que más ingresos proporcionaron
sns.barplot(x=top_ten_articles_by_amount['total_amount'], y=top_ten_articles_by_amount.index)
plt.title('TOP TEN Articles by Total Amount')
plt.ylabel('Articles')
plt.xlabel('Total Amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_sellers_df = my_df.groupby('seller_name').sum()
ranking_sellers_by_amount = sumatory_sellers_df.sort_values('total_amount', ascending=False)
print(ranking_sellers_by_amount['total_amount'].head(15))
seller_name
Janel O'Curran 192832.47
Brockie Patience 142709.88
Oliviero Charkham 141329.76
Vasily Danilyuk 129157.55
Daisie Slograve 120520.11
Aveline Swanwick 118874.33
Arnold Kilkenny 94552.04
Kati Innot 83704.62
Jase Doy 80628.31
Ewell Peres 78144.32
Onida Cosely 77373.37
Milly Christoffe 61733.69
Tobin Roselli 56984.42
Cornie Wynrehame 52253.57
Cirilo Grandham 45009.40
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
sns.barplot(y=ranking_sellers_by_amount['total_amount'], x=ranking_sellers_by_amount.index)
plt.xticks(rotation=90)
plt.xlabel('Seller')
plt.ylabel('Total amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_weeks_df = my_df.groupby('week').sum()
ranking_weeks_by_amount = sumatory_weeks_df.sort_values('total_amount', ascending=False)
weak_week = ranking_weeks_by_amount['total_amount'].tail(1)
print(ranking_weeks_by_amount)
# Semana mas debil (para lanzar campaña)
print(weak_week)
quantity total_amount
week
1 2449 507458.81
2 2444 415364.44
3 2114 329140.03
4 1058 223844.56
week
4 223844.56
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
ax = sns.barplot(y=ranking_weeks_by_amount['total_amount'], x=ranking_weeks_by_amount.index)
for i in ax.containers:
ax.bar_label(i,)
plt.xlabel('Week')
plt.ylabel('Total Amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
# Los 5 paises en cuestion, debe ser extraido del DF my_df:
sumatory_countries_df = my_df.groupby('country_name').sum()
top_five_countries_by_amount = sumatory_countries_df.sort_values('total_amount', ascending=False).head(5)
print(top_five_countries_by_amount)
week quantity total_amount
country_name
Brazil 717 2515 441271.85
Argentina 241 947 205832.78
Colombia 230 881 177514.29
Peru 266 1027 161421.12
Mexico 237 846 138619.99
# RESOLUCIÓN GRÁFICA
colors = sns.color_palette('ch:s=.25,rot=-.25')[0:5]
plt.pie(top_five_countries_by_amount['total_amount'], labels = top_five_countries_by_amount.index, colors = colors, autopct='%.0f%%')
centre_circle = plt.Circle((0,0), 0.5, fc="white")
fig=plt.gcf()
fig.gca().add_artist(centre_circle)
# fin círculo
plt.savefig('plot.png', dpi=300)
plt.show()
# RESOLUCIÓN ANALÍTICA
df7 = my_df.query('week==4').groupby(by='seller_name').sum().sort_values(by='quantity', ascending=False).head(3)
print(df7['quantity'])
seller_name
Aveline Swanwick 130
Vasily Danilyuk 119
Ewell Peres 111
Name: quantity, dtype: int64
# RESOLUCIÓN GRÁFICA
cantidades_vendidas = df7['quantity'].values
for i, cant in enumerate(cantidades_vendidas):
plt.annotate(cant, (df7.index[i],cant))
plt.bar(df7.index,df7['quantity'])
plt.title('Vendedores que más artículos vendieron en la última semana', fontsize='large')
plt.xlabel('Vendedor')
plt.ylabel(ylabel='Cantidad')
plt.show()
# RESOLUCIÓN
# Extraigo los articulos que me interesan de la muestra.
boolean_mask_df = my_df['article_name'].isin(['Full Pc','Smartphone'])
selected_articles_df = my_df[boolean_mask_df].sort_values('country_name', ascending=True)
# Agrupo por Pais y Articulo para sumarizar.
selected_articles_sumatory_df = selected_articles_df.groupby(['country_name', 'article_name']).sum()
# Paso los indices a columnas para ayudar a plotearlos.
selected_articles_sumatory_df.reset_index(level=1, inplace=True)
plt.xticks(rotation=90)
sns.barplot(y=selected_articles_sumatory_df.index, x=selected_articles_sumatory_df['quantity'], hue=selected_articles_sumatory_df['article_name'])
plt.xlabel('Quantity')
plt.ylabel('Country')
plt.show()
# Trabajamos sobre los paises en los que las unidad de smartphones sobrepasan las pc:
resume_df = pd.DataFrame()
resume_df = resume_df.assign(smartphone_units = 0)
resume_df = resume_df.assign(full_pc_units = 0)
for index, row in selected_articles_sumatory_df.iterrows():
if (row['article_name'] == 'Smartphone'):
resume_df.loc[index,'smartphone_units'] = row['quantity']
if (row['article_name'] == 'Full Pc'):
resume_df.loc[index,'full_pc_units'] = row['quantity']
# Rellenamos con ceros los NaN:
resume_df = resume_df.fillna(0)
resume_df = resume_df[resume_df['smartphone_units'] > resume_df['full_pc_units']]
print(f'Los paises en los que las unidad de smartphones sobrepasan las pc completas son:')
for country in (resume_df.index):
print(f'- {country}')
Los paises en los que las unidad de smartphones sobrepasan las pc completas son:
- Brazil
- Chile
- Colombia
- Ecuador
- El Salvador
- Mexico
- Uruguay
- Venezuela