# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install --upgrade pip
!pip install openpyxl==3.0.10
Requirement already satisfied: pip in /root/venv/lib/python3.9/site-packages (22.0.4)
Collecting pip
Downloading pip-22.1.2-py3-none-any.whl (2.1 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.1/2.1 MB 31.3 MB/s eta 0:00:00
Installing collected packages: pip
Attempting uninstall: pip
Found existing installation: pip 22.0.4
Uninstalling pip-22.0.4:
Successfully uninstalled pip-22.0.4
Successfully installed pip-22.1.2
Collecting openpyxl==3.0.10
Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.1/242.1 kB 30.6 MB/s eta 0:00:00
Collecting et-xmlfile
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
# imports
import numpy as np
import pandas as pd
import sqlite3 as sql3
import openpyxl # Para que lea bien el excel
import matplotlib.pyplot as plt
import seaborn as sns
# Levanto los datos en 3 diferentes dataframes
## Articulos
conn = sql3.connect('/work/data/articles.db')
df_articles = pd.read_sql_query('SELECT * FROM articles', conn)
## Vendedores
df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0)
## Ordenes
df_orders = pd.read_csv('/work/data/orders.csv')
# Exploración del df de artículos
print('Muestra de datos')
print(df_articles.head())
print('\nFormato del dataframe')
print(df_articles.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_articles.isnull().sum())
print('\nFormato de los datos por columna')
print(df_articles.dtypes)
print('\nLos IDs de producto son unicos?')
print(df_articles['article_id'].is_unique)
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del dataframe
(31, 3)
Búsqueda de valores nulos por columna
article_id 0
article_name 0
unit_price 0
dtype: int64
Formato de los datos por columna
article_id int64
article_name object
unit_price object
dtype: object
Los IDs de producto son unicos?
True
# Exploración del df de vendedores
print('Muestra de datos')
print(df_sellers.head())
print('\nFormato del dataframe')
print(df_sellers.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_sellers.isnull().sum())
print('\nFormato de los datos por columna')
print(df_sellers.dtypes)
Muestra de datos
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
Formato del dataframe
(15, 1)
Búsqueda de valores nulos por columna
seller_name 0
dtype: int64
Formato de los datos por columna
seller_name object
dtype: object
# Exploración del df de órdenes
print('Muestra de datos')
print(df_orders.head())
print('\nFormato del dataframe')
print(df_orders.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_orders.isnull().sum())
print('\nFormato de los datos por columna')
print(df_orders.dtypes)
print('\nLos IDs de ordenes son unicos?')
print(df_orders['order_id'].is_unique)
Muestra de datos
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
Formato del dataframe
(1000, 6)
Búsqueda de valores nulos por columna
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
Formato de los datos por columna
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
Los IDs de ordenes son unicos?
True
df_articles['unit_price'] = df_articles['unit_price'].astype(float)
print(df_articles.dtypes)
article_id int64
article_name object
unit_price float64
dtype: object
# Hacemos una copia
my_df = df_orders.copy()
df_articles.set_index('article_id', inplace=True)
# Creamos tres columnas nuevas
my_df = my_df.assign(article_name = '')
my_df = my_df.assign(total_amount = 1)
my_df = my_df.assign(seller_name = '')
for i in range(len(my_df.index)):
# Recupero el nombre del articulo
current_article_id = my_df.loc[i]['article_id']
current_article_name = df_articles.loc[current_article_id]['article_name']
my_df.loc[i,'article_name'] = current_article_name
# Calculo el monto total
current_article_unit_price = df_articles.loc[current_article_id]['unit_price']
current_quantity = my_df.loc[i, 'quantity']
my_df.loc[i,'total_amount'] = current_article_unit_price * current_quantity
# Recupero el nombre del vendedor
current_seller_id = my_df.loc[i]['seller_id']
current_seller_name = df_sellers.loc[current_seller_id]['seller_name']
my_df.loc[i,'seller_name'] = current_seller_name
my_df.drop(['order_id', 'article_id', 'seller_id'], axis = 'columns', inplace=True)
print(my_df)
week quantity country_name article_name total_amount seller_name
0 1 10 Peru Water Cooling 675.0 Cirilo Grandham
1 1 15 Peru Mouse 454.5 Ewell Peres
2 1 5 Bolivia Netbook 725.0 Janel O'Curran
3 1 9 Brazil Tablet 1170.0 Janel O'Curran
4 1 6 Mexico Case 227.4 Daisie Slograve
.. ... ... ... ... ... ...
995 4 1 Brazil Modem 67.5 Kati Innot
996 4 15 Brazil Heatsink 150.0 Daisie Slograve
997 4 2 Colombia Heatsink 20.0 Vasily Danilyuk
998 4 14 Brazil Tablet 1820.0 Vasily Danilyuk
999 4 12 Brazil SDD 264.0 Onida Cosely
[1000 rows x 6 columns]
# RESOLUCIÓN ANALÍTICA
sumatory_article_df = my_df.groupby('article_name').sum()
top_ten_articles_by_unit = sumatory_article_df.sort_values('quantity', ascending=False).head(10)
print(top_ten_articles_by_unit['quantity'].head(1))
article_name
HDD 413
Name: quantity, dtype: int64
# RESOLUCIÓN GRÁFICA
sns.barplot(y=top_ten_articles_by_unit["quantity"], x=top_ten_articles_by_unit.index)
plt.title('TOP TEN Articles by Units')
plt.xticks(rotation=90)
plt.xlabel('Articles')
plt.ylabel('Quantity')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_article_df = my_df.groupby(by='article_name').sum()
top_ten_articles_by_amount = sumatory_article_df.sort_values('total_amount', ascending=False).head(10)
print(top_ten_articles_by_amount['total_amount'].head(1))
article_name
Full Pc 538335.93
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
sns.barplot(x=top_ten_articles_by_amount['total_amount'], y=top_ten_articles_by_amount.index)
plt.title('TOP TEN Articles by Amount')
plt.ylabel('Articles')
plt.xlabel('Total Amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_sellers_df = my_df.groupby('seller_name').sum()
ranking_sellers_by_amount = sumatory_sellers_df.sort_values('total_amount', ascending=False)
print(ranking_sellers_by_amount['total_amount'].head(1))
seller_name
Janel O'Curran 192832.47
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
sns.barplot(y=ranking_sellers_by_amount['total_amount'], x=ranking_sellers_by_amount.index)
plt.xticks(rotation=90)
plt.xlabel('Seller')
plt.ylabel('Total amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_weeks_df = my_df.groupby('week').sum()
ranking_weeks_by_amount = sumatory_weeks_df.sort_values('total_amount', ascending=False)
weak_week = ranking_weeks_by_amount['total_amount'].tail(1)
print(ranking_weeks_by_amount)
# Semana mas debil (para lanzar campaña)
print(weak_week)
quantity total_amount
week
1 2449 507458.81
2 2444 415364.44
3 2114 329140.03
4 1058 223844.56
week
4 223844.56
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
ax = sns.barplot(y=ranking_weeks_by_amount['total_amount'], x=ranking_weeks_by_amount.index)
for i in ax.containers:
ax.bar_label(i,)
plt.xlabel('Week')
plt.ylabel('Total amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_countries_df = my_df.groupby('country_name').sum()
top_five_countries_by_amount = sumatory_countries_df.sort_values('total_amount', ascending=False).head(5)
print(top_five_countries_by_amount)
week quantity total_amount
country_name
Brazil 717 2515 441271.85
Argentina 241 947 205832.78
Colombia 230 881 177514.29
Peru 266 1027 161421.12
Mexico 237 846 138619.99
# RESOLUCIÓN GRÁFICA
colors = sns.color_palette('pastel')[0:5]
plt.pie(top_five_countries_by_amount['total_amount'], labels = top_five_countries_by_amount.index, colors = colors, autopct='%.0f%%')
plt.show()
# RESOLUCIÓN
first_quartil_limit = sumatory_sellers_df['total_amount'].quantile(q=0.25)
boolean_mask_df = sumatory_sellers_df['total_amount'] <= first_quartil_limit
candidate_for_course_df = sumatory_sellers_df[boolean_mask_df].sort_values('total_amount', ascending=False)
print(f'El limite para el primer cuartil es de ${round(first_quartil_limit, 2)} en montos totales de ventas personales.')
print(f'Los siguientes vendedores son candidatos a realizar el curso:')
for seller in (candidate_for_course_df.index):
amount = round(candidate_for_course_df['total_amount'][seller],2)
print(f'- {seller} (${amount})')
El limite para el primer cuartil es de $69553.53 en montos totales de ventas personales.
Los siguientes vendedores son candidatos a realizar el curso:
- Milly Christoffe ($61733.69)
- Tobin Roselli ($56984.42)
- Cornie Wynrehame ($52253.57)
- Cirilo Grandham ($45009.4)
# RESOLUCIÓN
# Extraigo los articulos que me interesan de la muestra
boolean_mask_df = my_df['article_name'].isin(['Full Pc','Smartphone'])
selected_articles_df = my_df[boolean_mask_df].sort_values('country_name', ascending=True)
# Agrupo por Pais y Articulo para sumarizar
selected_articles_sumatory_df = selected_articles_df.groupby(['country_name', 'article_name']).sum()
# Paso los indices a columnas para ayudar a plotearlos
selected_articles_sumatory_df.reset_index(level=1, inplace=True)
plt.xticks(rotation=90)
sns.barplot(x=selected_articles_sumatory_df.index, y=selected_articles_sumatory_df['quantity'], hue=selected_articles_sumatory_df['article_name'])
plt.xlabel('Country')
plt.ylabel('Quantity')
plt.show()
# Para la respuesta coloquial voy a trabajar un poco mas los datos para tener el listado
# de paises.
resume_df = pd.DataFrame()
resume_df = resume_df.assign(smartphone_units = 0)
resume_df = resume_df.assign(full_pc_units = 0)
for index, row in selected_articles_sumatory_df.iterrows():
if (row['article_name'] == 'Smartphone'):
resume_df.loc[index,'smartphone_units'] = row['quantity']
if (row['article_name'] == 'Full Pc'):
resume_df.loc[index,'full_pc_units'] = row['quantity']
# Rellenamos con ceros los NaN
resume_df = resume_df.fillna(0)
resume_df = resume_df[resume_df['smartphone_units'] > resume_df['full_pc_units']]
print(f'Los paises en los que las unidad de smartphones sobrepasan las pc completas son:')
for country in (resume_df.index):
print(f'- {country}')
Los paises en los que las unidad de smartphones sobrepasan las pc completas son:
- Brazil
- Chile
- Colombia
- Ecuador
- El Salvador
- Mexico
- Uruguay
- Venezuela