# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install --upgrade pip
!pip install openpyxl==3.0.10
# imports
import numpy as np
import pandas as pd
import sqlite3 as sql3
import openpyxl # Para que lea bien el excel
import matplotlib.pyplot as plt
import seaborn as sns
# Levanto los datos en 3 diferentes dataframes
## Articulos
conn = sql3.connect('/work/data/articles.db')
df_articles = pd.read_sql_query('SELECT * FROM articles', conn)
## Vendedores
df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0)
## Ordenes
df_orders = pd.read_csv('/work/data/orders.csv')
# Exploración del df de artículos
print('Muestra de datos')
print(df_articles.head())
print('\nFormato del dataframe')
print(df_articles.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_articles.isnull().sum())
print('\nFormato de los datos por columna')
print(df_articles.dtypes)
print('\nLos IDs de producto son unicos?')
print(df_articles['article_id'].is_unique)
# Exploración del df de vendedores
print('Muestra de datos')
print(df_sellers.head())
print('\nFormato del dataframe')
print(df_sellers.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_sellers.isnull().sum())
print('\nFormato de los datos por columna')
print(df_sellers.dtypes)
# Exploración del df de órdenes
print('Muestra de datos')
print(df_orders.head())
print('\nFormato del dataframe')
print(df_orders.shape)
print('\nBúsqueda de valores nulos por columna')
print(df_orders.isnull().sum())
print('\nFormato de los datos por columna')
print(df_orders.dtypes)
print('\nLos IDs de ordenes son unicos?')
print(df_orders['order_id'].is_unique)
df_articles['unit_price'] = df_articles['unit_price'].astype(float)
print(df_articles.dtypes)
# Hacemos una copia
my_df = df_orders.copy()
df_articles.set_index('article_id', inplace=True)
# Creamos tres columnas nuevas
my_df = my_df.assign(article_name = '')
my_df = my_df.assign(total_amount = 1)
my_df = my_df.assign(seller_name = '')
for i in range(len(my_df.index)):
# Recupero el nombre del articulo
current_article_id = my_df.loc[i]['article_id']
current_article_name = df_articles.loc[current_article_id]['article_name']
my_df.loc[i,'article_name'] = current_article_name
# Calculo el monto total
current_article_unit_price = df_articles.loc[current_article_id]['unit_price']
current_quantity = my_df.loc[i, 'quantity']
my_df.loc[i,'total_amount'] = current_article_unit_price * current_quantity
# Recupero el nombre del vendedor
current_seller_id = my_df.loc[i]['seller_id']
current_seller_name = df_sellers.loc[current_seller_id]['seller_name']
my_df.loc[i,'seller_name'] = current_seller_name
my_df.drop(['order_id', 'article_id', 'seller_id'], axis = 'columns', inplace=True)
print(my_df)
# RESOLUCIÓN ANALÍTICA
sumatory_article_df = my_df.groupby('article_name').sum()
top_ten_articles_by_unit = sumatory_article_df.sort_values('quantity', ascending=False).head(10)
print(top_ten_articles_by_unit['quantity'].head(1))
# RESOLUCIÓN GRÁFICA
sns.barplot(y=top_ten_articles_by_unit["quantity"], x=top_ten_articles_by_unit.index)
plt.title('TOP TEN Articles by Units')
plt.xticks(rotation=90)
plt.xlabel('Articles')
plt.ylabel('Quantity')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_article_df = my_df.groupby(by='article_name').sum()
top_ten_articles_by_amount = sumatory_article_df.sort_values('total_amount', ascending=False).head(10)
print(top_ten_articles_by_amount['total_amount'].head(1))
# RESOLUCIÓN GRÁFICA
sns.barplot(x=top_ten_articles_by_amount['total_amount'], y=top_ten_articles_by_amount.index)
plt.title('TOP TEN Articles by Amount')
plt.ylabel('Articles')
plt.xlabel('Total Amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_sellers_df = my_df.groupby('seller_name').sum()
ranking_sellers_by_amount = sumatory_sellers_df.sort_values('total_amount', ascending=False)
print(ranking_sellers_by_amount['total_amount'].head(1))
# RESOLUCIÓN GRÁFICA
sns.barplot(y=ranking_sellers_by_amount['total_amount'], x=ranking_sellers_by_amount.index)
plt.xticks(rotation=90)
plt.xlabel('Seller')
plt.ylabel('Total amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_weeks_df = my_df.groupby('week').sum()
ranking_weeks_by_amount = sumatory_weeks_df.sort_values('total_amount', ascending=False)
weak_week = ranking_weeks_by_amount['total_amount'].tail(1)
print(ranking_weeks_by_amount)
# Semana mas debil (para lanzar campaña)
print(weak_week)
# RESOLUCIÓN GRÁFICA
ax = sns.barplot(y=ranking_weeks_by_amount['total_amount'], x=ranking_weeks_by_amount.index)
for i in ax.containers:
ax.bar_label(i,)
plt.xlabel('Week')
plt.ylabel('Total amount')
plt.show()
# RESOLUCIÓN ANALÍTICA
sumatory_countries_df = my_df.groupby('country_name').sum()
top_five_countries_by_amount = sumatory_countries_df.sort_values('total_amount', ascending=False).head(5)
print(top_five_countries_by_amount)
# RESOLUCIÓN GRÁFICA
colors = sns.color_palette('pastel')[0:5]
plt.pie(top_five_countries_by_amount['total_amount'], labels = top_five_countries_by_amount.index, colors = colors, autopct='%.0f%%')
plt.show()
# RESOLUCIÓN
first_quartil_limit = sumatory_sellers_df['total_amount'].quantile(q=0.25)
boolean_mask_df = sumatory_sellers_df['total_amount'] <= first_quartil_limit
candidate_for_course_df = sumatory_sellers_df[boolean_mask_df].sort_values('total_amount', ascending=False)
print(f'El limite para el primer cuartil es de ${round(first_quartil_limit, 2)} en montos totales de ventas personales.')
print(f'Los siguientes vendedores son candidatos a realizar el curso:')
for seller in (candidate_for_course_df.index):
amount = round(candidate_for_course_df['total_amount'][seller],2)
print(f'- {seller} (${amount})')
# RESOLUCIÓN
# Extraigo los articulos que me interesan de la muestra
boolean_mask_df = my_df['article_name'].isin(['Full Pc','Smartphone'])
selected_articles_df = my_df[boolean_mask_df].sort_values('country_name', ascending=True)
# Agrupo por Pais y Articulo para sumarizar
selected_articles_sumatory_df = selected_articles_df.groupby(['country_name', 'article_name']).sum()
# Paso los indices a columnas para ayudar a plotearlos
selected_articles_sumatory_df.reset_index(level=1, inplace=True)
plt.xticks(rotation=90)
sns.barplot(x=selected_articles_sumatory_df.index, y=selected_articles_sumatory_df['quantity'], hue=selected_articles_sumatory_df['article_name'])
plt.xlabel('Country')
plt.ylabel('Quantity')
plt.show()
# Para la respuesta coloquial voy a trabajar un poco mas los datos para tener el listado
# de paises.
resume_df = pd.DataFrame()
resume_df = resume_df.assign(smartphone_units = 0)
resume_df = resume_df.assign(full_pc_units = 0)
for index, row in selected_articles_sumatory_df.iterrows():
if (row['article_name'] == 'Smartphone'):
resume_df.loc[index,'smartphone_units'] = row['quantity']
if (row['article_name'] == 'Full Pc'):
resume_df.loc[index,'full_pc_units'] = row['quantity']
# Rellenamos con ceros los NaN
resume_df = resume_df.fillna(0)
resume_df = resume_df[resume_df['smartphone_units'] > resume_df['full_pc_units']]
print(f'Los paises en los que las unidad de smartphones sobrepasan las pc completas son:')
for country in (resume_df.index):
print(f'- {country}')