import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import session_info
# Para que no me tire warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
items = pd.read_csv('/work/data/olist_order_items_dataset.csv')
payments = pd.read_csv('/work/data/olist_order_payments_dataset.csv')
reviews = pd.read_csv('/work/data/olist_order_reviews_dataset.csv')
orders = pd.read_csv('/work/data/olist_orders_dataset.csv')
products = pd.read_csv('/work/data/olist_products_dataset.csv')
category_product = pd.read_csv('/work/data/product_category_name_translation.csv')
#Comprobar si tengo la misma cantidad de categorías
(
products["product_category_name"].nunique()
==
category_product["product_category_name"].shape[0]
)
contar_en_products = products["product_category_name"].nunique()
contar_en_category = category_product["product_category_name"].shape[0]
print(f'''Categorías en tabla products: {contar_en_products}
Categorías en tabla category_product: {contar_en_category}''')
ver_distintos = (
products[~products["product_category_name"]
.isin(category_product["product_category_name"])]['product_category_name']
)
ver_distintos.drop_duplicates()
products.query("product_category_name=='pc_gamer'")['product_id'].count()
products.query('product_category_name.isnull()')['product_id'].count()
products.query("product_category_name=='pcs'")['product_id'].count()
(
products.query(
"product_category_name=='portateis_cozinha_e_preparadores_de_alimentos'"
)['product_id'].count()
)
(
products.query(
"product_category_name=='portateis_casa_forno_e_cafe'"
)['product_id'].count()
)
products['product_category_name'] = (
products['product_category_name'].fillna('sin_categorizar')
)
products.query('product_category_name.isnull()')['product_id'].count()
products.query("product_category_name=='sin_categorizar'")['product_id'].count()
category_product = category_product.append(
{'product_category_name': 'sin_categorizar',
'product_category_name_english': 'without_category'}, ignore_index=True)
'''
Hay categorías que tenían un 2 como si fuera una segunda parte :P
Entonces los borré de las dos tablas
'''
products["product_category_name"] = (
products["product_category_name"].replace("_2", '', regex=True)
)
category_product = category_product.replace("_2", '', regex=True)
category_product = category_product.drop_duplicates('product_category_name')
category_product.sample(3)
category_product['product_category_name_english'] = (
category_product['product_category_name_english'].replace("_", ' ', regex=True)
).str.capitalize()
category_product.sample(4)
'''
Renombrar las categorías
pc_gamer == pcs
portateis_cozinha_e_preparadores_de_alimentos = portateis_casa_forno_e_cafe
'''
products.loc[products['product_category_name'] == 'pc_gamer', 'product_category_name'] = 'pcs'
products.loc[products['product_category_name'] == 'portateis_cozinha_e_preparadores_de_alimentos', 'product_category_name'] = 'portateis_casa_forno_e_cafe'
(
products["product_category_name"].nunique()
==
category_product["product_category_name"].shape[0]
)
take_product = (
items.groupby(['product_id'])['price', 'freight_value'].sum().round()
.reset_index()
)
take_product = take_product.astype({'price': 'int64', 'freight_value': 'int64'})
take_product.sample(4)
product_with_category = (
products.groupby(['product_category_name', 'product_id'])['product_id'].size()
.reset_index(name='total')
)
del(product_with_category['total'])
product_with_category.head()
take_category = product_with_category.merge(take_product, on='product_id')
take_category.columns = take_category.columns.str.replace('price', 'take')
take_category.sample(6)
sum_take_category = (
take_category.groupby(['product_category_name'])['take', 'freight_value'].sum()
.reset_index()
)
sum_take_category.sample(5)
table_questions_123 = (
category_product.merge(sum_take_category, on='product_category_name')
.drop('product_category_name', 1)
)
question_1 = table_questions_123.sort_values('take')
fig = px.bar(
question_1[['product_category_name_english', 'take']][-5::],
y='product_category_name_english', x="take",
title="Take of Product Categories",
text="take", template="ggplot2",
labels = {'take': 'Take', 'product_category_name_english': 'Product Category'}
)
fig.update_layout(showlegend=False)
fig.show()
q2and3 = table_questions_123.sort_values('freight_value')
q2and3['margin_of_gain'] = q2and3['freight_value'] / q2and3['take']
tabla_q23 = q2and3.copy()
tabla_q23['margin_of_gain'] = tabla_q23['margin_of_gain'].apply('{:.0%}'.format)
fig = px.bar(
tabla_q23[-5::],
y='product_category_name_english', x="freight_value",
title="Freight and Margin of gain",
text="margin_of_gain", template="seaborn",
labels = {'freight_value': 'Freight Cost',
'product_category_name_english': 'Product Category',
'margin_of_gain': 'Margin if gain'}
)
fig.update_layout(showlegend=False)
fig.show()
question_3 = q2and3.sort_values('margin_of_gain', ascending=0)
question_3['margin_of_gain'] = question_3['margin_of_gain'].apply('{:.0%}'.format)
question_3[0:1]
# A ver si tiene nulos
reviews.query("review_score.isnull()")['review_id'].count()
# Cuantas órdenes tienen cada producto
total_product_orders = (
items.groupby(['product_id', 'order_id'])['order_id'].size()
.reset_index(name='total_orders')
)
total_product_orders['total_rake'] = round(
items['price'] * total_product_orders['total_orders'])
total_product_orders.sort_values('order_id').sample(5)
#Promedio de nota de reviews por producto
total_product_reviews = (
reviews.groupby(['order_id'])['review_score'].mean()
.reset_index()
)
total_product_reviews.sample(4)
merge_order_review = (
pd.merge(total_product_orders, total_product_reviews, on='order_id', how='left')
)
without_review = merge_order_review[merge_order_review['review_score'].isnull()]
merge_order_review = (
merge_order_review.drop(without_review.index)
.sort_values('product_id')
)
merge_order_review.head(5)
tabla_pregunta_4 = (
merge_order_review.groupby(['product_id']).aggregate(
{'total_orders': 'sum', 'total_rake': 'sum', 'review_score': 'mean'})
.reset_index()
.sort_values('total_orders', ascending=0)
)
tabla_pregunta_4.head()
(
tabla_pregunta_4.loc[tabla_pregunta_4['total_rake'].idxmax(), 'product_id']
==
tabla_pregunta_4.loc[tabla_pregunta_4['review_score'].idxmax(), 'product_id']
)
para_graficar = (
tabla_pregunta_4[tabla_pregunta_4["total_orders"]
> tabla_pregunta_4["total_orders"].quantile(q=0.999)]
)
f, axs = plt.subplots(1, 2, figsize=(8, 4), gridspec_kw=dict(width_ratios=[4, 3]))
sns.heatmap(
tabla_pregunta_4.corr()[['review_score']],
annot=True, cmap='Spectral', linewidths=2, linecolor='indigo'
, ax=axs[0]
)
sns.scatterplot(
x ='total_orders',y='review_score', data=para_graficar, ax=axs[1]
)
f.tight_layout()
payment_type = (
payments.groupby(['payment_type']).aggregate(
{'order_id': 'count', 'payment_value': 'sum'})
.reset_index()
.sort_values('order_id', ascending=0)
)
payment_type.payment_value = payment_type.payment_value.astype('int64')
payment_type['payment_type'] = (
payment_type['payment_type'].replace("_", ' ', regex=True)
).str.capitalize()
payment_type = payment_type.rename(
columns={payment_type.columns[0]: 'Payment Type',
payment_type.columns[1]: 'Orders',
payment_type.columns[2]: 'Cash Flow'}
)
(
payment_type.loc[payment_type['Orders'].idxmax(), 'Payment Type']
==
payment_type.loc[payment_type['Cash Flow'].idxmax(), 'Payment Type']
)
payment_type
best_selling_product_category = (
take_category.query("product_id=='aca2eb7d00ea1a7b8ebd4e68314663af'")
)
selling_category = (
category_product.merge(best_selling_product_category, on='product_category_name')
.drop('product_category_name', 1)
)
category_name = selling_category['product_category_name_english'][0]
question_1_asc = question_1.sort_values('take', ascending=0)
puesto = question_1_asc.index.get_loc(
question_1_asc[question_1_asc['product_category_name_english']
== 'Furniture decor'].index[0]
) + 1
porcentaje = int(
100 * best_selling_product_category['take'] / question_1_asc['take'][4]
)
print(f'''El producto que más se vende está en la categoría de {category_name},
cuya misma se encuentra en el puesto {puesto} de las categorías que más recaudan,
y las ventas del producto equivale al {porcentaje}% de la recaudación de su categoría''')
session_info.show()