# A veces necesitamos instalar nuevas librerías en nuestros proyectos
!pip install openpyxl==3.0.10 # esta librería no viene incorporado en deepnote, entonces la voy a tener que instalar SIEMPRE desde la consola del SISTEMA OPERATIVO (!) y no desde el interprete de python.
# Pipelines: son los métodos que le voy a aplicar a cada dataframe sucesivamente, una serie de procesos donde lo que sale de un proceso se convierte en la entrada del siguiente.
Requirement already satisfied: openpyxl==3.0.10 in /root/venv/lib/python3.9/site-packages (3.0.10)
Requirement already satisfied: et-xmlfile in /root/venv/lib/python3.9/site-packages (from openpyxl==3.0.10) (1.1.0)
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# imports
import numpy as np
import pandas as pd
import sqlite3 as sql3
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# Load the source data into three dataframes.

# ARTICLES — from a SQLite database. read_sql_query already returns a
# DataFrame, so the extra pd.DataFrame(...) wrapper was redundant; the
# column subset keeps only the fields used downstream.
conn = sql3.connect('/work/data/articles.db')
try:
    df_articles = pd.read_sql_query('SELECT * FROM articles', conn)
    df_articles = df_articles[['article_id', 'article_name', 'unit_price']]
finally:
    conn.close()  # fix: the connection was previously never closed (resource leak)
#print(df_articles)

# SELLERS — from an Excel workbook; the first column (seller_id) becomes the index.
df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0)
#print(df_sellers)

# ORDERS — from a CSV file.
df_orders = pd.read_csv('/work/data/orders.csv')
print(df_orders)
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
[1000 rows x 6 columns]
# Quick exploration of the articles dataframe: sample rows, shape,
# null counts per column, and column dtypes.
for label, value in (
    ('Muestra de datos', df_articles.head()),
    ('\nFormato del dataframe', df_articles.shape),
    ('\nBúsqueda de valores nulos por columna', df_articles.isnull().sum()),
    ('\nFormato de los datos', df_articles.dtypes),
):
    print(label)
    print(value)
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del dataframe
(31, 3)
Búsqueda de valores nulos por columna
article_id 0
article_name 0
unit_price 0
dtype: int64
Formato de los datos
article_id int64
article_name object
unit_price object
dtype: object
# Quick exploration of the sellers dataframe: sample rows, shape,
# null counts per column, and column dtypes.
for label, value in (
    ('Muestra de datos', df_sellers.head()),
    ('\nFormato del dataframe', df_sellers.shape),
    ('\nBúsqueda de valores nulos por columna', df_sellers.isnull().sum()),
    ('\nFormato de los datos', df_sellers.dtypes),
):
    print(label)
    print(value)
Muestra de datos
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
Formato del dataframe
(15, 1)
Búsqueda de valores nulos por columna
seller_name 0
dtype: int64
Formato de los datos
seller_name object
dtype: object
# Quick exploration of the orders dataframe: sample rows, shape,
# null counts per column, and column dtypes.
for label, value in (
    ('Muestra de datos', df_orders.head()),
    ('\nFormato del dataframe', df_orders.shape),
    ('\nBúsqueda de valores nulos por columna', df_orders.isnull().sum()),
    ('\nFormato de los datos', df_orders.dtypes),
):
    print(label)
    print(value)
Muestra de datos
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
Formato del dataframe
(1000, 6)
Búsqueda de valores nulos por columna
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
Formato de los datos
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
# Convert unit_price from object (string) to float so it can be used in arithmetic.
df_articles = df_articles.astype({'unit_price': float})
print(df_articles.dtypes)
article_id int64
article_name object
unit_price float64
dtype: object
# Work on a copy of the orders dataframe so the raw data stays untouched.
my_df = df_orders.copy()
# Re-index the articles dataframe by article_id so lookups by id are direct.
df_articles = df_articles.set_index('article_id')
print(my_df)
#print(df_articles)
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
[1000 rows x 6 columns]
# Enrich the orders copy with the article name, the line total
# (quantity * unit price) and the seller name.
# Fix: the original iterated row by row with repeated .loc lookups and
# chained indexing — O(n) pandas lookups per row. Vectorized Series.map
# against the id-indexed lookup tables does the same joins in one pass.
my_df['article_name'] = my_df['article_id'].map(df_articles['article_name'])
my_df['total_amount'] = my_df['quantity'] * my_df['article_id'].map(df_articles['unit_price'])
my_df['seller_name'] = my_df['seller_id'].map(df_sellers['seller_name'])
print(my_df)
order_id week article_id quantity seller_id country_name \
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
article_name total_amount seller_name
0 Water Cooling 675.0 Cirilo Grandham
1 Mouse 454.5 Ewell Peres
2 Netbook 725.0 Janel O'Curran
3 Tablet 1170.0 Janel O'Curran
4 Case 227.4 Daisie Slograve
.. ... ... ...
995 Modem 67.5 Kati Innot
996 Heatsink 150.0 Daisie Slograve
997 Heatsink 20.0 Vasily Danilyuk
998 Tablet 1820.0 Vasily Danilyuk
999 SDD 264.0 Onida Cosely
[1000 rows x 9 columns]
# Remove the id columns — the human-readable name columns replace them now.
unused = ['order_id', 'article_id', 'seller_id']
my_df.drop(columns=unused, inplace=True)  # inplace: mutate my_df, no copy returned
print(my_df)
week quantity country_name article_name total_amount seller_name
0 1 10 Peru Water Cooling 675.0 Cirilo Grandham
1 1 15 Peru Mouse 454.5 Ewell Peres
2 1 5 Bolivia Netbook 725.0 Janel O'Curran
3 1 9 Brazil Tablet 1170.0 Janel O'Curran
4 1 6 Mexico Case 227.4 Daisie Slograve
.. ... ... ... ... ... ...
995 4 1 Brazil Modem 67.5 Kati Innot
996 4 15 Brazil Heatsink 150.0 Daisie Slograve
997 4 2 Colombia Heatsink 20.0 Vasily Danilyuk
998 4 14 Brazil Tablet 1820.0 Vasily Danilyuk
999 4 12 Brazil SDD 264.0 Onida Cosely
[1000 rows x 6 columns]
# ANALYTICAL ANSWER: best-selling article by total units.
# Fix: restrict the groupby-sum to the numeric columns; summing the string
# columns (country_name, seller_name) was wasteful and concatenates text
# in modern pandas.
my_df2 = my_df.groupby('article_name')[['quantity', 'total_amount']].sum()
por_cant = my_df2.sort_values('quantity', ascending=False)
print(por_cant['quantity'].head(1))
article_name
HDD 413
Name: quantity, dtype: int64
# GRAPHICAL ANSWER: order count per article as a distribution plot.
sns.displot(data=my_df, x='article_name')
plt.xticks(rotation=80)  # tilt the long article names so they stay readable
plt.show()
# Possible styling to add later: title, axis labels, colors.
# ANALYTICAL ANSWER: top-5 articles by total revenue.
# Fix: restrict the groupby-sum to the numeric columns (summing the string
# columns is wasteful and concatenates text in modern pandas); dead
# commented-out variant removed.
my_df3 = (
    my_df.groupby('article_name')[['quantity', 'total_amount']]
    .sum()
    .sort_values('total_amount', ascending=False)
    .head(5)
)
print(my_df3['total_amount'])
article_name
Full Pc 538335.93
Notebook 251000.00
Smartphone 152250.00
Chair 69477.48
Tablet 48620.00
Name: total_amount, dtype: float64
# GRAPHICAL ANSWER.
# CAUTION: not a share-of-total comparison — only the top-5 articles are in
# this dataframe, so percentage labels would be misleading.
top5_revenue = my_df3['total_amount']
plt.pie(x=top5_revenue, labels=top5_revenue.index)
plt.show()
# ANALYTICAL ANSWER: sellers ranked by total revenue.
# Fix: restrict the groupby-sum to the numeric columns (summing the string
# country_name column was wasteful and concatenates text in modern pandas),
# and simplify the odd ['quantity']+['total_amount'] list concatenation.
my_df4 = (
    my_df.groupby('seller_name')[['quantity', 'total_amount']]
    .sum()
    .sort_values('total_amount', ascending=False)
)
print(my_df4[['quantity', 'total_amount']])  # show both columns side by side
quantity total_amount
seller_name
Janel O'Curran 703 192832.47
Brockie Patience 441 142709.88
Oliviero Charkham 555 141329.76
Vasily Danilyuk 521 129157.55
Daisie Slograve 554 120520.11
Aveline Swanwick 629 118874.33
Arnold Kilkenny 583 94552.04
Kati Innot 512 83704.62
Jase Doy 582 80628.31
Ewell Peres 496 78144.32
Onida Cosely 535 77373.37
Milly Christoffe 442 61733.69
Tobin Roselli 519 56984.42
Cornie Wynrehame 523 52253.57
Cirilo Grandham 470 45009.40
# GRAPHICAL ANSWER: revenue per seller as a bar chart.
sellers = my_df4.index
revenue = my_df4['total_amount']
plt.bar(sellers, revenue)
plt.xticks(rotation=60)  # tilt the seller names so they don't overlap
plt.show()
# ANALYTICAL ANSWER: weekly totals, highest-revenue week first.
# Fix: restrict the groupby-sum to the numeric columns; summing the string
# columns was wasteful and concatenates text in modern pandas.
my_df5 = (
    my_df.groupby('week')[['quantity', 'total_amount']]
    .sum()
    .sort_values('total_amount', ascending=False)
)
print(my_df5)
quantity total_amount
week
1 2449 507458.81
2 2444 415364.44
3 2114 329140.03
4 1058 223844.56
# A pie chart would also work here:
# plt.pie(x=my_df5['total_amount'], labels=my_df5.index)
# plt.show()
my_df5['total_amount'].median()  # bare expression: notebook displays the median weekly revenue
# GRAPHICAL ANSWER: weekly revenue as a bar chart.
weeks = my_df5.index
weekly_revenue = my_df5['total_amount']
plt.bar(weeks, weekly_revenue, color="red", alpha=0.7)
plt.show()
# ANALYTICAL ANSWER: top-5 countries by "profit" (quantity * total_amount).
# Fix: .copy() gives us an owned frame, eliminating the
# SettingWithCopyWarning the original column-slice assignment triggered.
cols = ["quantity", "country_name", "total_amount"]
my_df6 = my_df[cols].copy()
my_df6["profit"] = my_df6["quantity"] * my_df6["total_amount"]
my_df6 = my_df6.groupby("country_name").sum()
my_df6.sort_values("profit", ascending=False, inplace=True)
my_df6 = my_df6.head()
print(my_df6)
quantity total_amount profit
country_name
Brazil 2515 441271.85 4559717.13
Argentina 947 205832.78 2129962.50
Colombia 881 177514.29 1804661.91
Peru 1027 161421.12 1556251.78
Mexico 846 138619.99 1426749.47
<ipython-input-21-00033b0ec374>:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
my_df6["profit"] = my_df6.quantity * my_df6.total_amount
# Bar chart of the top-5 countries by profit.
profit_by_country = my_df6["profit"]
plt.bar(profit_by_country.index, profit_by_country, color="red", alpha=0.5)
plt.show()
# ANALYTICAL ANSWER: units sold per Mercosur country per week.
# Fix: Series.isin replaces the four hand-written OR comparisons — same
# filter, one idiomatic call.
cols1 = ["week", "country_name", "quantity"]
mercosur = ["Argentina", "Brazil", "Uruguay", "Paraguay"]
my_df7 = my_df[cols1]
my_df7 = my_df7[my_df7["country_name"].isin(mercosur)]
my_df7 = my_df7.groupby(["country_name", "week"]).sum()
print(my_df7)
quantity
country_name week
Argentina 1 275
2 413
3 160
4 99
Brazil 1 765
2 664
3 646
4 440
Paraguay 1 17
2 58
3 45
4 3
Uruguay 1 8
2 33
3 36
4 15
# ANALYTICAL ANSWER: raw order rows (week, country, quantity) for the
# Mercosur countries, before any aggregation.
# Fix: Series.isin replaces the four hand-written OR comparisons.
cols1 = ["week", "country_name", "quantity"]
mercosur = ["Argentina", "Brazil", "Uruguay", "Paraguay"]
my_df8 = my_df[cols1]
my_df8 = my_df8[my_df8["country_name"].isin(mercosur)]
print(my_df8)
week country_name quantity
3 1 Brazil 9
10 1 Brazil 1
11 1 Argentina 4
14 1 Brazil 15
15 1 Brazil 11
.. ... ... ...
993 4 Brazil 8
995 4 Brazil 1
996 4 Brazil 15
998 4 Brazil 14
999 4 Brazil 12
[457 rows x 3 columns]
my_df8.columns  # bare expression: notebook displays the column names of the Mercosur subset
# Pairwise scatter/histogram grid of the Mercosur subset's numeric columns.
sns.pairplot(my_df8)
# Same grid, colored by week.
sns.pairplot(my_df8, hue='week')
# Standardize week and quantity to zero mean / unit variance.
# NOTE(review): this scales my_df (ALL orders), not my_df8 (Mercosur only),
# even though the surrounding cells work with my_df8 — confirm which
# dataframe was intended.
escala = StandardScaler()
escalado = escala.fit_transform(my_df[['week', 'quantity']])
escalado.T  # bare expression: notebook displays the transposed (2, n) array
# Covariance matrix of the standardized variables (np.cov treats rows as variables).
matriz_cov = np.cov(escalado.T)
matriz_cov  # bare expression: notebook displays the 2x2 matrix
# Heatmap of the covariance matrix, annotated with the values.
plt.figure(figsize=(9,9))
mapa_calor = sns.heatmap(matriz_cov, fmt='.2f', square=True, cbar=True, annot=True, xticklabels=['week', 'quantity'], yticklabels=['week', 'quantity'])