TP Final - C#22039 - Francisco Branca

# A veces necesitamos instalar nuevas librerías en nuestros proyectos !pip install openpyxl==3.0.10

Collecting openpyxl==3.0.10
  Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.1/242.1 KB 11.7 MB/s eta 0:00:00
Collecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.

# imports import numpy as np import pandas as pd import sqlite3 as sql3 import openpyxl # para que levante bien el excel import matplotlib.pyplot as plt import seaborn as sns

# Levanto los datos en 3 diferentes dataframes # Artículos conn = sql3.connect('/work/data/articles.db') sql_query = pd.read_sql_query('SELECT * FROM articles', conn) df_articles = pd.DataFrame(sql_query, columns = ['article_id', 'article_name', 'unit_price']) # Vendedores df_sellers = pd.read_excel('/work/data/sellers.xlsx', index_col=0) # Órdenes df_orders = pd.read_csv('/work/data/orders.csv')

# Exploración del df de artículos print('Muestra de datos') print(df_articles.head()) print('\nFormato del dataframe') print(df_articles.shape) print('\nBúsqueda de valores nulos por columna') print(df_articles.isnull().sum()) print('\nFormato de los datos por columna') print(df_articles.dtypes)

Muestra de datos
   article_id article_name unit_price
0       20015   Smartphone     525.00
1       20016      Full Pc    2127.81
2       20017      Monitor     230.00
3       20018       Tablet     130.00
4       20019         Desk     130.10

Formato del dataframe
(31, 3)

Búsqueda de valores nulos por columna
article_id      0
article_name    0
unit_price      0
dtype: int64

Formato de los datos por columna
article_id       int64
article_name    object
unit_price      object
dtype: object

# Exploración del df de vendedores print('Muestra de datos') print(df_sellers.head()) print('\nFormato del dataframe') print(df_sellers.shape) print('\nBúsqueda de valores nulos por columna') print(df_sellers.isnull().sum()) print('\nFormato de los datos por columna') print(df_sellers.dtypes)

Muestra de datos
                 seller_name
seller_id                   
1           Aveline Swanwick
2                   Jase Doy
3          Oliviero Charkham
4           Cornie Wynrehame
5                Ewell Peres

Formato del dataframe
(15, 1)

Búsqueda de valores nulos por columna
seller_name    0
dtype: int64

Formato de los datos por columna
seller_name    object
dtype: object

# Exploración del df de órdenes print('Muestra de datos') print(df_orders.head()) print('\nFormato del dataframe') print(df_orders.shape) print('\nBúsqueda de valores nulos por columna') print(df_orders.isnull().sum()) print('\nFormato de los datos por columna') print(df_orders.dtypes)

Muestra de datos
   order_id  week  article_id  quantity  seller_id country_name
0     15024     1       20039        10         10         Peru
1     15025     1       20029        15          5         Peru
2     15026     1       20024         5         14      Bolivia
3     15027     1       20018         9         14       Brazil
4     15028     1       20035         6         15       Mexico

Formato del dataframe
(1000, 6)

Búsqueda de valores nulos por columna
order_id        0
week            0
article_id      0
quantity        0
seller_id       0
country_name    0
dtype: int64

Formato de los datos por columna
order_id         int64
week             int64
article_id       int64
quantity         int64
seller_id        int64
country_name    object
dtype: object

df_articles['unit_price'] = df_articles['unit_price'].astype(float) print(df_articles.dtypes)

article_id        int64
article_name     object
unit_price      float64
dtype: object

# Creo una copia del df_orders my_df = df_orders.copy() # Cambio el índice del df de artículos df_articles.set_index('article_id',inplace = True) # Agrego algunas columnas y pongo el campo que me va a servir de "ancla" para acordarme my_df = my_df.assign(article_name = my_df['article_id']) my_df = my_df.assign(total_amount = my_df['article_id']) my_df = my_df.assign(seller_name = my_df['seller_id']) # reemplazo los valores en el nuevo df # RESUMEN DE PASOS: # Busco el nombre del artículo y lo asigno a una variable # como df_articles está indexado por article_id, lo uso para ubicarme en # el registro que busco # SINTAXIS: df_articles.loc[indice][columna] # [indice] va a ser el dato que obtengo de my_df.loc[i]['article_id'] # -> o sea, tomo registro a registro el article_id y lo uso para extraer # el nombre del artículo for i in range(len(my_df.index)): # len(my_df.index) devuelve la cantidad de registros idArticulo = my_df.loc[i]['article_id'] nombreArticulo = df_articles.loc[idArticulo]['article_name'] # se lo asigno a la columna correspondiente my_df.loc[i,'article_name'] = nombreArticulo # hago lo mismo en un solo paso para la columna de total_amount precioArticulo = df_articles.loc[idArticulo]['unit_price'] cantArticulo = my_df.loc[i]['quantity'] my_df.loc[i,'total_amount'] = precioArticulo * cantArticulo # columna de seller_name idVendedor = my_df.loc[i]['seller_id'] nombreVendedor = df_sellers.loc[idVendedor]['seller_name'] my_df.loc[i,'seller_name'] = nombreVendedor # elimino las columnas que no necesito my_df.drop(['order_id', 'article_id', 'seller_id'], axis = 'columns', inplace=True) print(my_df)

     week  quantity country_name   article_name  total_amount      seller_name
0       1        10         Peru  Water Cooling         675.0  Cirilo Grandham
1       1        15         Peru          Mouse         454.5      Ewell Peres
2       1         5      Bolivia        Netbook         725.0   Janel O'Curran
3       1         9       Brazil         Tablet        1170.0   Janel O'Curran
4       1         6       Mexico           Case         227.4  Daisie Slograve
..    ...       ...          ...            ...           ...              ...
995     4         1       Brazil          Modem          67.5       Kati Innot
996     4        15       Brazil       Heatsink         150.0  Daisie Slograve
997     4         2     Colombia       Heatsink          20.0  Vasily Danilyuk
998     4        14       Brazil         Tablet        1820.0  Vasily Danilyuk
999     4        12       Brazil            SDD         264.0     Onida Cosely

[1000 rows x 6 columns]

# RESOLUCIÓN ANALÍTICA my_df2=my_df.groupby('article_name').sum() por_cant = my_df2.sort_values('quantity', ascending=False) print("El artículo más vendido es el siguiente:\n") print(por_cant['quantity'].head(1))

El artículo más vendido es el siguiente:

article_name
HDD    413
Name: quantity, dtype: int64

# RESOLUCIÓN GRÁFICA sns.barplot(y=por_cant["quantity"], x=por_cant.index, palette="Blues_r").set_xlabel("ARTÍCULOS") plt.xticks(rotation=90) plt.setp(plt.title('Venta de articulos (en unidades)',fontdict={'fontsize': 14},fontstyle="italic",fontweight="bold"),color="b") plt.ylabel("UNIDADES") plt.show()

# RESOLUCIÓN ANALÍTICA df2 = (my_df.groupby(by='article_name').sum()).sort_values('total_amount',ascending=False).head(5) print("La siguiente lista muestra los 5 artículos que más ingresos proporcionaron:\n") print(df2['total_amount'])

La siguiente lista muestra los 5 artículos que más ingresos proporcionaron:

article_name
Full Pc       538335.93
Notebook      251000.00
Smartphone    152250.00
Chair          69477.48
Tablet         48620.00
Name: total_amount, dtype: float64

# RESOLUCIÓN GRÁFICA plt.barh(df2.index,df2['total_amount'] , height=0.8, color=["gold" if i == "Full Pc" else "lightblue" for i in df2.index]) plt.title('Artículo que generó mas ingresos (resaltado en amarillo)',fontdict={'fontsize': 12},fontweight="bold") plt.show()

# RESOLUCIÓN ANALÍTICA df4 = (my_df.groupby(by='seller_name').sum()).sort_values('total_amount',ascending=False) print("El primer vendedor de la siguiente lista fue el mejor porque generó mayor ingreso que el resto, por lo tanto debe otorgársele el bono:\n") print(df4[['quantity']+['total_amount']])

El primer vendedor de la siguiente lista fue el mejor porque generó mayor ingreso que el resto, por lo tanto debe otorgársele el bono:

                   quantity  total_amount
seller_name                              
Janel O'Curran          703     192832.47
Brockie Patience        441     142709.88
Oliviero Charkham       555     141329.76
Vasily Danilyuk         521     129157.55
Daisie Slograve         554     120520.11
Aveline Swanwick        629     118874.33
Arnold Kilkenny         583      94552.04
Kati Innot              512      83704.62
Jase Doy                582      80628.31
Ewell Peres             496      78144.32
Onida Cosely            535      77373.37
Milly Christoffe        442      61733.69
Tobin Roselli           519      56984.42
Cornie Wynrehame        523      52253.57
Cirilo Grandham         470      45009.40

# RESOLUCIÓN GRÁFICA plt.bar(df4.index,df4['total_amount'], color=["tomato" if i == "Janel O'Curran" else "wheat" for i in df4.index], edgecolor=["black" if i == "Janel O'Curran" else "w" for i in df4.index]) plt.xticks(rotation=60, ha="right") plt.title('Mejor vendedor del mes (resaltado en rojo)',fontdict={'fontsize': 15}) plt.show()

# RESOLUCIÓN ANALÍTICA df5 = (my_df.groupby(by='week').sum()).sort_values('total_amount',ascending=False) print("Las ventas van variando decrecientemente a lo largo del mes, como se muestra a continuación:\n") print(df5['total_amount'])

Las ventas van variando decrecientemente a lo largo del mes, como se muestra a continuación:

week
1    507458.81
2    415364.44
3    329140.03
4    223844.56
Name: total_amount, dtype: float64

# RESOLUCIÓN GRÁFICA df5.index=["Semana 1","Semana 2","Semana 3","Semana 4"] plt.bar(df5.index,df5['total_amount']) plt.title('Ventas a lo largo del mes',fontdict={'fontsize': 15},fontweight="bold") plt.show()

# RESOLUCIÓN ANALÍTICA df6 = (my_df.groupby(by='country_name').sum()).sort_values('total_amount',ascending=False) print('Los totales de ventas por país se aprecian a continuación:\n') print(df6['total_amount'])

Los totales de ventas por país se aprecian a continuación:

country_name
Brazil         441271.85
Argentina      205832.78
Colombia       177514.29
Peru           161421.12
Mexico         138619.99
Venezuela       77684.52
El Salvador     57391.26
Guatemala       52579.25
Honduras        36763.56
Costa Rica      34606.50
Chile           24660.98
Bolivia         22682.80
Uruguay         17843.09
Ecuador         17475.30
Paraguay         8195.12
Puerto Rico      1265.43
Name: total_amount, dtype: float64

# RESOLUCIÓN GRÁFICA sns.barplot(y=df6["total_amount"], x=df6.index,palette=("dark")).set(title='Total de Ventas por país',xlabel='Países',ylabel='Ventas totales') plt.xticks(rotation=90) plt.show()

# RESOLUCIÓN ANALÍTICA df7 = my_df.loc[(my_df['country_name']=='Peru')] df7.drop(['week', 'total_amount'], axis = 'columns', inplace=True) df8 = (df7.groupby(by='seller_name').sum()).sort_values('quantity',ascending=False) print(df8)

                   quantity
seller_name                
Ewell Peres             115
Cornie Wynrehame        101
Aveline Swanwick         91
Arnold Kilkenny          86
Cirilo Grandham          86
Kati Innot               74
Vasily Danilyuk          68
Janel O'Curran           66
Jase Doy                 65
Daisie Slograve          63
Milly Christoffe         56
Tobin Roselli            50
Brockie Patience         49
Onida Cosely             41
Oliviero Charkham        16
/shared-libs/python3.9/py/lib/python3.9/site-packages/pandas/core/frame.py:4308: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(

# RESOLUCIÓN GRÁFICA fig, ax = plt.subplots() ax.barh(df8.index,df8["quantity"], height=1, color='pink') ax.set_title('Cantidad de articulos vendidos en Perú por vendedor',fontdict={'fontsize': 15}) #ax.set_xlabel('xlabel', fontsize = 20) bars = ax.barh(0,"") ax.bar_label(bars) for bars in ax.containers: ax.bar_label(bars) plt.show()

# RESOLUCIÓN ANALÍTICA df16 = (my_df.groupby(by='seller_name').sum()).sort_values('total_amount',ascending=False).head(5) df17=my_df.loc[(my_df.seller_name=="Daisie Slograve")|(my_df.seller_name=="Janel O'Curran")|(my_df.seller_name=="Brockie Patience")|(my_df.seller_name=="Oliviero Charkham")|(my_df.seller_name=="Vasily Danilyuk")].groupby('seller_name').agg({'total_amount':["min","max"]}) print('Los 5 mejores vendedores (de acuerdo a mayores ventas generadas) son los siguientes:') print() print(df16['total_amount']) print() print("Los montos máximos y mínimos de los 5 mejores vendedores se detallan a continuacíon:") print(df17)

Los 5 mejores vendedores (de acuerdo a mayores ventas generadas) son los siguientes:

seller_name
Janel O'Curran       192832.47
Brockie Patience     142709.88
Oliviero Charkham    141329.76
Vasily Danilyuk      129157.55
Daisie Slograve      120520.11
Name: total_amount, dtype: float64

Los montos máximos y mínimos de los 5 mejores vendedores se detallan a continuacíon:
                  total_amount          
                           min       max
seller_name                             
Brockie Patience          5.90  31917.15
Daisie Slograve           2.14  25533.72
Janel O'Curran            2.14  31917.15
Oliviero Charkham         2.95  27661.53
Vasily Danilyuk           2.95  27661.53

# RESOLUCIÓN GRÁFICA df17.columns=["Venta mínima","Venta máxima"] df17.index.name="Vendedores" sns.set() fig, axes = plt.subplots(1, 2) sns.barplot(x=df17.index,y=df17[("Venta máxima")], data = df17, ax=axes[0],palette="YlOrBr").set(title="Venta máxima por vendedor") sns.barplot(x=df17.index,y=df17[("Venta mínima")], data = df17, ax=axes[1],palette="Blues").set(title="Venta mínima por vendedor") for n, ax in enumerate(axes): ax.set_xticklabels(df17.index, rotation=65) plt.subplots_adjust(wspace=0.5) plt.show()