# A veces necesitamos instalar nuevas librerías en nuestros proyectos
#el pip install lo hago dentro de la consola del sistema op no la de python
#pongo el signo para que me saque de la consola de python
!pip install openpyxl==3.0.10
Collecting openpyxl==3.0.10
Downloading openpyxl-3.0.10-py2.py3-none-any.whl (242 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 242.1/242.1 KB 16.0 MB/s eta 0:00:00
Collecting et-xmlfile
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.0.10
WARNING: You are using pip version 22.0.4; however, version 22.1.2 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# imports
import numpy as np
import pandas as pd
import sqlite3 as sql3
import matplotlib.pyplot as plt
import seaborn as sns
# Levanto los datos en 3 diferentes dataframes
#ARTULOS
conn = sql3.connect('/work/data/articles.db') #me conecto a la base de datos
query1 = pd.read_sql_query('SELECT * FROM articles', conn)
df_articles = pd.DataFrame(query1, columns = ['article_id', 'article_name', 'unit_price'])
#VENDEDORES
df_sellers=pd.read_excel('/work/data/sellers.xlsx', index_col=0)#la columna 0 va a ser mi indice
#ORDENES
df_orders=pd.read_csv('/work/data/orders.csv')
print(df_orders)
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
[1000 rows x 6 columns]
# Exploración del df de artículos
print('Muestra de datos')
print(df_articles.head())
print('\n Formato del df')
print(df_articles.shape)
print('\n Busqueda de valores nulos por columna')
print(df_articles.isnull().sum())
print(df_articles.dtypes)
Muestra de datos
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
Formato del df
(31, 3)
Busqueda de valores nulos por columna
article_id 0
article_name 0
unit_price 0
dtype: int64
article_id int64
article_name object
unit_price object
dtype: object
# Exploración df_sellers
print('Muestra de datos')
print(df_sellers.head())
print('\n Formato del df')
print(df_sellers.shape)
print('\n Busqueda de valores nulos por columna')
print(df_sellers.isnull().sum())
print(df_sellers.dtypes)
Muestra de datos
seller_name
seller_id
1 Aveline Swanwick
2 Jase Doy
3 Oliviero Charkham
4 Cornie Wynrehame
5 Ewell Peres
Formato del df
(15, 1)
Busqueda de valores nulos por columna
seller_name 0
dtype: int64
seller_name object
dtype: object
# Exploración del df_orders
print('Muestra de datos')
print(df_orders.head())
print('\n Formato del df')
print(df_orders.shape)
print('\n Busqueda de valores nulos por columna')
print(df_orders.isnull().sum())
print(df_orders.dtypes)
Muestra de datos
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
Formato del df
(1000, 6)
Busqueda de valores nulos por columna
order_id 0
week 0
article_id 0
quantity 0
seller_id 0
country_name 0
dtype: int64
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
#arreglar df los tipo de datos los precios
df_articles['unit_price']=df_articles['unit_price'].astype(float)
print(df_articles.dtypes)
#formato $
pd.options.display.float_format='$ {:,.2f}'.format
df_articles['unit_price']
article_id int64
article_name object
unit_price float64
dtype: object
#creo copia del df base
my_df=df_orders.copy()
#convierto el indice del df articulos
df_articles.set_index('article_id',inplace=True) #solo se ejecuta una vez
#print(my_df)
print(df_articles)
article_name unit_price
article_id
20015 Smartphone $ 525.00
20016 Full Pc $ 2,127.81
20017 Monitor $ 230.00
20018 Tablet $ 130.00
20019 Desk $ 130.10
20020 Chair $ 335.64
20021 Modem $ 67.50
20022 Range Extender $ 20.45
20023 Notebook $ 1,000.00
20024 Netbook $ 145.00
20025 HDD $ 54.62
20026 SDD $ 22.00
20027 Ram Memory $ 35.95
20028 Motherboard $ 138.38
20029 Mouse $ 30.30
20030 Fan Cooler $ 4.25
20031 Webcam $ 20.07
20032 Keyboard $ 22.60
20033 Headphones $ 23.30
20034 Scanner $ 185.00
20035 Case $ 37.90
20036 Video Card $ 131.50
20037 CPU $ 139.62
20038 Power Supply $ 43.95
20039 Water Cooling $ 67.50
20040 Heatsink $ 10.00
20041 Usb Cable $ 2.95
20042 Sata Cable $ 2.14
20043 Pci Express Port $ 10.12
20044 Wi-Fi Card $ 59.61
20045 Mesh Wi-Fi X 2 $ 32.50
#agrego algunas columnas y pongo el campo que me va a servir de'ancla'
my_df=my_df.assign(article_name=my_df['article_id']) #crea una columna llamada article_name c
my_df=my_df.assign(total_amount=my_df['article_id']) #voy a necesitar al precio unitario
my_df=my_df.assign(seller_name=my_df['seller_id']) #nombre del vendedor
#reemplazar los valores en el nuevo df
#rescatar dato del df de articulo df_article(xq ahora estan solo los id[?] este va ir mutando ['article_name']
#lo hago para cada fila
for i in range(len(my_df.index)): #la longitud de cualquiera de las columnas del df, index mira cuantos indices hay q son la cantidad de filas q tengo (es una lista)
# len... devuelve la cantidad de registros
article = df_articles.loc[my_df.loc[i]['article_name']]['article_name']
# reemplazo en la columna 'article_name'
my_df.loc[i,'article_name'] = article
# modificar la columna total_amount
my_df.loc[i,'total_amount'] = my_df.loc[i,'quantity'] * df_articles.loc[my_df.loc[i]['total_amount']]['unit_price']
# modifical la columna 'seller_name'
my_df.loc[i,'seller_name'] = df_sellers.loc[my_df.loc[i]['seller_name']]['seller_name']
print(my_df)
order_id week article_id quantity seller_id country_name \
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
.. ... ... ... ... ... ...
995 16019 4 20021 1 7 Brazil
996 16020 4 20040 15 15 Brazil
997 16021 4 20040 2 11 Colombia
998 16022 4 20018 14 11 Brazil
999 16023 4 20026 12 9 Brazil
article_name total_amount seller_name
0 Water Cooling $ 675.00 Cirilo Grandham
1 Mouse $ 454.50 Ewell Peres
2 Netbook $ 725.00 Janel O'Curran
3 Tablet $ 1,170.00 Janel O'Curran
4 Case $ 227.40 Daisie Slograve
.. ... ... ...
995 Modem $ 67.50 Kati Innot
996 Heatsink $ 150.00 Daisie Slograve
997 Heatsink $ 20.00 Vasily Danilyuk
998 Tablet $ 1,820.00 Vasily Danilyuk
999 SDD $ 264.00 Onida Cosely
[1000 rows x 9 columns]
#elimino columnas id que no sirven para el analis
my_df=my_df.drop(['order_id','article_id','seller_id'],axis=1)
# RESOLUCIÓN ANALÍTICA
my_df2=my_df.groupby('article_name').sum()
por_cant=my_df2.sort_values('quantity',ascending=False)
print(por_cant['quantity'].head(1))
article_name
HDD 413
Name: quantity, dtype: int64
# RESOLUCIÓN GRÁFICA
sns.displot(my_df, x='article_name', color='pink')
plt.xticks(rotation=90)
plt.title('Artículos mas vendidos')
plt.xlabel("")
plt.ylabel("")
plt.show()
# RESOLUCIÓN ANALÍTICA
my_df3 =(my_df.groupby('article_name').sum()).sort_values('total_amount', ascending=False).head(5)
print(my_df3['total_amount'])
article_name
Full Pc $ 538,335.93
Notebook $ 251,000.00
Smartphone $ 152,250.00
Chair $ 69,477.48
Tablet $ 48,620.00
Name: total_amount, dtype: float64
# RESOLUCIÓN GRÁFICA
colores=['#ffb3e6', '#c2c2f0','#66b3ff', '#99ff99','#ffcc99']
plt.pie(x=my_df3['total_amount'],labels=my_df3.index, colors=colores)
plt.title('Artículos que generan mas ingresos($)')
plt.show()
# RESOLUCIÓN ANALÍTICA
my_df4 = (my_df.groupby('seller_name').sum()).sort_values('total_amount', ascending=False)
print(my_df4[['quantity']+['total_amount']])
quantity total_amount
seller_name
Janel O'Curran 703 $ 192,832.47
Brockie Patience 441 $ 142,709.88
Oliviero Charkham 555 $ 141,329.76
Vasily Danilyuk 521 $ 129,157.55
Daisie Slograve 554 $ 120,520.11
Aveline Swanwick 629 $ 118,874.33
Arnold Kilkenny 583 $ 94,552.04
Kati Innot 512 $ 83,704.62
Jase Doy 582 $ 80,628.31
Ewell Peres 496 $ 78,144.32
Onida Cosely 535 $ 77,373.37
Milly Christoffe 442 $ 61,733.69
Tobin Roselli 519 $ 56,984.42
Cornie Wynrehame 523 $ 52,253.57
Cirilo Grandham 470 $ 45,009.40
# RESOLUCIÓN GRÁFICA
sns.barplot(my_df4.index, my_df4['total_amount'],color='pink')
plt.xticks(rotation=60)
plt.title('Mejor vendedor')
plt.xlabel("")
plt.ylabel("$")
plt.show()
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# RESOLUCIÓN ANALÍTICA
my_df5 = (my_df.groupby('week').sum()).sort_values('total_amount',ascending=False)
print(my_df5)
quantity total_amount
week
1 2449 $ 507,458.81
2 2444 $ 415,364.44
3 2114 $ 329,140.03
4 1058 $ 223,844.56
# RESOLUCIÓN GRÁFICA
sns.barplot(my_df5.index,my_df5['total_amount'],palette='pastel')
plt.title('Ventas por semana ($)')
plt.xlabel("semana")
plt.ylabel("$")
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(
# RESOLUCIÓN
df_1 = (my_df.groupby('country_name').sum()).sort_values('total_amount', ascending=False).head(3)
#df_1=df_1.head(2)
print(df_1['total_amount'])
country_name
Brazil $ 441,271.85
Argentina $ 205,832.78
Colombia $ 177,514.29
Name: total_amount, dtype: float64
import seaborn as snNew
import matplotlib.pyplot as pltNew
snNew.barplot(y=df_1.index,x=df_1['total_amount'],palette='pastel')
plt.title('Ventas por país')
plt.xlabel("$")
plt.ylabel("")
pltNew.show()
#me quedo solo con los datos correspondienres a brasil y argentina
df_ab=my_df.loc[(my_df['country_name']=='Brazil')|(my_df['country_name']=='Argentina')]
sns.catplot( x='week',y='total_amount', hue='country_name', kind='swarm',data=df_ab,palette = 'Set2')
plt.title('Ventas ($) Argentina/Brazil por semana')
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 52.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 56.9% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 49.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/categorical.py:1296: UserWarning: 39.7% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
warnings.warn(msg, UserWarning)
En las primeras 3 semanas existen mas movimientos de ventas que en la ultima por la concentración de puntos, algo que es en general con todos los países, las misma en su mayoría no superan el importe de $5.000, salvo algún que otro caso los días lunes y martes. La mayor cantidad de transacciones es de Brasil.
cant_sem=(df_ab.groupby(['country_name','week']).sum()).sort_values('quantity',ascending=False)
print(cant_sem['quantity'])
country_name week
Brazil 1 765
2 664
3 646
4 440
Argentina 2 413
1 275
3 160
4 99
Name: quantity, dtype: int64
El volumen de ventas en Brasil en cuanto a cantidad de productos se dan en las primeras 3 semanas, y en Argentina se da la segunda semana del mes.
br=my_df.loc[(my_df['country_name']=='Brazil')]
grupo_br =(br.groupby('article_name').sum()).sort_values('quantity', ascending=False).head(5)
print(grupo_br['quantity'])
article_name
Tablet 156
HDD 119
SDD 117
Chair 112
Pci Express Port 105
Name: quantity, dtype: int64
snNew.barplot(y=grupo_br.index,x=grupo_br['quantity'],palette='pastel')
plt.title('Productos mas demandados en Brazil')
plt.xlabel("")
plt.ylabel("")
pltNew.show()
# RESOLUCIÓN
ar=my_df.loc[(my_df['country_name']=='Argentina')]
grupo_ar =(ar.groupby('article_name').sum()).sort_values('quantity', ascending=False).head(5)
print(grupo_ar['quantity'])
article_name
CPU 104
SDD 73
HDD 68
Range Extender 58
Fan Cooler 47
Name: quantity, dtype: int64
snNew.barplot(y=grupo_ar.index,x=grupo_ar['quantity'],palette='pastel')
plt.title('Productos mas demandados en Argentina')
plt.xlabel("")
plt.ylabel("")
pltNew.show()