# Do an upgrade of pip
last_pip_version = '22.1.2'
# get pip version
installed_pip_version = !pip --version
installed_pip_version = installed_pip_version.fields(1)
installed_pip_version
if(last_pip_version != installed_pip_version[0]):
print(f'Installing version {last_pip_version}...')
!python3 -m pip install --upgrade pip
else:
print(f'Version {last_pip_version} is already installed')
# A veces necesitamos instalar nuevas librerías en nuestros proyectos
#list all installed packages
installed_pkgs = !pip list
#save installed packages names in a list
installed_pkgs_list = installed_pkgs.fields(0)
#delete installed_pkgs_arr[0] and [1] because they are not packages names
del installed_pkgs_list[0:2]
print(installed_pkgs_list)
#initialize variables of packages we require for this project
req_pkgs = ['openpyxl','pandas','numpy','matplotlib','seaborn','geopandas','geopy']
installed = []
#for each package in the list
for pkg in installed_pkgs_list:
#if pkg is already installed
if(pkg in req_pkgs):
print(f'{pkg} pkg is already installed')
installed.append(pkg)
#if a required pkg is not installed
for pkg in req_pkgs:
if(pkg not in installed):
print(f'Installing {pkg} pkg...')
!pip install {pkg}
# required pkgs imports
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import openpyxl
import seaborn as sns
import geopandas as gpd
from geopandas import GeoDataFrame
import geopy
# Levanto los datos en 3 diferentes dataframes
#--- DB ---
#sql connection
articles_data = sqlite3.connect('/work/data/articles.db')
#sql query
articles_query= "SELECT * from articles"
#create dataframe
df_articles_original = pd.read_sql(articles_query, articles_data)
#close connection
articles_data.close()
#--- XLSX ---
#file path
sellers_data = '/work/data/sellers.xlsx'
#create dataframe
df_sellers_original = pd.read_excel(sellers_data)
#--- CSV ---
#file path
orders_data = '/work/data/orders.csv'
#create dataframe
df_orders_original = pd.read_csv(orders_data)
#----- Exploración del df de artículos -----
df_articles = df_articles_original.copy()
print(df_articles.head())
article_id article_name unit_price
0 20015 Smartphone 525.00
1 20016 Full Pc 2127.81
2 20017 Monitor 230.00
3 20018 Tablet 130.00
4 20019 Desk 130.10
#check df columns dtypes
print(df_articles.dtypes)
article_id int64
article_name object
unit_price object
dtype: object
#check df info to get entries, nulls and dtypes
print(df_articles.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31 entries, 0 to 30
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 article_id 31 non-null int64
1 article_name 31 non-null object
2 unit_price 31 non-null object
dtypes: int64(1), object(2)
memory usage: 872.0+ bytes
None
#----- Exploración del df de vendedores -----
df_sellers = df_sellers_original.copy()
print(df_sellers.head())
seller_id seller_name
0 1 Aveline Swanwick
1 2 Jase Doy
2 3 Oliviero Charkham
3 4 Cornie Wynrehame
4 5 Ewell Peres
#check df columns dtypes
print(df_sellers.dtypes)
seller_id int64
seller_name object
dtype: object
#check df info to get entries, nulls and dtypes
print(df_sellers.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 seller_id 15 non-null int64
1 seller_name 15 non-null object
dtypes: int64(1), object(1)
memory usage: 368.0+ bytes
None
#----- Exploración del df de órdenes -----
df_orders = df_orders_original.copy()
print(df_orders.head())
order_id week article_id quantity seller_id country_name
0 15024 1 20039 10 10 Peru
1 15025 1 20029 15 5 Peru
2 15026 1 20024 5 14 Bolivia
3 15027 1 20018 9 14 Brazil
4 15028 1 20035 6 15 Mexico
#check df columns dtypes
print(df_orders.dtypes)
order_id int64
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
print(df_orders.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 order_id 1000 non-null int64
1 week 1000 non-null int64
2 article_id 1000 non-null int64
3 quantity 1000 non-null int64
4 seller_id 1000 non-null int64
5 country_name 1000 non-null object
dtypes: int64(5), object(1)
memory usage: 47.0+ KB
None
#set index to article_id and rename axis
df_articles.set_index('article_id', inplace=True)
df_articles.index.name = None
df_articles.rename_axis('article_id', axis="columns", inplace=True)
#convert unit_price column to float64
df_articles = df_articles.astype({'unit_price':'float64'}, copy=True, errors='raise')
print(df_articles.dtypes)
article_id
article_name object
unit_price float64
dtype: object
#set index to seller_id and rename axis
df_sellers.set_index('seller_id', inplace=True)
df_sellers.index.name = None
df_sellers.rename_axis('seller_id', axis="columns", inplace=True)
print(df_articles.dtypes)
article_id
article_name object
unit_price float64
dtype: object
#set index to order_id and rename axis
df_orders.set_index('order_id', inplace=True)
df_orders.index.name = None
df_orders.rename_axis('order_id', axis="columns", inplace=True)
print(df_orders.dtypes)
order_id
week int64
article_id int64
quantity int64
seller_id int64
country_name object
dtype: object
#merge df_orders with df_sellers on seller_id
my_df = pd.merge(df_orders, df_sellers, left_on='seller_id',right_on=df_sellers.index)
#merge my_df with df_articles on article_id
my_df = pd.merge(my_df, df_articles, left_on='article_id', right_on=df_articles.index)
#add total_amount column to my_df
my_df['total_amount'] = my_df['quantity'] * my_df['unit_price']
#set index to order_id and rename axis
my_df.set_index('article_id', inplace=True)
my_df.index.name = None
#my_df.rename_axis('article_id', axis="columns", inplace=True)
my_df
weekint64
1 - 4
quantityint64
1 - 15
20039
1
10
20039
3
15
20039
2
3
20039
2
7
20039
2
1
20039
4
10
20039
3
3
20039
3
13
20039
3
6
20039
3
1
# --- SEABORN PALETTE FOR PLOTS ---
palette='pastel'
color_blue = sns.color_palette(palette)[0]
color_orange = sns.color_palette(palette)[1]
color_green = sns.color_palette(palette)[2]
color_red = sns.color_palette(palette)[3]
color_purple = sns.color_palette(palette)[4]
color_brown = sns.color_palette(palette)[5]
color_pink = sns.color_palette(palette)[6]
color_grey = sns.color_palette(palette)[7]
color_yellow = sns.color_palette(palette)[8]
color_turquoise = sns.color_palette(palette)[9]
palette='dark'
color_blue_dark = sns.color_palette(palette)[0]
color_orange_dark = sns.color_palette(palette)[1]
color_green_dark = sns.color_palette(palette)[2]
color_red_dark = sns.color_palette(palette)[3]
color_purple_dark = sns.color_palette(palette)[4]
color_brown_dark = sns.color_palette(palette)[5]
color_pink_dark = sns.color_palette(palette)[6]
color_grey_dark = sns.color_palette(palette)[7]
color_yellow_dark = sns.color_palette(palette)[8]
color_turquoise_dark = sns.color_palette(palette)[9]
# RESOLUCIÓN ANALÍTICA
#groupby article_name on the column quantity and sum them
selled_articles = my_df.groupby(['article_name'])[['article_name','quantity']].sum(['quantity'])
selled_articles.index.name = 'article_name'
#get the most selled article name
most_selled = selled_articles.idxmax()
#print(most_selled['quantity'])
#get the most selled article quantity
most_selled_qty = selled_articles.max()
#print(most_selled_qty['quantity'])
print(f'(ES) El artículo más vendido fue "{most_selled["quantity"]}" con {most_selled_qty["quantity"]} unidades vendidas.')
print(f'(EN) The most selled article was "{most_selled["quantity"]}" with {most_selled_qty["quantity"]} units selled.')
(ES) El artículo más vendido fue "HDD" con 413 unidades vendidas.
(EN) The most selled article was "HDD" with 413 units selled.
# get top 5 articles
top_5_articles = selled_articles.sort_values(by="quantity", ascending=False).head()
# creating a np array for top_5_articles
top_5_articles_arr = np.array([top_5_articles.index, top_5_articles["quantity"]])
print(top_5_articles_arr)
[['HDD' 'Tablet' 'SDD' 'Mouse' 'Netbook']
[413 374 372 322 320]]
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- CREATE PLOT BAR -----
top_5_articles_plot = plt.bar(
top_5_articles_arr[0], #x axis
top_5_articles_arr[1], #y axis
bottom=0, #y axis starts from
width=0.7, #bars width
align='center', #bars align
color=color_orange, #'orange', #bars color
edgecolor='black', #bars edgecolor (border)
linewidth=0.5, #bars edge line width
)
# --- PLOT LABEL ---
plt.bar_label(
top_5_articles_plot,
padding=-80,
color='white',
fontweight='bold'
)
# --- PLOT GRID ---
plt.grid(visible=False)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Top 5 selled articles'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_orange_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Articles'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# CREATE X TICKS
plt.xticks(
top_5_articles_arr[0],
rotation=0
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Qty'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/top_5_selled_articles.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby article_name on the column total_amount and sum them
articles_income = my_df.groupby(['article_name'])[['article_name','total_amount']].sum(['total_amount'])
articles_income.index.name = 'article_name'
#get the most selled article name
most_selled = articles_income.idxmax()
#get the most selled article total amount
most_selled_total_amount = articles_income.max().astype(int)
print(f'(ES) El artículo más vendido fue "{most_selled["total_amount"]}" con un total de ${most_selled_total_amount["total_amount"]}.')
print(f'(EN) The most selled article was "{most_selled["total_amount"]}" with a total of ${most_selled_total_amount["total_amount"]}.')
(ES) El artículo más vendido fue "Full Pc" con un total de $538335.
(EN) The most selled article was "Full Pc" with a total of $538335.
# get top 5 most income articles
top_5_income_articles = articles_income.sort_values(by="total_amount", ascending=False).head()
# creating a np array for top_5_income_articles
top_5_income_articles_arr = np.array([top_5_income_articles.index, top_5_income_articles["total_amount"].astype(int)])
print(top_5_income_articles_arr)
[['Full Pc' 'Notebook' 'Smartphone' 'Chair' 'Tablet']
[538335 251000 152250 69477 48620]]
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- CREATE PLOT BAR HORIZONTAL -----
top_5_income_articles_plot = plt.barh(
top_5_income_articles_arr[0], #x axis
top_5_income_articles_arr[1], #y axis
#bottom=0, #y axis starts from
#width=0.7, #bars width
align='center', #bars align
color=color_blue, #bars color
edgecolor='black', #bars edgecolor (border)
linewidth=0.5, #bars edge line width
)
# --- PLOT LABEL ---
plt.bar_label(
top_5_income_articles_plot,
padding=10,
color=color_blue,
fontweight='bold'
)
# --- PLOT GRID ---
plt.grid(visible=False)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Top 5 articles income'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_blue_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Total amount ($)'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Articles'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/top_5_articles_income.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby seller_name on the column total_amount and sum them
top_month_sellers = my_df.groupby(['seller_name'])[['seller_name','total_amount']].sum(['total_amount'])
top_month_sellers.index.name = 'seller_name'
#get the top month seller name
top_month_seller = top_month_sellers.idxmax()
#get the top month seller total amount
top_month_seller_total_amount = top_month_sellers.max().astype(int)
print(f'(ES) Se debería otorgar el bono al vendedor {top_month_seller["total_amount"]}. Su total de ingresos fue de ${top_month_seller_total_amount["total_amount"]}.')
print(f'(EN) The bonus should be awarded to the seller {top_month_seller["total_amount"]}. Their total revenue was ${top_month_seller_total_amount["total_amount"]}.')
(ES) Se debería otorgar el bono al vendedor Janel O'Curran. Su total de ingresos fue de $192832.
(EN) The bonus should be awarded to the seller Janel O'Curran. Their total revenue was $192832.
# get top 5 month sellers
top_5_month_sellers = top_month_sellers.sort_values(by="total_amount", ascending=False).head()
# creating a np array for top_5_month_sellers
top_5_month_sellers_arr = np.array([top_5_month_sellers.index, top_5_month_sellers["total_amount"].astype(int)])
top_5_sellers_names = top_5_month_sellers_arr[0]
top_5_sellers_amount = top_5_month_sellers_arr[1]
print(top_5_month_sellers_arr)
[["Janel O'Curran" 'Brockie Patience' 'Oliviero Charkham'
'Vasily Danilyuk' 'Daisie Slograve']
[192832 142709 141329 129157 120520]]
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- FUNCTION FOR AUTOPCT PARAMETER -----
def create_autopct(values):
def my_autopct(pct):
total = sum(values)
val = int(round(pct*total/100.0))
return '{p:.2f}% \n\n(${v:d})'.format(p=pct,v=val)
return my_autopct
# ----- CREATE PLOT PIE -----
top_5_month_sellers_plot = plt.pie(
top_5_sellers_amount, #wedge sizes
labels=top_5_sellers_names, #labels of each wedge
colors= sns.color_palette('pastel'), #wedges colors
autopct=create_autopct(top_5_sellers_amount), #percentage
shadow=True, #shadow of the plot
explode=(0.1,0,0,0,0) #
)
plt.axis("equal")
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Top 5 sellers of the month'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_purple_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# --- SAVE FIG ---
plt.savefig('img/top_5_sellers_of_the_month.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby week on the column total_amount and sum them
weeks_total_amount = my_df.groupby(['week'])[['week','total_amount']].sum(['total_amount'])
weeks_total_amount.index.name = 'week'
#get the top month seller name
week_most_income = weeks_total_amount.idxmax()
#get the top month seller total amount
week_most_income_total_amount = weeks_total_amount.max().astype(int)
print(f'(ES) La semana con mayor cantidad de ventas fue la {week_most_income["total_amount"]}°. El total de ingresos fue de ${week_most_income_total_amount["total_amount"]}.')
print(f'(EN) The week with the most sales was the {week_most_income["total_amount"]}° one. Total income was ${week_most_income_total_amount["total_amount"]}.')
(ES) La semana con mayor cantidad de ventas fue la 1°. El total de ingresos fue de $507458.
(EN) The week with the most sales was the 1° one. Total income was $507458.
# RESOLUCIÓN GRÁFICA
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- CREATE FIGURE -----
ordered_weeks_by_income_plot = plt.plot(
ordered_week_names, #x axis
ordered_week_amount, #y axis
color=color_turquoise, #line color
linewidth=5, # edge line width
)
# --- PLOT GRID ---
plt.grid(visible=False)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Weeks vs income'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_turquoise_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Weeks'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# CREATE X TICKS
plt.xticks(
ordered_week_names,
rotation=0
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Total amount ($)'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/week_income.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby country on the column total_amount and sum them
countries_total_amount = my_df.groupby(['country_name'])[['country_name','total_amount']].sum(['total_amount'])
countries_total_amount.index.name = 'country_name'
#get the top country name
country_most_income = countries_total_amount.idxmax()
#get the top country total amount
country_most_income_total_amount = countries_total_amount.max().astype(int)
print(f'(ES) El país con mayor cantidad de ventas fue {country_most_income["total_amount"]}. El total de ingresos fue de ${country_most_income_total_amount["total_amount"]}.')
print(f'(EN) The country with the most sales was {country_most_income["total_amount"]}. Total income was ${country_most_income_total_amount["total_amount"]}.')
(ES) El país con mayor cantidad de ventas fue Brazil. El total de ingresos fue de $441271.
(EN) The country with the most sales was Brazil. Total income was $441271.
# get world map from geopandas
world_map = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
#print(world_map.groupby(['continent']).sum())
# get america countries for map
america=['South America','North America']
america_map = world_map.query("continent in @america")
# create one figure and one subplot
fig, axes = plt.subplots(figsize = (12,8),facecolor = plt.cm.Blues(.2))
axes.set_facecolor(plt.cm.Blues(.2))
#create a countries column to use with the geopandas
countries_total_amount['countries'] = countries_total_amount.index
#create a geopandas dataframe from the countries_total_amount df
countries_total_amount = gpd.GeoDataFrame(countries_total_amount)
#add a geometry column to the countries_total_amount df
# this is required for using the geopandas
countries_total_amount['geometry'] = gpd.tools.geocode(countries_total_amount['countries']).geometry
#create a GeoDataFrame plot for the world map with the countries
america_map.plot(color="white",ax=axes)
# create a geopandas plot
countries_total_amount.plot(
column = 'total_amount',
cmap = 'Greens',
ax = axes,
legend = True,
markersize= 300,
legend_kwds = {
'label': "Total amount ($)"
},
missing_kwds = {
'facecolor':'Gray'
}
)
# create figure title
fig.suptitle(
'Total amount of sales by country',
y=1,
fontsize = 'xx-large',
fontweight = 'bold'
)
# --- SAVE FIG ---
plt.savefig('img/sales_per_country_map.png')
# --- SHOW PLOT ---
plt.show()
# get countries ordered by total_amount
top_5_countries = countries_total_amount.sort_values(by="total_amount", ascending=False).head()
# creating a np array for top_5_countries
top_5_countries_arr = np.array([top_5_countries.index, top_5_countries["total_amount"].astype(int)])
top_5_countries_names = top_5_countries_arr[0]
top_5_countries_amount = top_5_countries_arr[1]
print(top_5_countries_arr)
[['Brazil' 'Argentina' 'Colombia' 'Peru' 'Mexico']
[441271 205832 177514 161421 138619]]
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- CREATE PLOT BAR -----
top_5_countries_plot = plt.bar(
top_5_countries_names, #x axis
top_5_countries_amount, #y axis
bottom=0, #y axis starts from
width=0.7, #bars width
align='center', #bars align
color=color_green, #bars color
edgecolor='black', #bars edgecolor (border)
linewidth=0.5, #bars edge line width
)
# --- PLOT LABEL ---
plt.bar_label(
top_5_countries_plot,
padding=-80,
color='white',
fontweight='bold'
)
# --- PLOT GRID ---
plt.grid(visible=False)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Top 5 countries'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_green_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Countries'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# CREATE X TICKS
plt.xticks(
top_5_countries_arr[0],
rotation=0
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Total amount ($)'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/top_5_countries.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby country_name and seller_name on the column total_amount, then sum them
best_seller_by_country = my_df.groupby(['country_name','seller_name']).sum(['total_amount'])
#get countries from my_df
countries = my_df.groupby('country_name').sum().index.values
#create a new df
df_best_seller_by_country = pd.DataFrame(columns=['seller_name', 'total_amount'], index=countries)
for country in countries:
#search for country in df
loc_country = best_seller_by_country.loc[country]
#get max total_amount for that country
total_amount = loc_country.max()['total_amount']
#get max total_amount seller for that country
seller = loc_country[(loc_country['total_amount'] == total_amount)].index[0]
#add seller and total amount to final df
df_best_seller_by_country.loc[country] = {'seller_name': seller, 'total_amount': total_amount}
df_best_seller_by_country = df_best_seller_by_country.astype({'total_amount':'int'})
print(df_best_seller_by_country)
seller_name total_amount
Argentina Janel O'Curran 34971
Bolivia Kati Innot 10750
Brazil Daisie Slograve 65283
Chile Onida Cosely 6769
Colombia Brockie Patience 33654
Costa Rica Oliviero Charkham 24021
Ecuador Oliviero Charkham 6647
El Salvador Janel O'Curran 32967
Guatemala Vasily Danilyuk 27661
Honduras Oliviero Charkham 19403
Mexico Jase Doy 29096
Paraguay Aveline Swanwick 3220
Peru Brockie Patience 46089
Puerto Rico Ewell Peres 1256
Uruguay Janel O'Curran 9488
Venezuela Oliviero Charkham 33961
# RESOLUCIÓN ANALÍTICA
#groupby country_name and article_name on the column total_amount, then sum them
most_selled_article_by_country = my_df.groupby(['country_name','article_name']).sum(['total_amount'])
#get countries from my_df
countries = my_df.groupby('country_name').sum().index.values
#create a new df
df_most_selled_article_by_country = pd.DataFrame(columns=['article_name', 'total_amount'], index=countries)
for country in countries:
#search for country in df
loc_country = most_selled_article_by_country.loc[country]
#get max total_amount for that country
total_amount = loc_country.max()['total_amount']
#get max total_amount article for that country
article = loc_country[(loc_country['total_amount'] == total_amount)].index[0]
#add article and total amount to final df
df_most_selled_article_by_country.loc[country] = {'article_name': article, 'total_amount': total_amount}
df_most_selled_article_by_country = df_most_selled_article_by_country.astype({'total_amount':'int'})
print(df_most_selled_article_by_country)
article_name total_amount
Argentina Full Pc 95751
Bolivia Full Pc 10639
Brazil Full Pc 134052
Chile Smartphone 6300
Colombia Full Pc 72345
Costa Rica Full Pc 23405
Ecuador Smartphone 7875
El Salvador Full Pc 40428
Guatemala Full Pc 27661
Honduras Notebook 14000
Mexico Notebook 44000
Paraguay Monitor 4370
Peru Full Pc 51067
Puerto Rico CPU 1256
Uruguay Full Pc 8511
Venezuela Full Pc 44684
#groupby country_name and article_name on the column total_amount, then sum them
most_selled_article_by_country = my_df.groupby(['country_name','article_name']).sum(['total_amount']).reset_index()
most_selled_article_by_country = most_selled_article_by_country.sort_values(by='total_amount', ascending=False)
most_selled_article_by_country = most_selled_article_by_country.head(20)
#print(most_selled_article_by_country)
countries = most_selled_article_by_country['country_name']
articles = most_selled_article_by_country['article_name']
total_amount = most_selled_article_by_country['total_amount']
# ----- CREATE PLOT FIGURE -----
# create one figure and one subplot
fig, axes = plt.subplots(figsize = (12,8))
most_selled_article_by_country_plot = sns.barplot(
x=countries,
y=total_amount,
hue=articles,
palette="pastel",
edgecolor=".6",
data=most_selled_article_by_country,
dodge=False,
#ax=axes
)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Most selled articles by country'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_brown_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Countries'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Total amount ($)'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
plt.legend(title='Articles', loc='upper right')
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/most_selled_articles_by_country.png')
# --- SHOW PLOT ---
plt.show()