# Do an upgrade of pip
last_pip_version = '22.1.2'
# get pip version
installed_pip_version = !pip --version
installed_pip_version = installed_pip_version.fields(1)
installed_pip_version
if(last_pip_version != installed_pip_version[0]):
print(f'Installing version {last_pip_version}...')
!python3 -m pip install --upgrade pip
else:
print(f'Version {last_pip_version} is already installed')
# A veces necesitamos instalar nuevas librerías en nuestros proyectos
#list all installed packages
installed_pkgs = !pip list
#save installed packages names in a list
installed_pkgs_list = installed_pkgs.fields(0)
#delete installed_pkgs_arr[0] and [1] because they are not packages names
del installed_pkgs_list[0:2]
print(installed_pkgs_list)
#initialize variables of packages we require for this project
req_pkgs = ['openpyxl','pandas','numpy','matplotlib','seaborn','geopandas','geopy']
installed = []
#for each package in the list
for pkg in installed_pkgs_list:
#if pkg is already installed
if(pkg in req_pkgs):
print(f'{pkg} pkg is already installed')
installed.append(pkg)
#if a required pkg is not installed
for pkg in req_pkgs:
if(pkg not in installed):
print(f'Installing {pkg} pkg...')
!pip install {pkg}
# required pkgs imports
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import openpyxl
import seaborn as sns
import geopandas as gpd
from geopandas import GeoDataFrame
import geopy
# Levanto los datos en 3 diferentes dataframes
#--- DB ---
#sql connection
articles_data = sqlite3.connect('/work/data/articles.db')
#sql query
articles_query= "SELECT * from articles"
#create dataframe
df_articles_original = pd.read_sql(articles_query, articles_data)
#close connection
articles_data.close()
#--- XLSX ---
#file path
sellers_data = '/work/data/sellers.xlsx'
#create dataframe
df_sellers_original = pd.read_excel(sellers_data)
#--- CSV ---
#file path
orders_data = '/work/data/orders.csv'
#create dataframe
df_orders_original = pd.read_csv(orders_data)
#----- Exploración del df de artículos -----
df_articles = df_articles_original.copy()
print(df_articles.head())
#check df columns dtypes
print(df_articles.dtypes)
#check df info to get entries, nulls and dtypes
print(df_articles.info())
#----- Exploración del df de vendedores -----
df_sellers = df_sellers_original.copy()
print(df_sellers.head())
#check df columns dtypes
print(df_sellers.dtypes)
#check df info to get entries, nulls and dtypes
print(df_sellers.info())
#----- Exploración del df de órdenes -----
df_orders = df_orders_original.copy()
print(df_orders.head())
#check df columns dtypes
print(df_orders.dtypes)
print(df_orders.info())
#set index to article_id and rename axis
df_articles.set_index('article_id', inplace=True)
df_articles.index.name = None
df_articles.rename_axis('article_id', axis="columns", inplace=True)
#convert unit_price column to float64
df_articles = df_articles.astype({'unit_price':'float64'}, copy=True, errors='raise')
print(df_articles.dtypes)
#set index to seller_id and rename axis
df_sellers.set_index('seller_id', inplace=True)
df_sellers.index.name = None
df_sellers.rename_axis('seller_id', axis="columns", inplace=True)
print(df_articles.dtypes)
#set index to order_id and rename axis
df_orders.set_index('order_id', inplace=True)
df_orders.index.name = None
df_orders.rename_axis('order_id', axis="columns", inplace=True)
print(df_orders.dtypes)
#merge df_orders with df_sellers on seller_id
my_df = pd.merge(df_orders, df_sellers, left_on='seller_id',right_on=df_sellers.index)
#merge my_df with df_articles on article_id
my_df = pd.merge(my_df, df_articles, left_on='article_id', right_on=df_articles.index)
#add total_amount column to my_df
my_df['total_amount'] = my_df['quantity'] * my_df['unit_price']
#set index to order_id and rename axis
my_df.set_index('article_id', inplace=True)
my_df.index.name = None
#my_df.rename_axis('article_id', axis="columns", inplace=True)
my_df
# --- SEABORN PALETTE FOR PLOTS ---
palette='pastel'
color_blue = sns.color_palette(palette)[0]
color_orange = sns.color_palette(palette)[1]
color_green = sns.color_palette(palette)[2]
color_red = sns.color_palette(palette)[3]
color_purple = sns.color_palette(palette)[4]
color_brown = sns.color_palette(palette)[5]
color_pink = sns.color_palette(palette)[6]
color_grey = sns.color_palette(palette)[7]
color_yellow = sns.color_palette(palette)[8]
color_turquoise = sns.color_palette(palette)[9]
palette='dark'
color_blue_dark = sns.color_palette(palette)[0]
color_orange_dark = sns.color_palette(palette)[1]
color_green_dark = sns.color_palette(palette)[2]
color_red_dark = sns.color_palette(palette)[3]
color_purple_dark = sns.color_palette(palette)[4]
color_brown_dark = sns.color_palette(palette)[5]
color_pink_dark = sns.color_palette(palette)[6]
color_grey_dark = sns.color_palette(palette)[7]
color_yellow_dark = sns.color_palette(palette)[8]
color_turquoise_dark = sns.color_palette(palette)[9]
# RESOLUCIÓN ANALÍTICA
#groupby article_name on the column quantity and sum them
selled_articles = my_df.groupby(['article_name'])[['article_name','quantity']].sum(['quantity'])
selled_articles.index.name = 'article_name'
#get the most selled article name
most_selled = selled_articles.idxmax()
#print(most_selled['quantity'])
#get the most selled article quantity
most_selled_qty = selled_articles.max()
#print(most_selled_qty['quantity'])
print(f'(ES) El artículo más vendido fue "{most_selled["quantity"]}" con {most_selled_qty["quantity"]} unidades vendidas.')
print(f'(EN) The most selled article was "{most_selled["quantity"]}" with {most_selled_qty["quantity"]} units selled.')
# get top 5 articles
top_5_articles = selled_articles.sort_values(by="quantity", ascending=False).head()
# creating a np array for top_5_articles
top_5_articles_arr = np.array([top_5_articles.index, top_5_articles["quantity"]])
print(top_5_articles_arr)
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- CREATE PLOT BAR -----
top_5_articles_plot = plt.bar(
top_5_articles_arr[0], #x axis
top_5_articles_arr[1], #y axis
bottom=0, #y axis starts from
width=0.7, #bars width
align='center', #bars align
color=color_orange, #'orange', #bars color
edgecolor='black', #bars edgecolor (border)
linewidth=0.5, #bars edge line width
)
# --- PLOT LABEL ---
plt.bar_label(
top_5_articles_plot,
padding=-80,
color='white',
fontweight='bold'
)
# --- PLOT GRID ---
plt.grid(visible=False)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Top 5 selled articles'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_orange_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Articles'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# CREATE X TICKS
plt.xticks(
top_5_articles_arr[0],
rotation=0
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Qty'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/top_5_selled_articles.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby article_name on the column total_amount and sum them
articles_income = my_df.groupby(['article_name'])[['article_name','total_amount']].sum(['total_amount'])
articles_income.index.name = 'article_name'
#get the most selled article name
most_selled = articles_income.idxmax()
#get the most selled article total amount
most_selled_total_amount = articles_income.max().astype(int)
print(f'(ES) El artículo más vendido fue "{most_selled["total_amount"]}" con un total de ${most_selled_total_amount["total_amount"]}.')
print(f'(EN) The most selled article was "{most_selled["total_amount"]}" with a total of ${most_selled_total_amount["total_amount"]}.')
# get top 5 most income articles
top_5_income_articles = articles_income.sort_values(by="total_amount", ascending=False).head()
# creating a np array for top_5_income_articles
top_5_income_articles_arr = np.array([top_5_income_articles.index, top_5_income_articles["total_amount"].astype(int)])
print(top_5_income_articles_arr)
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- CREATE PLOT BAR HORIZONTAL -----
top_5_income_articles_plot = plt.barh(
top_5_income_articles_arr[0], #x axis
top_5_income_articles_arr[1], #y axis
#bottom=0, #y axis starts from
#width=0.7, #bars width
align='center', #bars align
color=color_blue, #bars color
edgecolor='black', #bars edgecolor (border)
linewidth=0.5, #bars edge line width
)
# --- PLOT LABEL ---
plt.bar_label(
top_5_income_articles_plot,
padding=10,
color=color_blue,
fontweight='bold'
)
# --- PLOT GRID ---
plt.grid(visible=False)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Top 5 articles income'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_blue_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Total amount ($)'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Articles'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/top_5_articles_income.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby seller_name on the column total_amount and sum them
top_month_sellers = my_df.groupby(['seller_name'])[['seller_name','total_amount']].sum(['total_amount'])
top_month_sellers.index.name = 'seller_name'
#get the top month seller name
top_month_seller = top_month_sellers.idxmax()
#get the top month seller total amount
top_month_seller_total_amount = top_month_sellers.max().astype(int)
print(f'(ES) Se debería otorgar el bono al vendedor {top_month_seller["total_amount"]}. Su total de ingresos fue de ${top_month_seller_total_amount["total_amount"]}.')
print(f'(EN) The bonus should be awarded to the seller {top_month_seller["total_amount"]}. Their total revenue was ${top_month_seller_total_amount["total_amount"]}.')
# get top 5 month sellers
top_5_month_sellers = top_month_sellers.sort_values(by="total_amount", ascending=False).head()
# creating a np array for top_5_month_sellers
top_5_month_sellers_arr = np.array([top_5_month_sellers.index, top_5_month_sellers["total_amount"].astype(int)])
top_5_sellers_names = top_5_month_sellers_arr[0]
top_5_sellers_amount = top_5_month_sellers_arr[1]
print(top_5_month_sellers_arr)
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- FUNCTION FOR AUTOPCT PARAMETER -----
def create_autopct(values):
def my_autopct(pct):
total = sum(values)
val = int(round(pct*total/100.0))
return '{p:.2f}% \n\n(${v:d})'.format(p=pct,v=val)
return my_autopct
# ----- CREATE PLOT PIE -----
top_5_month_sellers_plot = plt.pie(
top_5_sellers_amount, #wedge sizes
labels=top_5_sellers_names, #labels of each wedge
colors= sns.color_palette('pastel'), #wedges colors
autopct=create_autopct(top_5_sellers_amount), #percentage
shadow=True, #shadow of the plot
explode=(0.1,0,0,0,0) #
)
plt.axis("equal")
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Top 5 sellers of the month'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_purple_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# --- SAVE FIG ---
plt.savefig('img/top_5_sellers_of_the_month.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby week on the column total_amount and sum them
weeks_total_amount = my_df.groupby(['week'])[['week','total_amount']].sum(['total_amount'])
weeks_total_amount.index.name = 'week'
#get the top month seller name
week_most_income = weeks_total_amount.idxmax()
#get the top month seller total amount
week_most_income_total_amount = weeks_total_amount.max().astype(int)
print(f'(ES) La semana con mayor cantidad de ventas fue la {week_most_income["total_amount"]}°. El total de ingresos fue de ${week_most_income_total_amount["total_amount"]}.')
print(f'(EN) The week with the most sales was the {week_most_income["total_amount"]}° one. Total income was ${week_most_income_total_amount["total_amount"]}.')
# RESOLUCIÓN GRÁFICA
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- CREATE FIGURE -----
ordered_weeks_by_income_plot = plt.plot(
ordered_week_names, #x axis
ordered_week_amount, #y axis
color=color_turquoise, #line color
linewidth=5, # edge line width
)
# --- PLOT GRID ---
plt.grid(visible=False)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Weeks vs income'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_turquoise_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Weeks'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# CREATE X TICKS
plt.xticks(
ordered_week_names,
rotation=0
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Total amount ($)'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/week_income.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby country on the column total_amount and sum them
countries_total_amount = my_df.groupby(['country_name'])[['country_name','total_amount']].sum(['total_amount'])
countries_total_amount.index.name = 'country_name'
#get the top country name
country_most_income = countries_total_amount.idxmax()
#get the top country total amount
country_most_income_total_amount = countries_total_amount.max().astype(int)
print(f'(ES) El país con mayor cantidad de ventas fue {country_most_income["total_amount"]}. El total de ingresos fue de ${country_most_income_total_amount["total_amount"]}.')
print(f'(EN) The country with the most sales was {country_most_income["total_amount"]}. Total income was ${country_most_income_total_amount["total_amount"]}.')
# get world map from geopandas
world_map = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
#print(world_map.groupby(['continent']).sum())
# get america countries for map
america=['South America','North America']
america_map = world_map.query("continent in @america")
# create one figure and one subplot
fig, axes = plt.subplots(figsize = (12,8),facecolor = plt.cm.Blues(.2))
axes.set_facecolor(plt.cm.Blues(.2))
#create a countries column to use with the geopandas
countries_total_amount['countries'] = countries_total_amount.index
#create a geopandas dataframe from the countries_total_amount df
countries_total_amount = gpd.GeoDataFrame(countries_total_amount)
#add a geometry column to the countries_total_amount df
# this is required for using the geopandas
countries_total_amount['geometry'] = gpd.tools.geocode(countries_total_amount['countries']).geometry
#create a GeoDataFrame plot for the world map with the countries
america_map.plot(color="white",ax=axes)
# create a geopandas plot
countries_total_amount.plot(
column = 'total_amount',
cmap = 'Greens',
ax = axes,
legend = True,
markersize= 300,
legend_kwds = {
'label': "Total amount ($)"
},
missing_kwds = {
'facecolor':'Gray'
}
)
# create figure title
fig.suptitle(
'Total amount of sales by country',
y=1,
fontsize = 'xx-large',
fontweight = 'bold'
)
# --- SAVE FIG ---
plt.savefig('img/sales_per_country_map.png')
# --- SHOW PLOT ---
plt.show()
# get countries ordered by total_amount
top_5_countries = countries_total_amount.sort_values(by="total_amount", ascending=False).head()
# creating a np array for top_5_countries
top_5_countries_arr = np.array([top_5_countries.index, top_5_countries["total_amount"].astype(int)])
top_5_countries_names = top_5_countries_arr[0]
top_5_countries_amount = top_5_countries_arr[1]
print(top_5_countries_arr)
# RESOLUCIÓN GRÁFICA
# ----- CREATE PLOT FIGURE -----
plt.figure(figsize=(10,6))
# ----- CREATE PLOT BAR -----
top_5_countries_plot = plt.bar(
top_5_countries_names, #x axis
top_5_countries_amount, #y axis
bottom=0, #y axis starts from
width=0.7, #bars width
align='center', #bars align
color=color_green, #bars color
edgecolor='black', #bars edgecolor (border)
linewidth=0.5, #bars edge line width
)
# --- PLOT LABEL ---
plt.bar_label(
top_5_countries_plot,
padding=-80,
color='white',
fontweight='bold'
)
# --- PLOT GRID ---
plt.grid(visible=False)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Top 5 countries'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_green_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Countries'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# CREATE X TICKS
plt.xticks(
top_5_countries_arr[0],
rotation=0
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Total amount ($)'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/top_5_countries.png')
# --- SHOW PLOT ---
plt.show()
# RESOLUCIÓN ANALÍTICA
#groupby country_name and seller_name on the column total_amount, then sum them
best_seller_by_country = my_df.groupby(['country_name','seller_name']).sum(['total_amount'])
#get countries from my_df
countries = my_df.groupby('country_name').sum().index.values
#create a new df
df_best_seller_by_country = pd.DataFrame(columns=['seller_name', 'total_amount'], index=countries)
for country in countries:
#search for country in df
loc_country = best_seller_by_country.loc[country]
#get max total_amount for that country
total_amount = loc_country.max()['total_amount']
#get max total_amount seller for that country
seller = loc_country[(loc_country['total_amount'] == total_amount)].index[0]
#add seller and total amount to final df
df_best_seller_by_country.loc[country] = {'seller_name': seller, 'total_amount': total_amount}
df_best_seller_by_country = df_best_seller_by_country.astype({'total_amount':'int'})
print(df_best_seller_by_country)
# RESOLUCIÓN ANALÍTICA
#groupby country_name and article_name on the column total_amount, then sum them
most_selled_article_by_country = my_df.groupby(['country_name','article_name']).sum(['total_amount'])
#get countries from my_df
countries = my_df.groupby('country_name').sum().index.values
#create a new df
df_most_selled_article_by_country = pd.DataFrame(columns=['article_name', 'total_amount'], index=countries)
for country in countries:
#search for country in df
loc_country = most_selled_article_by_country.loc[country]
#get max total_amount for that country
total_amount = loc_country.max()['total_amount']
#get max total_amount article for that country
article = loc_country[(loc_country['total_amount'] == total_amount)].index[0]
#add article and total amount to final df
df_most_selled_article_by_country.loc[country] = {'article_name': article, 'total_amount': total_amount}
df_most_selled_article_by_country = df_most_selled_article_by_country.astype({'total_amount':'int'})
print(df_most_selled_article_by_country)
#groupby country_name and article_name on the column total_amount, then sum them
most_selled_article_by_country = my_df.groupby(['country_name','article_name']).sum(['total_amount']).reset_index()
most_selled_article_by_country = most_selled_article_by_country.sort_values(by='total_amount', ascending=False)
most_selled_article_by_country = most_selled_article_by_country.head(20)
#print(most_selled_article_by_country)
countries = most_selled_article_by_country['country_name']
articles = most_selled_article_by_country['article_name']
total_amount = most_selled_article_by_country['total_amount']
# ----- CREATE PLOT FIGURE -----
# create one figure and one subplot
fig, axes = plt.subplots(figsize = (12,8))
most_selled_article_by_country_plot = sns.barplot(
x=countries,
y=total_amount,
hue=articles,
palette="pastel",
edgecolor=".6",
data=most_selled_article_by_country,
dodge=False,
#ax=axes
)
# --- PLOT TITLE ---
# PLOT TITLE TEXT
plot_title = 'Most selled articles by country'
# PLOT TITLE LOCATION
plot_title_loc='center'
# PLOT TITLE FONT
plot_title_font={
'fontsize': 16,
'fontweight': 'bold',
'fontname':'monospace',
'color': color_brown_dark,
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# PLOT TITLE PADDING
plot_title_pad = 30
# CREATE PLOT TITLE
plt.title(
label=plot_title,
loc=plot_title_loc,
fontdict=plot_title_font,
pad=plot_title_pad
)
# ----- AXIS X -----
# X LABEL TEXT
xlabel_text= 'Countries'
# X LABEL LOCATION
xlabel_loc = 'center'
# X LABEL PADDING
xlabel_pad = 20
# X LABEL FONT
xlabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE X LABEL
plt.xlabel(
xlabel=xlabel_text,
loc=xlabel_loc,
labelpad=xlabel_pad,
fontdict=xlabel_font
)
# ----- AXIS Y -----
# Y LABEL TEXT
ylabel_text= 'Total amount ($)'
# Y LABEL LOCATION
ylabel_loc = 'center'
# Y LABEL LOCATION
ylabel_pad = 20
# Y LABEL FONT
ylabel_font={
'fontsize': 12,
'fontweight': 'normal',
'color': 'black',
'verticalalignment': 'center',
'horizontalalignment': 'center'
}
# CREATE Y LABEL
plt.ylabel(
ylabel=ylabel_text,
loc=ylabel_loc,
labelpad=ylabel_pad,
fontdict=ylabel_font,
)
plt.legend(title='Articles', loc='upper right')
# --- removing the top and right axes spines
sns.despine()
# --- SAVE FIG ---
plt.savefig('img/most_selled_articles_by_country.png')
# --- SHOW PLOT ---
plt.show()