Install Libraries
!pip install plotly
!pip install seaborn
!pip install world_bank_data
Import libraries
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import world_bank_data as wb
# from kaleido.scopes.plotly import PlotlyScope
Load data:
# Indicator codes passed to ``wb.get_series`` in the loading cells below.
# NOTE(review): the meaning of each code is inferred from the constant names;
# confirm units and definitions against the World Bank indicator catalogue.
CO2_EMISSION_DATA = "EN.ATM.CO2E.KT"
COUNTRY_POPULATION_SIZE_DATA = "SP.POP.TOTL"
POVERTY_HEADCOUNT_RATIO_DATA = "SI.POV.DDAY"
COUNTRY_LAND_AREA_DATA = "AG.LND.TOTL.K2"
COUNTRY_GDP_DATA = "NY.GDP.MKTP.CD"
CO2 emissions data:
# Fetch the CO2-emissions indicator, move its index levels into ordinary
# columns, and give the value column a readable name.
co2_emissions_series = wb.get_series(CO2_EMISSION_DATA, simplify_index=True)
co2_emissions_series = co2_emissions_series.reset_index()
co2_emissions_series = co2_emissions_series.rename(
    columns={CO2_EMISSION_DATA: "co2 emissions"}
)

# Work on a separate DataFrame object so later edits leave the raw fetch alone.
co2_emissions_df = pd.DataFrame(co2_emissions_series)
co2_emissions_df.head()
Country population data:
# Population is requested for 2019 only: it is the most recent year with
# non-missing data.
country_population_series = wb.get_series(
    COUNTRY_POPULATION_SIZE_DATA, simplify_index=True, date="2019"
)
country_population_series = country_population_series.reset_index()
country_population_series = country_population_series.rename(
    columns={COUNTRY_POPULATION_SIZE_DATA: "population size"}
)

country_population_df = pd.DataFrame(country_population_series)
country_population_df.head()
Country poverty percentage data:
# Fetch the poverty headcount indicator for all available years and flatten
# it into a table with a readable value column.
poverty_percentages_series = wb.get_series(
    POVERTY_HEADCOUNT_RATIO_DATA, simplify_index=True
)
poverty_percentages_series = poverty_percentages_series.reset_index()
poverty_percentages_series = poverty_percentages_series.rename(
    columns={POVERTY_HEADCOUNT_RATIO_DATA: "poverty percentage"}
)

poverty_percentages_df = pd.DataFrame(poverty_percentages_series)
poverty_percentages_df.head()
Country land area data:
# Land area is requested for 2017 only: it is the most recent year with
# non-missing data.
country_land_area_series = wb.get_series(
    COUNTRY_LAND_AREA_DATA, simplify_index=True, date="2017"
)
country_land_area_series = country_land_area_series.reset_index()
country_land_area_series = country_land_area_series.rename(
    columns={COUNTRY_LAND_AREA_DATA: "land area"}
)

country_land_area_df = pd.DataFrame(country_land_area_series)
country_land_area_df.head()
Country GDP data:
# Fetch the GDP indicator for all available years and flatten it into a table
# with a readable value column.
country_gdp_series = wb.get_series(COUNTRY_GDP_DATA, simplify_index=True)
country_gdp_series = country_gdp_series.reset_index()
country_gdp_series = country_gdp_series.rename(columns={COUNTRY_GDP_DATA: "GDP"})

country_gdp_df = pd.DataFrame(country_gdp_series)
country_gdp_df.head()
Country income group data:
# Country metadata table; its "region", "name" and "incomeLevel" columns are
# used further down. The index is reset so it carries no information.
countries_df = wb.get_countries()
countries_df = countries_df.reset_index()
countries_df.head()
Preprocess data:
Convert dtypes:
CO2 emissions data:
# Show the dtypes, cast "Year" to int so it can be compared with integer
# literals and merged on, then show the dtypes again to confirm the change.
co2_emissions_df.dtypes
co2_emissions_df = co2_emissions_df.astype({"Year": int})
co2_emissions_df.dtypes
Country population data:
# Inspect the column dtypes. No "Year" cast is applied to this frame; it was
# fetched for the single date "2019".
country_population_df.dtypes
Country poverty percentage data:
# Show the dtypes, cast "Year" to int so it can be merged with the other
# time-based frames on ["Country", "Year"], then confirm the change.
# The "before" inspection reads the working DataFrame, like every sibling
# cell, rather than the raw ``*_series`` object it was copied from.
poverty_percentages_df.dtypes
poverty_percentages_df["Year"] = poverty_percentages_df["Year"].astype(int)
poverty_percentages_df.dtypes
Country land area data:
# Inspect the column dtypes. No "Year" cast is applied to this frame; it was
# fetched for the single date "2017".
country_land_area_df.dtypes
Country GDP data:
# Show the dtypes, cast "Year" to int (it is later filtered with
# ``== 2019`` and merged on), then show the dtypes again.
country_gdp_df.dtypes
country_gdp_df = country_gdp_df.astype({"Year": int})
country_gdp_df.dtypes
Drop non-country rows
# Names that are not individual countries: every entry whose region is
# "Aggregates", plus two regional groupings appended by hand.
non_countries = np.append(
    countries_df.loc[countries_df["region"] == "Aggregates", "name"].values,
    ["Latin America & Caribbean", "Sub-Saharan Africa"],
)
# Strip aggregate/regional rows from every dataset. The drop is done in place
# on purpose: ``df`` is only an alias, and the module-level frames themselves
# must lose the rows.
for df in (
    co2_emissions_df,
    country_population_df,
    poverty_percentages_df,
    country_land_area_df,
    country_gdp_df,
):
    # The last column is the value column, so it doubles as a dataset label.
    print(f"{df.columns[-1]} shape before: {df.shape}")
    df.drop(index=df.index[df["Country"].isin(non_countries)], inplace=True)
    print(f"{df.columns[-1]} shape after: {df.shape}")
Create aggregated data:
# Total emissions per country summed over every available year, with the
# largest emitters first.
co2_emissions_agg_df = (
    co2_emissions_df.groupby("Country", as_index=False)
    .agg({"co2 emissions": "sum"})
    .sort_values("co2 emissions", ascending=False)
)
co2_emissions_agg_df.head()
Merge data into two data frames:
Merge non-time-based statistics:
# Metadata columns from ``countries_df`` that the analysis does not use.
columns_to_drop = [
    "id",
    "iso2Code",
    "name",
    "adminregion",
    "lendingType",
    "capitalCity",
    "longitude",
    "latitude",
]

# Build one row per country: total emissions + land area + population +
# country metadata. The metadata table is keyed by "name" rather than "Country".
overall_stats_df = co2_emissions_agg_df.merge(country_land_area_df, on="Country")
overall_stats_df = overall_stats_df.merge(country_population_df, on="Country")
overall_stats_df = overall_stats_df.merge(
    countries_df, left_on="Country", right_on="name"
)
overall_stats_df = overall_stats_df.drop(columns=columns_to_drop)
overall_stats_df = overall_stats_df.sort_values(by="co2 emissions", ascending=False)
overall_stats_df.head()
Merge time-based data frames:
# One row per (country, year) carrying emissions, GDP and poverty share,
# ordered chronologically.
time_series_df = co2_emissions_df.merge(country_gdp_df, on=["Country", "Year"])
time_series_df = time_series_df.merge(
    poverty_percentages_df, on=["Country", "Year"]
)
time_series_df = time_series_df.sort_values("Year")
time_series_df.head()
Create new statistics:
# Preview the merged per-country table before deriving ratio columns from it.
overall_stats_df.head()
CO2 emissions per square kilometre:
# Total (all-years) emissions divided by the country's land area.
overall_stats_df["emissions per area"] = overall_stats_df["co2 emissions"].div(
    overall_stats_df["land area"]
)
CO2 emissions per capita:
# Total (all-years) emissions divided by the country's population size.
overall_stats_df["emissions per capita"] = overall_stats_df["co2 emissions"].div(
    overall_stats_df["population size"]
)
Data visualization:
def get_top_n_countries(n: int, by: str):
    """Return the names of the ``n`` countries with the largest ``by`` value.

    Reads the module-level ``overall_stats_df``; ``by`` must be one of its
    columns. The result is an array of "Country" values ordered from the
    highest to the lowest ``by`` value.
    """
    ranked = overall_stats_df.sort_values(by=by, ascending=False)
    return ranked.head(n)["Country"].values
Who emitted the most in recent years?
Let's show the countries that emitted the most in 2016:
# Bar chart of the 15 largest emitters in 2016.
px.bar(
    time_series_df[time_series_df["Year"] == 2016]
    .sort_values("co2 emissions", ascending=False)
    .head(15),
    x="Country",
    y="co2 emissions",
)
Which countries emitted the most CO2 in total?
Let's visualize emissions throughout history:
# Share of all-time emissions among the ten largest total emitters.
fig = px.pie(
    overall_stats_df[
        overall_stats_df["Country"].isin(get_top_n_countries(10, "co2 emissions"))
    ],
    names="Country",
    values="co2 emissions",
    color="Country",
    color_discrete_sequence=px.colors.sequential.Reds_r,
)
fig.show()

# The same ten countries as a bar chart, shaded by their emission total.
px.bar(
    overall_stats_df[
        overall_stats_df["Country"].isin(get_top_n_countries(10, "co2 emissions"))
    ],
    x="Country",
    y="co2 emissions",
    color="co2 emissions",
)
Treemap chart:
# TODO: add treemap chart
Emissions per capita:
If a country has more people, its total emissions will naturally tend to be higher, so we also compare emissions per capita.
overall_stats_df.head()

# Ten countries with the highest emissions per capita, highest first.
px.bar(
    overall_stats_df[
        overall_stats_df["Country"].isin(
            get_top_n_countries(n=10, by="emissions per capita")
        )
    ].sort_values("emissions per capita", ascending=False),
    x="Country",
    y="emissions per capita",
)
Emissions per country area:
# Fifteen countries with the highest emissions per unit of land area,
# highest first.
px.bar(
    overall_stats_df[
        overall_stats_df["Country"].isin(
            get_top_n_countries(n=15, by="emissions per area")
        )
    ].sort_values("emissions per area", ascending=False),
    x="Country",
    y="emissions per area",
)
How CO2 emissions increased by country
# Yearly emissions for the seven largest total emitters, one line per country.
px.line(
    time_series_df[
        time_series_df["Country"].isin(get_top_n_countries(n=7, by="co2 emissions"))
    ],
    x="Year",
    y="co2 emissions",
    color="Country",
    line_group="Country",
)
How are CO2 emissions related to GDP and poverty?
# Compare CO2 emissions, GDP and poverty over time for the three largest
# total emitters, using one stacked panel per metric.
#
# Changes from the previous version of this cell:
#   * the stray chained assignment (``temp_df = data_frame = ...``) is gone,
#     so no unused ``data_frame`` global is created;
#   * each country gets its own trace, because a single trace over all three
#     countries zigzags between them at every year;
#   * the third panel, which was declared but left empty, now shows the
#     poverty percentage.
temp_df = time_series_df.loc[
    lambda x: x["Country"].isin(get_top_n_countries(n=3, by="co2 emissions"))
]

metrics = ["co2 emissions", "GDP", "poverty percentage"]
fig = make_subplots(
    rows=len(metrics),
    cols=1,
    shared_xaxes=True,
    subplot_titles=metrics,
)

palette = px.colors.qualitative.Plotly
for idx, (country, country_df) in enumerate(temp_df.groupby("Country")):
    # One fixed colour per country so it is recognisable across all panels.
    color = palette[idx % len(palette)]
    for row, metric in enumerate(metrics, start=1):
        fig.add_trace(
            go.Scatter(
                x=country_df["Year"],
                y=country_df[metric],
                name=country,
                legendgroup=country,
                # List each country once in the legend, not once per panel.
                showlegend=(row == 1),
                # Markers keep isolated observations visible when
                # neighbouring years are missing.
                mode="lines+markers",
                line=dict(color=color),
            ),
            row=row,
            col=1,
        )
fig.show()
overall_stats_df.head()

# 2019 GDP per country, largest economies first.
country_gdp_2019_df = country_gdp_df.loc[
    country_gdp_df["Year"] == 2019, ["Country", "GDP"]
].sort_values(by="GDP", ascending=False)

# Attach the GDP column to the per-country statistics table.
overall_stats_df = overall_stats_df.merge(country_gdp_2019_df, on="Country")
overall_stats_df.head()
# Emissions against GDP for the first 30 rows of the statistics table,
# with colour and marker symbol both set by income level.
px.scatter(
    overall_stats_df.head(30),
    x="co2 emissions",
    y="GDP",
    color="incomeLevel",
    symbol="incomeLevel",
)
# Emissions against GDP, restricted to the "High income" group.
high_income_countries_df = overall_stats_df[
    overall_stats_df["incomeLevel"] == "High income"
]
px.scatter(
    high_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)
# Emissions against GDP, restricted to the "Upper middle income" group.
upper_middle_income_countries_df = overall_stats_df[
    overall_stats_df["incomeLevel"] == "Upper middle income"
]
upper_middle_income_countries_df.head()
px.scatter(
    upper_middle_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)
# Emissions against GDP, restricted to the "Lower middle income" group.
lower_middle_income_countries_df = overall_stats_df[
    overall_stats_df["incomeLevel"] == "Lower middle income"
]
px.scatter(
    lower_middle_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)
# Emissions against GDP, restricted to the "Low income" group.
low_income_countries_df = overall_stats_df[
    overall_stats_df["incomeLevel"] == "Low income"
]
px.scatter(
    low_income_countries_df,
    x="co2 emissions",
    y="GDP",
    hover_data=["Country"],
)