import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Load the population table from the CSV in the current working directory.
world_population = pd.read_csv(filepath_or_buffer="./world_population.csv")
# Bare expression: in a notebook this renders the frame as the cell output.
world_population
Country/Otherobject
Afghanistan0.5%
Albania0.5%
199 others99%
Population (2020)int64
97929 - 1439323776
0
Afghanistan
38928346
1
Albania
2877797
2
Algeria
43851044
3
Angola
32866272
4
Antigua and Barbuda
97929
5
Argentina
45195774
6
Armenia
2963243
7
Aruba
106766
8
Australia
25499884
9
Austria
9006398
# Show the per-column dtypes, then the (rows, columns) shape of the frame.
display(world_population.dtypes)
display(world_population.shape)
# Ten rows with the largest "Population (2020)", keeping the first two columns.
# .copy() detaches the slice so the column assignment below writes into an
# independent frame instead of a possible view of the sorted data
# (avoids pandas' chained-assignment / SettingWithCopyWarning).
top_populated = (
    world_population.sort_values(by="Population (2020)", ascending=False)
    .iloc[:10, :2]
    .copy()
)
# Express the population in millions, rounded to two decimals (vectorised).
top_populated["Population (2020)"] = (
    top_populated["Population (2020)"] / 1_000_000
).round(2)
top_populated.columns = ["Country", "Population"]
# Renumber the rows 0..9 so the index reflects the population rank.
top_populated = top_populated.reset_index(drop=True)
top_populated
Countryobject
China10%
India10%
8 others80%
Populationfloat64
128.93 - 1439.32
0
China
1439.32
1
India
1380
2
United States
331
3
Indonesia
273.52
4
Pakistan
220.89
5
Brazil
212.56
6
Nigeria
206.14
7
Bangladesh
164.69
8
Russia
145.93
9
Mexico
128.93
# Bar chart of the rows in top_populated (population in millions).
# NOTE(review): the axes rectangle (width 1.9, height 1.4 in figure fractions)
# extends past the figure; presumably this relies on the notebook cropping the
# output to a tight bounding box -- confirm before changing.
ax = plt.figure(dpi=100, facecolor="#FBFEB7").add_axes(rect=[0, 0, 1.9, 1.4], facecolor="#FDFFC9")
colors = ["#910000", '#C88A0F', "#549431", "#532B72", "#B14BB9", "#822805", "#02FFE5", "#C8CA2B", "#6928BF", "#12B40D"]
bars = ax.bar(x=top_populated.Country, height=top_populated.Population, color=colors, width=0.7)

ax.set_title("Most populated countries in the world", fontdict={"size": 16, "weight": "bold"}, pad=11)
ax.set_xlabel("Country", fontdict={"size": 14})
ax.set_ylabel("Population [M]", fontdict={"size": 14})
ax.set_ylim(0.001)
ax.tick_params(size=7, width=1.2, labelsize=12)

# Write each bar's value, horizontally centred, just above the top of the bar.
for patch in bars:
    value = patch.get_height()
    centre_x = patch.get_x() + patch.get_width() / 2
    ax.text(centre_x, value, s=f"{value}", ha="center", va="bottom", fontdict={"size": 12})

plt.show()
# Ten rows with the highest "Fert. Rate", reduced to country and rate.
by_fertility = world_population.sort_values(by="Fert. Rate", ascending=False)
newborns = by_fertility[["Country/Other", "Fert. Rate"]].reset_index(drop=True)
newborns.columns = ["country", "fertility_rate"]
newborns = newborns.head(10)

colors = ["#6928BF", "#75B199", "#12B40D", "#FB443A", "#0E3C3C", "#604904", "#B14BB9", "#822805", "#02FFE5", "#C8CA2B"]

# Canvas and background colours.
fig, ax = plt.subplots(figsize=(14, 8))
fig.set_facecolor("#FBFEB7")
ax.set_facecolor("#FDFFC9")

bars = ax.bar(x=newborns["country"], height=newborns["fertility_rate"], width=0.5, color=colors)

# Write each bar's value, horizontally centred, just above the top of the bar.
for patch in bars:
    value = patch.get_height()
    centre_x = patch.get_x() + patch.get_width() / 2
    ax.text(centre_x, value, s=f"{value}", ha="center", va="bottom", fontdict={"size": 12})

ax.set_title("Countries with most newborns", fontdict={"size": 16, "weight": "bold"}, pad=11)
ax.set_xlabel("Country", fontdict={"size": 15})
ax.set_ylabel("Fertility rate", fontdict={"size": 15})
ax.set_ylim(0.001)
ax.tick_params(size=7, width=1.2, labelsize=14)
plt.show()
# Scatter plot of "Migrants (net)" (x) against "Net Change" (y) for every row
# of world_population.
fig, ax = plt.subplots(figsize=(10, 7))
fig.set_facecolor("#FBFEB7")
ax.set_facecolor("#FDFFC9")

x = world_population["Migrants (net)"]
y = world_population["Net Change"]

# The marker colour is pinned to the value colors[7] has when the cells run top
# to bottom. Indexing the shared `colors` list made the colour depend on which
# palette cell had been executed last, because other cells reassign `colors`.
ax.scatter(x=x, y=y, c="#822805", label="Original data")

ax.set_title("Amount of migrants Vs Change in population", fontdict={"size": 16, "weight": "bold"}, pad=10)
ax.set_ylabel("Net change", fontdict={"size": 14})
ax.set_xlabel("Migrants", fontdict={"size": 14})
ax.tick_params(size=7, width=1.2, labelsize=12)
plt.show()
# Same scatter plot as above, zoomed in with fixed axis limits and overlaid
# with a least-squares regression line; the fit statistics are printed below.
fig, ax = plt.subplots(figsize=(10, 7))
fig.set_facecolor("#FBFEB7")
ax.set_facecolor("#FDFFC9")

x = world_population["Migrants (net)"]
y = world_population["Net Change"]

# The marker colour is pinned to the value colors[7] has when the cells run top
# to bottom. Indexing the shared `colors` list made the colour depend on which
# palette cell had been executed last, because other cells reassign `colors`.
ax.scatter(x=x, y=y, c="#822805", label="Original data")

ax.set_title("Amount of migrants Vs Change in population", fontdict={"size": 16, "weight": "bold"}, pad=10)
ax.set_ylabel("Net change", fontdict={"size": 14})
ax.set_xlabel("Migrants", fontdict={"size": 14})
# Fixed view limits: points outside this window are not shown.
ax.set_xlim((-200_000, 200_000))
ax.set_ylim((-2_000_000, 0.8 * 10_000_000))
ax.tick_params(size=7, width=1.2, labelsize=12)

# Regression line. The fit uses every row, including points that fall outside
# the axis limits set above.
res = stats.linregress(x, y)
ax.plot(x, res.slope * x + res.intercept, label='Fitted line', c='#549431')
ax.legend(facecolor="#FBFEB7", edgecolor="grey", fontsize=12)
plt.show()

# Compute the Pearson correlation once and reuse both values in the report.
correlation, p_value = stats.pearsonr(x, y)
print(f" R-squared: {res.rvalue**2}, correlation coefficient:{correlation}, p-value: {p_value}")
R-squared: 0.09129945800401301, correlation coefficient:-0.3021580017209755, p-value: 1.3039223376562615e-05
# Fifteen rows with the highest density, ordered ascending for the horizontal
# bar chart that follows.
# Columns are taken by position (0 and 4) -- presumably the country name and
# the density column; verify against the CSV header if the file layout changes.
# .copy() gives an independent frame so renaming its columns cannot touch
# world_population.
density = world_population.iloc[:, [0, 4]].copy()
density.columns = ["Country", "Density"]
density = (
    density.sort_values(by="Density", ascending=False)
    .reset_index(drop=True)  # index now equals the density rank (0 = densest)
    .iloc[:15]
    # Non-inplace sort: the original sorted the sliced frame in place, which
    # mutates a slice and can raise SettingWithCopyWarning.
    .sort_values(by="Density")
)
density
Countryobject
Mauritius6.7%
Lebanon6.7%
13 others86.7%
Densityint64
626 - 21645
14
Mauritius
626
13
Lebanon
667
12
Barbados
668
11
Taiwan
673
10
Mayotte
728
9
Micronesia
784
8
State of Palestine
847
7
Channel Islands
915
6
Bangladesh
1265
5
Malta
1380
# Horizontal bar chart of the rows in density, one bar per country.
# NOTE(review): the axes rectangle (1.2 x 1.2 in figure fractions) extends past
# the figure; presumably this relies on the notebook cropping the output to a
# tight bounding box -- confirm before changing.
ax = plt.figure(dpi=70, facecolor="#FBFEB7").add_axes(rect=[0, 0, 1.2, 1.2], facecolor="#FDFFC9")
colors = ["#910000", '#C88A0F', "#549431", "#532B72", "#B14BB9", "#822805", "#02FFE5", "#C8CA2B", "#6928BF", "#12B40D", "#FB443A", "#0E3C3C", "#604904", "#B14BB9", "#822805"]
bars = ax.barh(y=density.Country, width=density.Density, height=0.8, color=colors)

# Write each bar's value to the right of the bar, vertically centred on it.
for bar in bars:
    label_y_position = bar.get_y() + bar.get_height() / 2
    width = bar.get_width()
    ax.text(width, label_y_position, s=f"{width}", ha="left", va="center", fontdict={"size": 10})

ax.set_title("Most densely populated countries in the world", fontdict={"size": 16, "style": "italic"}, pad=11)
ax.set_xlabel("Density (P/Km²)", fontdict={"size": 14}, labelpad=10)
ax.tick_params(size=7, width=1.2, labelsize=12)
# The values are printed next to the bars, so the x tick marks are removed.
ax.set_xticks([])
ax.set_xlim(0.001, 23_600)

# Upper y limit: centre of the last bar plus a 0.8 margin. It is computed
# explicitly from the last bar rather than from a variable left over from the
# annotation loop, so it no longer depends on the loop's final state.
last_bar = bars[-1]
top_bar_centre = last_bar.get_y() + last_bar.get_height() / 2
ax.set_ylim(-0.8, top_bar_centre + 0.8)
plt.show()