# packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so every matplotlib figure below uses it.
sns.set()
## Helper Functions
def get_df(dataset, region, series):
    """Return the index labels of the rows matching one region and one series.

    Despite its name, this returns row labels, not a DataFrame; callers in
    this file pass the result back into ``dataset.iloc[...]``.

    Args:
        dataset: DataFrame with 'Region/Country/Area Name' and 'Series'
            columns.
        region: Value to match in the 'Region/Country/Area Name' column.
        series: Value to match in the 'Series' column.

    Returns:
        A pandas Index with the labels of the matching rows, in the order
        they appear in ``dataset``. Empty when nothing matches.
    """
    # A single combined mask replaces the earlier two-step filter that fed
    # index labels into ``iloc``; that only worked when the labels happened
    # to equal row positions (a default 0..n-1 index).
    matches = (
        (dataset['Region/Country/Area Name'] == region)
        & (dataset['Series'] == series)
    )
    return dataset.index[matches.to_numpy()]
# 4. Load the data into appropriate pandas data frames
# The load runs before steps 2 and 3 because step 2 needs `dataset` to exist;
# calling dataset.head() ahead of this assignment raises NameError in a script.
dataset = pd.read_csv(
    "SYB63_327_202009_International Migrants and Refugees.csv",
    # Keep only columns whose header does not contain 'Unnamed'.
    usecols=lambda x: 'Unnamed' not in x,
    # Skip the first line of the file; the header is read from the next one.
    skiprows=1,
)
# These two columns are not used by any of the analysis below.
dataset.drop(['Footnotes', 'Source'], inplace=True, axis=1)

# 2. Cursory examination of the dataset
# A bare expression displays nothing outside a notebook, so print explicitly.
# Check first five elements
print(dataset.head())
# Check last five elements
print(dataset.tail())

# 3. Identifying a set of preliminary questions that we would like to answer with the dataset
# Qn1. Comparison between continents in asylum seekers
# Qn2. What is the difference between Asia and Africa in immigrants over time
# Qn3. Take an example country and compare male and female migrants over time
# Qn4. Compare migrants from Africa vs those from Sub-Saharan Africa
# Qn5. What is the change in the number of asylum seekers from war-ridden countries
# Qn6. Correlation between different African regions and Africa in general
# 5.a. Understanding the data- attributes/fields, data types, data size, significant fields
dataset.info()

# 5.b. Understanding the basic statistics
# describe() only returns the summary; print it so it is visible in a script.
print(dataset['Value'].describe())

# 5.c. Using relevant visualizations to help understand and communicate insights from the data.
axis1 = "International migrant stock: Both sexes (% total population)"
axis2 = "International migrant stock: Male (% total Population)"
axis3 = "International migrant stock: Female (% total Population)"
region = "Total, all countries or areas"

# One line per series, drawn in the same order as the legend entries.
for series_name in (axis1, axis2, axis3):
    df = dataset.iloc[get_df(dataset, region, series_name)]
    plt.plot(df['Year'], df['Value'])
plt.legend([axis1, axis2, axis3])
plt.title("International migrant stock in %", fontsize=12)
plt.xlabel("Time")
plt.ylabel("% of population")
plt.show()

# Same region, both-sexes percentage series on its own figure.
# `df` is left holding this subset; step 5.d reads it.
axis = "International migrant stock: Both sexes (% total population)"
region = "Total, all countries or areas"
df = dataset.iloc[get_df(dataset, region, axis)]
plt.plot(df['Year'], df['Value'])
plt.title("International migrant stock: Both sexes in %", fontsize=12)
plt.xlabel("Time")
# The plotted series is a percentage, so the axis is labelled as one
# (it previously read "Number of people").
plt.ylabel("% of population")
plt.show()
# 5.d. Check for missing values and outliers
# Check for missing values
# `df` is the subset selected at the end of step 5.c.
value = df["Value"].isnull().values.any()
# `value` is a boolean; comparing it with the string "True" was always False,
# so the missing-value branch could never run. The messages now also name
# what is actually being checked.
if value:
    print("There are missing values")
else:
    print("There are no missing values")

# Check for outliers
print("Check for outliers")
sns.boxplot(x=df["Value"])
# Show the boxplot on its own figure; without this it is never displayed in a
# script and the next plt.plot() call would draw onto the same axes.
plt.show()
# 5.e. Determine trends, patterns, and relationships among the data
# Trend of the both-sexes migrant stock (absolute numbers) for Africa.
region = "Africa"
axis = "International migrant stock: Both sexes (number)"
africa_rows = get_df(dataset, region, axis)
df = dataset.iloc[africa_rows]

plt.plot(df['Year'], df['Value'])
plt.legend([axis])
plt.title("International migrant stock: Both sexes in number", fontsize = 12)
plt.xlabel("Time")
plt.ylabel("Number of people")
plt.show()

# Rows of the asylum-seeker series; `main_df` is read again by Qn1, Qn2 and Qn5.
is_asylum_series = dataset['Series'] == "Asylum seekers, including pending cases (number)"
indexes = dataset.index[is_asylum_series]
main_df = dataset.iloc[indexes]
# 5.f. Answer the questions in 3) above (as well as any additional questions).
# Qn1. Comparison between continents in asylum seekers
continents = ["Africa", "Northern America", "Asia", "Europe"]
for continent in continents:
    # Labels of this continent's asylum-seeker rows, used to pull them from dataset.
    continent_labels = main_df.index[main_df['Region/Country/Area Name'] == continent]
    df = dataset.iloc[continent_labels]
    years = list(df['Year'])
    values = list(df['Value'])
    plt.plot(years, values)

# One legend entry per plotted line, in plotting order.
plt.legend(continents)
plt.title("Asylum Seekers per Continent")
plt.xlabel("Year")
plt.ylabel("Asylum seekers, including pending cases (number)")
plt.show()
# Qn2. What is the difference between Asia and Africa over time
# NOTE(review): the question speaks of immigrants, but the rows plotted come
# from main_df (the asylum-seeker series) - confirm which one is intended.
regions = ["Africa", "Asia"]
for region in regions:
    region_labels = main_df.index[main_df['Region/Country/Area Name'] == region]
    df = dataset.iloc[region_labels]
    plt.plot(df['Year'], df['Value'])

plt.legend(regions)
plt.title("Asylum seekers comparison between Africa and Asia")
plt.xlabel("Year")
plt.ylabel("Total asylum seekers in millions")
plt.show()
# Qn3. Take an example country and compare male and female migrants over time
country = "Congo"

# Split the dataset into the male and the female percentage series.
is_male_series = dataset['Series'] == "International migrant stock: Male (% total Population)"
is_female_series = dataset['Series'] == "International migrant stock: Female (% total Population)"
male_migrants_indexes = dataset.index[is_male_series]
female_migrants_indexes = dataset.index[is_female_series]
male_df = dataset.iloc[male_migrants_indexes]
female_df = dataset.iloc[female_migrants_indexes]

# Narrow each series to the chosen country.
male_country_labels = male_df.index[male_df['Region/Country/Area Name'] == country]
female_country_labels = female_df.index[female_df['Region/Country/Area Name'] == country]
df1 = dataset.iloc[male_country_labels]
df2 = dataset.iloc[female_country_labels]

# Male line first, then female, matching the legend order below.
plt.plot(df1['Year'], df1['Value'])
plt.plot(df2['Year'], df2['Value'])

# plot structure
plt.legend(["Male", "Female"])
plt.title("Male and female migrants comparison in Congo")
plt.xlabel("Year")
plt.ylabel("% total Population")
plt.show()
# Qn4. Compare migrants from Africa vs those from Sub-Saharan Africa
# Rows of the both-sexes migrant stock series (absolute numbers).
migr_indexes = dataset.index[dataset['Series']=="International migrant stock: Both sexes (number)"]
migr_df = dataset.iloc[migr_indexes]

migrants_regions = ["Africa", "Sub-Saharan Africa"]
for migrant_region in migrants_regions:
    region_labels = migr_df.index[migr_df['Region/Country/Area Name'] == migrant_region]
    df = dataset.iloc[region_labels]
    plt.plot(df['Year'], df['Value'])

plt.legend(migrants_regions)
# Label the axis with the series actually plotted; the earlier wording
# ("including pending cases") was carried over from the asylum-seeker plots.
plt.ylabel("International migrant stock: Both sexes (number)")
plt.xlabel("Year")
plt.title("Migrants comparison between Africa and Sub-Saharan Africa")
plt.show()
def corr_regions(dataset, regs, series="International migrant stock: Both sexes (number)"):
    """Draw a heatmap of the correlations between regions' values for one series.

    For each region in ``regs`` the 'Value' entries of ``series`` are taken
    in dataset row order and placed in a column named ``"<region> Value"``.
    The pairwise correlation of those columns is shown as an annotated
    heatmap.

    Args:
        dataset: DataFrame with 'Region/Country/Area Name', 'Series' and
            'Value' columns.
        regs: Region names to compare; each becomes one heatmap row/column.
        series: Value of the 'Series' column to analyse. The default is the
            series this function previously read through the module-level
            ``migr_df``.

    Raises:
        ValueError: If the regions do not all have the same number of rows
            (columns are aligned by row order, not by year).
    """
    # Filter the dataset that was passed in, instead of depending on a
    # module-level frame and using its labels as iloc positions.
    series_rows = dataset[dataset['Series'] == series]

    # Build a fresh frame rather than adding columns to a slice of `dataset`,
    # which avoids pandas' SettingWithCopy warning.
    columns = {}
    for reg in regs:
        region_rows = series_rows[series_rows['Region/Country/Area Name'] == reg]
        columns[str(reg) + ' Value'] = list(region_rows['Value'])
    df = pd.DataFrame(columns)

    # NOTE(review): corr() needs numeric data - confirm 'Value' is parsed as
    # numbers by the loader.
    corr = df.corr()
    sns.heatmap(corr, annot=True)
    plt.show()
# Qn5. What is the change in the number of asylum seekers from war-ridden countries
# NOTE(review): a name that does not appear verbatim in the
# 'Region/Country/Area Name' column yields an empty line - verify the spellings.
war_ridden_countries = ["Iraq", "Syria", "Afghanistan", "Yemen", "Sudan", "Pakistan", "Somalia"]
for war_ridden_country in war_ridden_countries:
    # main_df holds the asylum-seeker rows; pick this country's rows from it.
    country_labels = main_df.index[main_df['Region/Country/Area Name'] == war_ridden_country]
    df = dataset.iloc[country_labels]
    plt.plot(df['Year'], df['Value'])

plt.legend(war_ridden_countries)
plt.title("Asylum seekers comparison between war ridden countries")
plt.xlabel("Year")
plt.ylabel("Asylum seekers, including pending cases (number)")
plt.show()
# Qn6. Correlation between different African regions and Africa in general
african_regions = [
    "Africa",
    "Sub-Saharan Africa",
    "Southern Africa",
    "Middle Africa",
    "Eastern Africa",
    "Northern Africa",
]
corr_regions(dataset, african_regions)