Analyzing Immigration to Canada from 1980 to 2013

import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import warnings warnings.simplefilter(action='ignore', category=FutureWarning)

# Read the 'Canada by Citizenship' sheet, skipping the first 20 rows and the
# last 2 rows of the sheet.
can_df = pd.read_excel(
    '../data/canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)
can_df.head()

can_df.info()

# Discard the columns the analysis does not use and give the remaining
# descriptive columns shorter names.
can_df.drop(columns=['AREA', 'REG', 'DEV', 'Type', 'Coverage'], inplace=True)
can_df.rename(
    columns={'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'},
    inplace=True,
)
can_df.head(2)

# Row-wise sum of everything from the fifth column onward
# (presumably the per-year counts -- confirm against the sheet layout).
can_df['Total'] = can_df.iloc[:, 4:].sum(axis=1)

can_df.set_index('Country', inplace=True)

# String column labels facilitate plotting; `years` holds the labels
# '1980' through '2013'.
can_df.columns = [str(label) for label in can_df.columns]
years = [str(year) for year in range(1980, 2014)]

# Shorten the long UK label. Assigning a plain list (rather than renaming)
# also leaves the index without a name.
can_df.index = [
    'UK' if name == 'United Kingdom of Great Britain and Northern Ireland' else name
    for name in can_df.index
]

can_df.describe()

def make_basic_continent_stat(continent):
    """Print a header and return summary statistics of 'Total' for a continent.

    Every row of the module-level ``can_df`` whose ``'Continent'`` value equals
    *continent* is included (not only the top countries).

    Args:
        continent: Value to match against the ``'Continent'`` column.

    Returns:
        The result of ``describe()`` on the matching rows' ``'Total'`` column.
    """
    print(f'{continent} Total Immigration Basic Statistics')
    in_continent = can_df['Continent'] == continent
    return can_df.loc[in_continent, 'Total'].describe()

def _top_countries(continent, k):
    """Return the ``k`` rows of ``can_df`` for *continent* with the largest 'Total'."""
    in_continent = can_df[can_df['Continent'] == continent]
    return in_continent.sort_values('Total', ascending=False).head(k)


def _label_axes(title, xlabel, ylabel):
    """Apply the shared title, axis-label and tick sizes to the current subplot."""
    plt.title(title, size=20)
    plt.xlabel(xlabel, size=18, color='black')
    plt.ylabel(ylabel, size=18, color='black')
    plt.xticks(size=14)
    plt.yticks(size=14)


def make_basic_stat(continent, k=3):
    """Describe the yearly immigration numbers of a continent's top ``k`` countries.

    Args:
        continent: Value to match against the ``'Continent'`` column of ``can_df``.
        k: How many countries (ranked by ``'Total'``, descending) to keep.

    Returns:
        ``describe()`` of a frame with one row per entry of ``years`` and one
        column per selected country.
    """
    return _top_countries(continent, k).loc[:, years].T.describe()


def make_bar_pie_df(continent, k=3):
    """Return the 'Total' of a continent's top ``k`` countries, indexed by country.

    The one-column frame is suitable for the bar chart and pie chart in
    ``make_vis``.

    Args:
        continent: Value to match against the ``'Continent'`` column of ``can_df``.
        k: How many countries (ranked by ``'Total'``, descending) to keep.

    Returns:
        A DataFrame with the single column ``'Total'``, largest first.
    """
    return _top_countries(continent, k)[['Total']]


def make_line_box_df(continent, k=3):
    """Return long-format yearly numbers for a continent's top ``k`` countries.

    The frame is suitable for the box plot and line plot in ``make_vis``.

    Args:
        continent: Value to match against the ``'Continent'`` column of ``can_df``.
        k: How many countries (ranked by ``'Total'``, descending) to keep.

    Returns:
        A DataFrame with columns ``'Country'`` and ``'Number of Immigrants'``
        whose index holds the year as an ``int``.
    """
    wide = _top_countries(continent, k).loc[:, years].T
    # Name the year axis explicitly so reset_index() yields a 'Year' column
    # regardless of how the original axes were named.
    long_df = (
        wide.rename_axis('Year')
        .reset_index()
        .melt(id_vars='Year', var_name='Country', value_name='Number of Immigrants')
        .set_index('Year')
    )
    long_df.index = [int(val) for val in long_df.index]
    return long_df


def make_vis(df1, df2, continent, k=None):
    """Draw a 2x2 figure: bar, pie, box and line plots for a continent.

    Args:
        df1: Frame as produced by ``make_bar_pie_df`` (index = country,
            column ``'Total'``); drives the bar and pie plots.
        df2: Frame as produced by ``make_line_box_df`` (index = year, columns
            ``'Country'`` and ``'Number of Immigrants'``); drives the box and
            line plots.
        continent: Name shown in the figure title.
        k: Number of countries shown in the figure title. When omitted, the
            number of rows in ``df1`` is used.
    """
    if k is None:
        k = len(df1)

    chart_order = df1.index
    plt.figure(figsize=(26, 18), facecolor='#C4A484')
    plt.suptitle(
        f"Visualization of Immigrants Numbers from Top {k} Countries in {continent} (1980-2013)",
        size=22,
    )

    # Top-left: totals per country.
    plt.subplot(2, 2, 1)
    sns.barplot(x=df1.index, y='Total', data=df1, order=chart_order)
    _label_axes('Bar Plot', 'Country', 'Number of Immigrants')

    # Top-right: share of each country within the selection.
    plt.subplot(2, 2, 2)
    plt.pie(
        x=df1['Total'],
        labels=df1.index,
        radius=1.1,
        startangle=90,
        counterclock=False,
        autopct='%2.1F%%',
        textprops={'fontsize': 18},
    )
    plt.title('Pie Plot', size=20)

    # Bottom-left: distribution of yearly numbers per country.
    plt.subplot(2, 2, 3)
    sns.boxplot(x='Number of Immigrants', y='Country', data=df2)
    _label_axes('Box Plot', 'Number of Immigrants', 'Country')

    # Bottom-right: yearly numbers over time, one line per country.
    plt.subplot(2, 2, 4)
    ax = sns.lineplot(data=df2, x=df2.index, y='Number of Immigrants', hue='Country')
    _label_axes('Line Plot', 'Year', 'Number of Immigrants')
    plt.setp(ax.get_legend().get_texts(), fontsize='18')
    plt.setp(ax.get_legend().get_title(), fontsize='22')

# ---- Africa: three largest source countries ----
make_basic_continent_stat(continent='Africa')

make_basic_stat(continent='Africa', k=3)

africa_bar_pie_df = make_bar_pie_df(continent='Africa', k=3)
africa_line_box_df = make_line_box_df(continent='Africa', k=3)
make_vis(df1=africa_bar_pie_df, df2=africa_line_box_df, continent='Africa', k=3)

# ---- Latin America and the Caribbean: three largest source countries ----
make_basic_continent_stat(continent='Latin America and the Caribbean')

make_basic_stat(continent='Latin America and the Caribbean', k=3)

latin_america_bar_pie_df = make_bar_pie_df(
    continent='Latin America and the Caribbean', k=3
)
latin_america_line_box_df = make_line_box_df(
    continent='Latin America and the Caribbean', k=3
)
make_vis(
    df1=latin_america_bar_pie_df,
    df2=latin_america_line_box_df,
    continent='Latin America and the Caribbean',
    k=3,
)

# ---- Europe: two largest source countries ----
make_basic_continent_stat(continent='Europe')

make_basic_stat(continent='Europe', k=2)

europe_bar_pie_df = make_bar_pie_df(continent='Europe', k=2)
europe_line_box_df = make_line_box_df(continent='Europe', k=2)
make_vis(df1=europe_bar_pie_df, df2=europe_line_box_df, continent='Europe', k=2)

# Yearly immigration from Croatia.
can_df.loc['Croatia', years].plot(figsize=(14, 4), title='Immigration from Croatia')
plt.xlabel('Years')
plt.ylabel('Number of Immigrants')
plt.show()

# Show an external chart of Croatia's GDP for visual comparison.
from IPython import display
display.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/GDP_of_Croatia_at_constant_prices.png/440px-GDP_of_Croatia_at_constant_prices.png")

# One column per country, one row per year.
balkan_df = can_df.loc[['Bosnia and Herzegovina', 'Croatia', 'Serbia'], years].T
# Build the period in the title from `years` so it always matches the data
# actually plotted (the series starts at years[0], i.e. 1980, not 1981).
balkan_df.plot(
    figsize=(14, 6),
    title=(
        'Immigration from Bosnia and Herzegovina, Croatia and Serbia to Canada '
        f'({years[0]}-{years[-1]})'
    ),
)
plt.show()

# Inspect which continent labels exist and which countries are labelled
# 'Northern America'.
can_df.Continent.value_counts().index

can_df[can_df.Continent == 'Northern America'].index

# Remove the row for Canada itself.
can_df.drop('Canada', axis=0, inplace=True)

# Group Mexico with Northern America for this analysis.
can_df.loc['Mexico', 'Continent'] = 'Northern America'

can_df[can_df['Continent'] == 'Northern America']

# The helper functions take the number of countries `k` as an argument; the
# original calls omitted it. Use every country currently labelled
# 'Northern America' so the figure title reports the real count.
north_america_k = int((can_df['Continent'] == 'Northern America').sum())

make_basic_stat('Northern America', north_america_k)

make_vis(
    make_bar_pie_df('Northern America', north_america_k),
    make_line_box_df('Northern America', north_america_k),
    'Northern America',
    north_america_k,
)

# Load the USA sheet and clean it with the same steps applied to can_df.
usa_df = pd.read_excel(
    '../data/United States of America.xlsx',
    sheet_name='USA by Place of birth',
    skiprows=range(20),
    skipfooter=2,
)
usa_df.drop(['AREA', 'REG', 'DEV', 'Type', 'Coverage'], axis=1, inplace=True)
usa_df.rename(
    columns={'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'},
    inplace=True,
)
# Sum usa_df's own columns from the fifth onward, mirroring how can_df's
# 'Total' was built. (The original summed can_df here, whose index does not
# align with usa_df at this point.)
# NOTE(review): assumes the USA sheet has the same column layout as the
# Canada sheet -- confirm.
usa_df['Total'] = usa_df.iloc[:, 4:].sum(axis=1)
usa_df.set_index('Country', inplace=True)
# Stringify usa_df's own labels rather than copying can_df's.
usa_df.columns = list(map(str, usa_df.columns))
usa_df.head(3)

# Compare the two directions of migration on one set of axes.
can_df.loc['United States of America', years].plot(label='Immigration From USA to Canada')
usa_df.loc['Canada', years].plot(label='Immigration from Canada to USA')
plt.title('Immigration Comparison btw. USA to Canada and Canada to USA')
plt.legend()
plt.show()

# ---- Asia: three largest source countries ----
make_basic_continent_stat(continent='Asia')

make_basic_stat(continent='Asia', k=3)

asia_bar_pie_df = make_bar_pie_df(continent='Asia', k=3)
asia_line_box_df = make_line_box_df(continent='Asia', k=3)
make_vis(df1=asia_bar_pie_df, df2=asia_line_box_df, continent='Asia', k=3)

import datetime as dt import itertools import statsmodels.graphics.tsaplots as sgt import statsmodels.tsa.stattools as sts from statsmodels.tsa.ar_model import AR from statsmodels.tsa.arima_model import ARIMA from pmdarima.arima.utils import ndiffs from statsmodels.tsa.seasonal import seasonal_decompose from sklearn.metrics import mean_squared_error

# Label of the row with the largest 'Total'.
can_df['Total'].sort_values(ascending=False).head(1).index

# Yearly values for India as a one-column frame.
ts = can_df.loc['India', years].to_frame()
ts.head()

# Index each observation by 31 December of its year.
# ts.index = [dt.datetime.strptime(val,"%Y") for val in ts.index]
ts.index = [dt.datetime(int(year_label), 12, 31) for year_label in ts.index]
# ts.index = ts.index.to_period('Y')
ts.rename(columns={'India': 'Number of Immigrants'}, inplace=True)
ts.head()

ts['Number of Immigrants'].plot(figsize=(14, 6), title='Immigration from India')
plt.xlabel('Years')
plt.ylabel('Number of Immigrants')
plt.show()

# it can be seen from the previous plot that the mean is not constant. So, this time series is not
# stationary
(
    t_stat,
    p_value,
    lags,
    observation_num,
    t_critical_vals,
    ic,
) = sts.adfuller(ts['Number of Immigrants'])
print('P_value: ', p_value)
print(
    'There is a significant evidence that this time series is stationary'
    if p_value <= 0.05
    else 'There is no significant evidence that this time series is stationary'
)

# Decompose the series and show the component plots.
plt.rcParams["figure.figsize"] = (14, 6)
seasonal_decompose(ts['Number of Immigrants']).plot()
plt.show()
plt.close()

# Autocorrelation (left) and partial autocorrelation (right) of the raw series.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
# ax1.plot(ts['Number of Immigrants'])
sgt.plot_acf(ts['Number of Immigrants'], zero=False, ax=ax1)
sgt.plot_pacf(ts['Number of Immigrants'], zero=False, ax=ax2)
plt.show()
plt.close()

# Differencing order estimated with the ADF test.
diff_num = ndiffs(ts['Number of Immigrants'], test='adf')
diff_num

# it can be seen that d = 1

# First difference of the series; the first row is dropped because it has no
# predecessor to difference against.
ts_diff = ts.diff(1)[1:]
ts_diff.rename(columns={'Number of Immigrants': ' Difference in NUmber of Immigrants'}, inplace=True)
ts_diff.head()

# Repeat the ADF test on the differenced series.
t_stat, p_value, lags, observation_num, t_critical_vals, ic = sts.adfuller(ts_diff)
print('P_value: ', p_value)
if p_value <= 0.05:
    print('There is a significant evidence that this time series is stationary')
else:
    print('There is no significant evidence that this time series is stationary')

# Differenced series (left) and its partial autocorrelation (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
ts_diff.plot(ax=ax1, title='ts-diff', legend=False)
sgt.plot_pacf(ts_diff, zero=False, lags=15, ax=ax2)
plt.show()
plt.close()

# from the partial autocorrelation function, it can be seen that p = 2

# Differenced series (left) and its autocorrelation (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
ts_diff.plot(ax=ax1, title='ts-diff', legend=False)
sgt.plot_acf(ts_diff, zero=False, lags=15, ax=ax2)
plt.show()
plt.close()

# it can be seen from the autocorrelation plot that q = 2

# Train on the first 30 observations and hold out the rest. `ts` is built from
# `years` (1980-2013, 34 points), so the hold-out has 4 observations.
x_train = ts['Number of Immigrants'].values[0:30]  # 30 data points
x_test = ts['Number of Immigrants'].values[30:]  # remaining 4 data points

# ARIMA with the (p, d, q) read off the plots above.
model = ARIMA(x_train, order=(2, 1, 2))
model_fit = model.fit(disp=0)
print(model_fit.summary())

residuals = pd.DataFrame(model_fit.resid)

# Residuals over time (left) and their histogram (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
ax1.plot(residuals)
ax1.set_title('Residuals')
ax2.hist(residuals, bins=10)
ax2.set_title('Residuals Distribution')
plt.show()

model_fit.plot_predict();

# Forecast exactly as many steps as there are held-out observations so the
# forecast and 'Actual' columns below always have matching lengths.
steps = len(x_test)
fc, se, conf = model_fit.forecast(steps)

conf

conf[:, 0]

result = pd.DataFrame({
    'Forecaste': fc,
    'Lower Forecast Limit': conf[:, 0],
    'Actual': x_test,
    'Upper Forecast Limit': conf[:, 1],
})
# Label rows with the held-out dates, as is done for the second model's table.
result.index = ts.index[30:]
result

result.plot(color=['black', 'red', 'blue', 'red'])
plt.show()

# Candidate orders: p and q each range over 0..2 while d is pinned to 1.
# The original fitted every d in 0..2 but recorded each AIC under a key with
# d forced to 1, so entries overwrote each other and the stored AIC did not
# belong to the stored order. Here the fitted order and the recorded key are
# the same tuple.
p = q = range(0, 3)
d = 1
pdq = [(ar_order, d, ma_order) for ar_order, ma_order in itertools.product(p, q)]

param_aic = {}
for param in pdq:
    try:
        model_arima = ARIMA(x_train, order=param)
        model_arima_fit = model_arima.fit()
    except Exception:
        # Best-effort search: skip orders that cannot be fitted, but let
        # KeyboardInterrupt/SystemExit through (a bare `except:` would not).
        continue
    param_aic[param] = model_arima_fit.aic

# Order with the lowest AIC among the successful fits.
best_param = min(param_aic, key=param_aic.get)
best_param, param_aic[best_param]

model1 = ARIMA(x_train, order=best_param)
model_fit1 = model1.fit()
print(model_fit1.summary())

model_fit1.plot_predict();

# Forecast one step per held-out observation.
steps = len(x_test)
fc1, se1, conf1 = model_fit1.forecast(steps)

result1 = pd.DataFrame({
    'Forecaste': fc1,
    'Lower Forecast Limit': conf1[:, 0],
    'Actual': x_test,
    'Upper Forecast Limit': conf1[:, 1],
})
result1.index = ts['Number of Immigrants'][30:].index
result1

result1.plot(color=['black', 'red', 'blue', 'red'])
plt.show()