import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Load the 'Canada by Citizenship' sheet. The first 20 rows and the last 2 rows
# of the sheet are skipped (presumably non-data header and footer rows).
can_df = pd.read_excel(
    '../data/canada.xlsx',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)
can_df.head()
can_df.info()

# Remove the columns that are not used below and give the remaining
# descriptive columns shorter names.
can_df.drop(columns=['AREA', 'REG', 'DEV', 'Type', 'Coverage'], inplace=True)
can_df.rename(
    columns={'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'},
    inplace=True,
)
can_df.head(2)

# Row-wise sum of every column from position 4 onwards
# (presumably the yearly counts -- TODO confirm against the sheet layout).
can_df['Total'] = can_df.iloc[:, 4:].sum(axis=1)
can_df.set_index('Country', inplace=True)

# Column labels become strings so they can be selected with the entries of `years`.
can_df.columns = [str(label) for label in can_df.columns]
# to facilitate plotting
years = [str(year) for year in range(1980, 2014)]

# Shorten the United Kingdom's long official name to 'UK' in the row labels.
new_index = [
    'UK' if country == 'United Kingdom of Great Britain and Northern Ireland' else country
    for country in can_df.index
]
can_df.index = new_index
can_df.describe()
def make_basic_continent_stat(continent):
    """Print a heading and return summary statistics of ``Total`` for a continent.

    The rows of the module-level ``can_df`` whose ``Continent`` equals
    *continent* are selected, and ``describe()`` of their ``Total`` column is
    returned.
    """
    print(f'{continent} Total Immigration Basic Sttaistics')
    in_continent = can_df['Continent'] == continent
    continent_rows = can_df[in_continent]
    return continent_rows['Total'].describe()
def make_basic_stat(continent, k=3):
    """Return descriptive statistics of yearly immigration for a continent's top countries.

    Parameters
    ----------
    continent : value compared against the ``Continent`` column of ``can_df``.
    k : number of countries to keep, ranked by ``Total`` in descending order.
        Defaults to 3 so the function can be called with only a continent.

    Returns
    -------
    The ``describe()`` table of a frame that has one column per selected
    country and one row per entry of the module-level ``years`` list.
    """
    top_countries = (
        can_df[can_df['Continent'] == continent]
        .sort_values('Total', ascending=False)
        .head(k)
    )
    # Transpose so each country becomes a column; describe() then summarises
    # every country over the year rows.
    return top_countries.loc[:, years].T.describe()
def make_bar_pie_df(continent, k=3):
    """Return the ``Total`` column for the top *k* countries of a continent.

    Parameters
    ----------
    continent : value compared against the ``Continent`` column of ``can_df``.
    k : number of countries to keep, ranked by ``Total`` in descending order.
        Defaults to 3 so the function can be called with only a continent.

    Returns
    -------
    A one-column (``Total``) DataFrame indexed by the row labels of ``can_df``,
    sorted from largest to smallest total. This shape is suitable for bar and
    pie charts.
    """
    continent_totals = can_df[can_df['Continent'] == continent]['Total'].to_frame()
    return continent_totals.sort_values('Total', ascending=False).head(k)
def make_line_box_df(continent, k=3):
    """Return yearly immigration of a continent's top *k* countries in long format.

    Parameters
    ----------
    continent : value compared against the ``Continent`` column of ``can_df``.
    k : number of countries to keep, ranked by ``Total`` in descending order.
        Defaults to 3 so the function can be called with only a continent.

    Returns
    -------
    A DataFrame with the columns ``Country`` and ``Number of Immigrants`` and
    an integer year index (one row per country and year). This shape is
    suitable for box and line plots.
    """
    # Wide table: one row per entry of `years`, one column per selected country.
    top_by_year = (
        can_df[can_df['Continent'] == continent]
        .sort_values('Total', ascending=False)
        .head(k)
        .loc[:, years]
        .T
    )
    top_by_year.columns.name = ''
    # Move the year labels into a regular column so melt() can keep them as the id.
    top_by_year.reset_index(inplace=True)
    top_by_year.rename(columns={'index': 'Year'}, inplace=True)
    long_df = top_by_year.melt(
        id_vars='Year',
        var_name='Country',
        value_name='Number of Immigrants',
    ).set_index('Year')
    # The year labels are strings in `years`; convert them to integers.
    long_df.index = [int(val) for val in long_df.index]
    return long_df
def make_vis(df1, df2, continent, k=3):
    """Draw a 2x2 figure (bar, pie, box and line plot) for a continent's top countries.

    Parameters
    ----------
    df1 : DataFrame with a ``Total`` column; its index supplies the bar and
        pie labels.
    df2 : DataFrame with ``Number of Immigrants`` and ``Country`` columns; its
        index is used as the x axis of the line plot.
    continent : continent name, used only in the figure title.
    k : number of countries, used only in the figure title. Defaults to 3 so
        the function can be called without it.

    The function draws on the current matplotlib figure and returns ``None``.
    """
    # Keep the bars in the same order as the rows of df1.
    chart_order = df1.index
    plt.figure(figsize=(26, 18), facecolor='#C4A484')
    plt.suptitle(
        f"Visalization of Immigrants Numbers from Top {k} Countries in {continent} (1980-2013)",
        size=22,
    )

    # Top-left: totals per country as bars.
    plt.subplot(2, 2, 1)
    sns.barplot(x=df1.index, y='Total', data=df1, order=chart_order)
    plt.title('Bar Plot', size=20)
    plt.xlabel('Country', size=18, color='black')
    plt.ylabel('Number of Immigrants', size=18, color='black')
    plt.xticks(size=14)
    plt.yticks(size=14)

    # Top-right: share of each country in the combined total.
    plt.subplot(2, 2, 2)
    plt.pie(
        x=df1['Total'],
        labels=df1.index,
        radius=1.1,
        startangle=90,
        counterclock=False,
        autopct='%2.1F%%',
        textprops={'fontsize': 18},
    )
    plt.title('Pie Plot', size=20)

    # Bottom-left: distribution of the yearly values per country.
    plt.subplot(2, 2, 3)
    sns.boxplot(x='Number of Immigrants', y='Country', data=df2)
    plt.title('Box Plot', size=20)
    plt.xlabel('Number of Immigrants', size=18, color='black')
    plt.ylabel('Country', size=18, color='black')
    plt.xticks(size=14)
    plt.yticks(size=14)

    # Bottom-right: yearly values over time, one line per country.
    plt.subplot(2, 2, 4)
    ax = sns.lineplot(data=df2, x=df2.index, y='Number of Immigrants', hue='Country')
    plt.title('Line Plot', size=20)
    plt.xlabel('Year', size=18, color='black')
    plt.ylabel('Number of Immigrants', size=18, color='black')
    plt.xticks(size=14)
    plt.yticks(size=14)
    # Enlarge the legend entries and the legend title of the line plot.
    plt.setp(ax.get_legend().get_texts(), fontsize='18')
    plt.setp(ax.get_legend().get_title(), fontsize='22')
# Africa: summary statistics, then the four-panel figure for the top 3 countries.
make_basic_continent_stat(continent='Africa')
make_basic_stat(continent='Africa', k=3)
africa_bar_pie_df = make_bar_pie_df(continent='Africa', k=3)
africa_line_box_df = make_line_box_df(continent='Africa', k=3)
make_vis(df1=africa_bar_pie_df, df2=africa_line_box_df, continent='Africa', k=3)

# Latin America and the Caribbean: same report for the top 3 countries.
make_basic_continent_stat(continent='Latin America and the Caribbean')
make_basic_stat(continent='Latin America and the Caribbean', k=3)
latin_america_bar_pie_df = make_bar_pie_df(continent='Latin America and the Caribbean', k=3)
latin_america_line_box_df = make_line_box_df(continent='Latin America and the Caribbean', k=3)
make_vis(
    df1=latin_america_bar_pie_df,
    df2=latin_america_line_box_df,
    continent='Latin America and the Caribbean',
    k=3,
)

# Europe: same report, restricted to the top 2 countries.
make_basic_continent_stat(continent='Europe')
make_basic_stat(continent='Europe', k=2)
europe_bar_pie_df = make_bar_pie_df(continent='Europe', k=2)
europe_line_box_df = make_line_box_df(continent='Europe', k=2)
make_vis(df1=europe_bar_pie_df, df2=europe_line_box_df, continent='Europe', k=2)
# Yearly immigration from Croatia on its own.
can_df.loc['Croatia', years].plot(figsize=(14,4), title='Immigration from Croatia')
plt.xlabel('Years')
plt.ylabel('Number of Immigrants')
plt.show()

# Reference image: chart of Croatia's GDP at constant prices (hosted on Wikimedia Commons).
from IPython import display
display.Image("https://upload.wikimedia.org/wikipedia/commons/thumb/f/f3/GDP_of_Croatia_at_constant_prices.png/440px-GDP_of_Croatia_at_constant_prices.png")

# Three countries side by side: transpose so each country is a column and each
# entry of `years` a row. `years` starts at 1980, so the title states 1980-2013.
balkan_df = can_df.loc[['Bosnia and Herzegovina','Croatia','Serbia'],years].T
balkan_df.plot(figsize=(14,6), title='Immigration from Bosnia and Herzegovina, Croatia and Serbia to Canada (1980-2013)')
plt.show()
# Inspect the continent labels and the rows currently labelled 'Northern America'.
can_df.Continent.value_counts().index
can_df[can_df.Continent == 'Northern America'].index

# Remove the 'Canada' row and relabel Mexico as 'Northern America'.
can_df.drop('Canada', axis=0, inplace=True)
can_df.loc['Mexico', 'Continent'] = 'Northern America'
can_df[can_df['Continent'] == 'Northern America']

# The helpers take the number of countries to show (k). Cap it at the number of
# rows in the group so the "Top k" figure title matches what is plotted.
northern_america_k = min(3, int((can_df['Continent'] == 'Northern America').sum()))
make_basic_stat('Northern America', northern_america_k)
make_vis(
    make_bar_pie_df('Northern America', northern_america_k),
    make_line_box_df('Northern America', northern_america_k),
    'Northern America',
    northern_america_k,
)
# Load the 'USA by Place of birth' sheet with the same row skipping and the
# same column clean-up that is applied to the Canada sheet.
usa_df = pd.read_excel(
    '../data/United States of America.xlsx',
    sheet_name='USA by Place of birth',
    skiprows=range(20),
    skipfooter=2,
)
usa_df.drop(['AREA', 'REG', 'DEV', 'Type', 'Coverage'], axis=1, inplace=True)
usa_df.rename(
    columns={'OdName': 'Country', 'AreaName': 'Continent', 'RegName': 'Region'},
    inplace=True,
)

# The total must be computed from usa_df's own columns: a Series taken from
# can_df is indexed by country name and would not align with usa_df's rows.
usa_df['Total'] = usa_df.iloc[:, 4:].sum(axis=1)
usa_df.set_index('Country', inplace=True)
# Stringify usa_df's own column labels so they can be selected with `years`.
usa_df.columns = list(map(str, usa_df.columns))
usa_df.head(3)

# Overlay both directions of migration between the two countries on one axis.
can_df.loc['United States of America', years].plot(label='Immigration From USA to Canada')
usa_df.loc['Canada', years].plot(label = 'Immigration from Canda to USA')
plt.title('Immigration Comparison btw. USA to Canada and Canada to USA')
plt.legend()
plt.show()
# Asia: summary statistics, then the four-panel figure for the top 3 countries.
make_basic_continent_stat(continent='Asia')
make_basic_stat(continent='Asia', k=3)
asia_bar_pie_df = make_bar_pie_df(continent='Asia', k=3)
asia_line_box_df = make_line_box_df(continent='Asia', k=3)
make_vis(df1=asia_bar_pie_df, df2=asia_line_box_df, continent='Asia', k=3)
import datetime as dt
import itertools
import statsmodels.graphics.tsaplots as sgt
import statsmodels.tsa.stattools as sts
from statsmodels.tsa.ar_model import AR
from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima.utils import ndiffs
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.metrics import mean_squared_error
# Row label of the country with the largest 'Total'.
can_df['Total'].sort_values(ascending=False).head(1).index

# Build a one-column frame holding India's yearly values.
ts = can_df.loc['India', years].to_frame()
ts.head()

# Replace the year strings in the index with year-end timestamps (31 December).
ts.index = [dt.datetime(year=int(label), month=12, day=31) for label in ts.index]
ts = ts.rename(columns={'India': 'Number of Immigrants'})
ts.head()

ts['Number of Immigrants'].plot(figsize=(14, 6), title='Immigration from India')
plt.xlabel('Years')
plt.ylabel('Number of Immigrants')
plt.show()
# The previous plot shows that the mean is not constant, so this time series
# is not stationary. The augmented Dickey-Fuller test below checks that.
adf_output = sts.adfuller(ts['Number of Immigrants'])
t_stat, p_value, lags, observation_num, t_critical_vals, ic = adf_output
print('P_value: ', p_value)
print(
    'There is a significant evidence that this time series is stationary'
    if p_value <= 0.05
    else 'There is no significant evidence that this time series is stationary'
)

# Plot the seasonal decomposition of the series on a 14x6 figure.
plt.rcParams["figure.figsize"] = (14, 6)
decomposition = seasonal_decompose(ts['Number of Immigrants'])
decomposition.plot()
plt.show()
plt.close()
# Autocorrelation (left) and partial autocorrelation (right) of the raw series;
# the lag-0 bar is left out of both plots.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
immigrant_series = ts['Number of Immigrants']
sgt.plot_acf(immigrant_series, zero=False, ax=ax1)
sgt.plot_pacf(immigrant_series, zero=False, ax=ax2)
plt.show()
plt.close()
# Number of differences suggested by pmdarima's ndiffs using the ADF test.
diff_num = ndiffs(ts['Number of Immigrants'], test='adf')
diff_num
# it can be seen that d = 1

# First difference of the series; the first row is dropped because it has no
# predecessor to difference against.
ts_diff = ts.diff(1).iloc[1:]
ts_diff = ts_diff.rename(
    columns={'Number of Immigrants': ' Difference in NUmber of Immigrants'}
)
ts_diff.head()

# Repeat the augmented Dickey-Fuller test on the differenced series.
adf_diff_output = sts.adfuller(ts_diff)
t_stat, p_value, lags, observation_num, t_critical_vals, ic = adf_diff_output
print('P_value: ', p_value)
print(
    'There is a significant evidence that this time series is stationary'
    if p_value <= 0.05
    else 'There is no significant evidence that this time series isstationary'
)
# Differenced series (left) next to its partial autocorrelation function (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
ts_diff.plot(ax=ax1, title='ts-diff', legend=False)
sgt.plot_pacf(ts_diff, zero=False, lags=15, ax=ax2)
plt.show()
plt.close()
# from the partial autocorrelation function, it can be seen that p = 2

# Differenced series (left) next to its autocorrelation function (right).
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
ts_diff.plot(ax=ax1, title='ts-diff', legend=False)
sgt.plot_acf(ts_diff, zero=False, lags=15, ax=ax2)
plt.show()
plt.close()
# it can be seen from the autocorrelation plot that q = 2
# Hold out the tail of the series: the first 30 observations train the model and
# the remaining 4 (the series has one value per entry of `years`, 34 in total)
# are kept for comparison with the forecast.
immigrant_counts = ts['Number of Immigrants'].values
x_train = immigrant_counts[:30]
x_test = immigrant_counts[30:]

# Fit ARIMA with the order (p, d, q) = (2, 1, 2) chosen above; disp=0 is passed
# to fit() (presumably to silence optimizer output -- confirm in statsmodels docs).
model = ARIMA(x_train, order=(2, 1, 2))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# Residuals over time (left) and their histogram (right).
residuals = pd.DataFrame(model_fit.resid)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
ax1.plot(residuals)
ax1.set_title('Residuals')
ax2.hist(residuals, bins=10)
ax2.set_title('Residuals Distribution')
plt.show()

model_fit.plot_predict()
# Forecast as many steps as there are held-out observations in x_test.
steps = 4
fc, se, conf = model_fit.forecast(steps)
fc
conf
conf[:, 0]

# Column 0 of `conf` is used as the lower limit and column 1 as the upper limit.
lower_limit, upper_limit = conf[:, 0], conf[:, 1]
result = pd.DataFrame({
    'Forecaste': fc,
    'Lower Forecast Limit': lower_limit,
    'Actual': x_test,
    'Upper Forecast Limit': upper_limit,
})
result

# One colour per column, in column order.
result.plot(color=['black', 'red', 'blue', 'red'])
plt.show()
# Candidate orders: p and q each range over 0..2, while d is held at 1 (the
# differencing order determined earlier). Each AIC is stored under the exact
# order that produced it, so best_param always refers to the model whose AIC
# is reported next to it.
p = q = range(0, 3)
d = (1,)
pdq = list(itertools.product(p, d, q))
param_aic = {}
for param in pdq:
    try:
        model_arima = ARIMA(x_train, order=param)
        model_arima_fit = model_arima.fit()
    except Exception:
        # Best-effort search: an order that cannot be fitted is skipped.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
        continue
    else:
        param_aic[param] = model_arima_fit.aic

# Fail with a clear message instead of min()'s "empty sequence" error.
if not param_aic:
    raise RuntimeError('None of the candidate ARIMA orders could be fitted to x_train')
best_param = min(param_aic, key=param_aic.get)
best_param, param_aic[best_param]
# Refit using the order selected by the AIC search and summarise the fit.
model1 = ARIMA(x_train, order=best_param)
model_fit1 = model1.fit()
print(model_fit1.summary())
model_fit1.plot_predict()

# Forecast as many steps as there are held-out observations in x_test.
steps = 4
fc1, se1, conf1 = model_fit1.forecast(steps)

# Column 0 of `conf1` is used as the lower limit and column 1 as the upper limit.
lower_limit1, upper_limit1 = conf1[:, 0], conf1[:, 1]
result1 = pd.DataFrame({
    'Forecaste': fc1,
    'Lower Forecast Limit': lower_limit1,
    'Actual': x_test,
    'Upper Forecast Limit': upper_limit1,
})
# Label the rows with the dates of the held-out observations.
result1.index = ts.index[30:]
result1

# One colour per column, in column order.
result1.plot(color=['black', 'red', 'blue', 'red'])
plt.show()