1. Introduction to Data Analysis with Pandas

# Let's import pandas and some other basic packages we will use
from __future__ import division

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# `%matplotlib inline` is IPython-only syntax. Calling the same magic through
# get_ipython() keeps this cell valid Python; outside IPython/Jupyter the name
# does not exist and the magic is simply skipped.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass

# Fix the NumPy seed so the random draws in later cells are reproducible.
np.random.seed(123456)

Run to view results

# Named Series of first names; the index defaults to 0..4.
first_names = pd.Series(
    data=['Taylor', 'John', 'Fela', 'Manu', 'Şebnem'],
    name='First Name',
)
first_names

Run to view results

# Built from a dict: the keys become the index labels of the Series.
last_names = pd.Series(
    {
        0: 'Swift',
        1: 'Lennon',
        2: 'Kuti',
        3: 'Chau',
        4: 'Ferrah',
    },
    name='Last Name',
)
last_names

Run to view results

# Five random integer ages in 33..84 (the upper bound 85 is exclusive).
age = pd.Series(
    np.random.randint(low=33, high=85, size=5),
    name='Age',
)
age

Run to view results

# Combine the three Series into one table; each dict key is a column name.
musicians = pd.DataFrame(
    {
        'First Name': first_names,
        'Last Name': last_names,
        'Age': age,
    }
)
musicians

Run to view results

# Show the dtype of every column in musicians.
musicians.dtypes

Run to view results

# Cast the Age column to an integer dtype.
musicians['Age'] = musicians['Age'].astype(int)

Run to view results

# Check the column dtypes again after the cast.
musicians.dtypes

Run to view results

# Alternative construction: a list of Series yields one row per Series,
# so transpose to get one column per Series.
musicians = pd.DataFrame([first_names, last_names, age]).transpose()
musicians

Run to view results

# Show the column dtypes of the DataFrame built by transposition.
musicians.dtypes

Run to view results

# Cast the Age column to an integer dtype.
musicians['Age'] = musicians['Age'].astype(int)

Run to view results

# Confirm the dtypes after casting Age.
musicians.dtypes

Run to view results

# 5 rows x 3 columns of standard-normal draws.
random_data = pd.DataFrame(
    np.random.normal(size=(5, 3)),
    columns=[f'Random Series {number}' for number in (1, 2, 3)],
)
random_data

Run to view results

# Save the musicians table as CSV and Excel files under ./data/.
pathout = './data/'
# No earlier cell creates this directory, so make sure it exists before
# writing; exist_ok=True makes re-running the cell harmless.
os.makedirs(pathout, exist_ok=True)
musicians.to_csv(os.path.join(pathout, 'musicians.csv'), encoding='utf8', index=False)
musicians.to_excel(os.path.join(pathout, 'musicians.xlsx'), index=False)

Run to view results

# Read the CSV file back into a new DataFrame.
pathout = './data/'
musicians_csv = pd.read_csv(
    pathout + 'musicians.csv',
    encoding='utf8',
)
musicians_csv

Run to view results

# Read the Excel file back into a new DataFrame.
musicians_excel = pd.read_excel(
    pathout + 'musicians.xlsx',
)
musicians_excel

Run to view results

# Add a column from a plain list (one entry per row, in row order).
musicians['Gender'] = [
    'Female',
    'Male',
    'Male',
    'Male',
    'Female',
]
musicians

Run to view results

# New column: element-wise product of the first two random columns.
random_data['New Series'] = random_data['Random Series 1'].mul(
    random_data['Random Series 2']
)
random_data

Run to view results

# Flag musicians who are both under 35 and female.
# `&` is the element-wise logical AND for boolean Series. The original used
# `*`, and its `.astype(int)` bound only to the right-hand comparison; here the
# combined mask is built once and the integer version is cast explicitly.
young_female = (musicians['Age'] < 35) & (musicians['Gender'] == 'Female')
musicians['Young Female'] = young_female
musicians['Young Female (as integer)'] = young_female.astype(int)
musicians

Run to view results

# Row-wise apply: the function receives one row at a time and returns
# New Series minus Random Series 3 for that row.
random_data['Some Transformation'] = random_data.apply(
    lambda row: row['New Series'] - row['Random Series 3'],
    axis='columns',
)
random_data

Run to view results

# Select a single column by name.
musicians['First Name']

Run to view results

# Select several columns by passing a list of column names.
musicians[['First Name', 'Age']]

Run to view results

# Select the first row by integer position.
musicians.iloc[0]

Run to view results

# First names of the musicians younger than 50 (row mask + column label).
musicians.loc[musicians['Age'] < 50, 'First Name']

Run to view results

# Names of the female musicians aged 50 or older.
musicians.loc[
    (musicians['Gender'] == 'Female') & (musicians['Age'] >= 50),
    ['First Name', 'Last Name'],
]

Run to view results

# Descriptive statistics of musicians (pandas' default column selection).
musicians.describe()

Run to view results

# Store the overall mean age in a new column; the scalar is repeated on every row.
overall_mean_age = musicians['Age'].mean()
musicians['Mean Age'] = overall_mean_age
musicians

Run to view results

# Mean of every numeric column within each Gender group.
musicians_gender = (
    musicians
    .groupby(['Gender'])
    .mean(numeric_only=True)
)
musicians_gender

Run to view results

# Group means for the row labelled 'Female' (Gender is the index after groupby).
musicians_gender.loc['Female']

Run to view results

# Move Gender from the index back into an ordinary column.
musicians_gender = musicians_gender.reset_index(drop=False)
musicians_gender

Run to view results

# Scatter plot of Some Transformation against New Series: red markers of size 50
# with a legend label.
random_data.plot.scatter(x='New Series', y='Some Transformation', color='r', s=50, label='Very Important Relation!')

Run to view results

# Import display options for showing websites
from IPython.display import IFrame

# Page holding the ISO 3166 country-code table; `url` is reused by the next cell.
url = 'https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes'
IFrame(url, width=800, height=400)

Run to view results

# read_html returns one DataFrame per HTML table on the page; keep the first.
isocodes = pd.read_html(
    url,
    encoding='utf-8',
)[0]
isocodes

Run to view results

# Inspect the column labels of the scraped table.
isocodes.columns

Run to view results

# Drop the outermost level of the column labels.
isocodes = isocodes.droplevel(level=0, axis='columns')
isocodes.head()

Run to view results

# Keep only the part of each column name before the first '[' (presumably
# footnote markers from the web page -- confirm against the scraped headers).
# str.partition leaves names without a '[' untouched; slicing with
# c[:c.find('[')] would cut off their last character because find returns -1.
mycols = [c.partition('[')[0] for c in isocodes.columns]
mycols

Run to view results

# Replace the column labels with the cleaned names.
isocodes.columns = list(mycols)
isocodes.head()

Run to view results

# Keep the scraped Alpha-2 column under a new name, then rebuild 'Alpha-2 code'
# from the text after the first ':' in 'Subdivision code links'.
isocodes['Alpha-2 code original'] = isocodes['Alpha-2 code']
# The .str accessor propagates missing values instead of raising, which the
# per-element lambda with x.find did not; values without ':' are kept whole,
# exactly as before.
isocodes['Alpha-2 code'] = (
    isocodes['Subdivision code links'].str.split(':', n=1).str[-1]
)
isocodes.head()

Run to view results

# Point `url` at the GDP (PPP) per capita page and embed it in the notebook.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita'
IFrame(
    url,
    width=800,
    height=400,
)

Run to view results

# Take the second table on the page (index 1 of the list read_html returns).
gdppc_wiki = pd.read_html(
    url,
    encoding='utf-8',
)[1]
gdppc_wiki

Run to view results

# Flatten the headers into short names: one value/year pair per data source.
gdppc_wiki.columns = [
    'Country/Territory', 'UN Region',
    'gdppc_IMF', 'year_IMF',
    'gdppc_WB', 'year_WB',
    'gdppc_CIA', 'year_CIA',
]
gdppc_wiki.head()

Run to view results

# Clean country names: remove literal '*' characters and surrounding whitespace.
# regex=False is required here: '*' on its own is not a valid regular
# expression, so regex=True raises re.error on current pandas versions.
gdppc_wiki['country_name'] = (
    gdppc_wiki['Country/Territory']
    .str.replace('*', '', regex=False)
    .str.strip()
)
gdppc_wiki.head()

Run to view results

# Show the dtype of every column before converting them to numbers.
gdppc_wiki.dtypes

Run to view results

# Convert the value/year columns (everything between the first two columns and
# the trailing country_name column) to numeric dtypes.
for c in gdppc_wiki.columns[2:-1]:
    # is_numeric_dtype also catches pandas' dedicated string dtype, which the
    # original `dtype == 'O'` test would skip. errors='coerce' already turns
    # placeholders such as '—' into NaN, so no prior string replace is needed.
    if not pd.api.types.is_numeric_dtype(gdppc_wiki[c]):
        gdppc_wiki[c] = pd.to_numeric(gdppc_wiki[c], errors='coerce')
    # NOTE(review): the flattened source does not show whether this cast was
    # nested under the dtype check; it is applied to every year column here.
    # The nullable 'Int64' dtype keeps missing years as <NA> instead of floats.
    if c.startswith('year'):
        gdppc_wiki[c] = gdppc_wiki[c].astype('Int64')

Run to view results

# Check the dtypes again after the numeric conversion.
gdppc_wiki.dtypes

Run to view results

# First two rows of the ISO code table.
isocodes.head(2)

Run to view results

# First row of the GDP per capita table.
gdppc_wiki.head(1)

Run to view results

# Join the two tables on country name (pandas' default inner join); the key
# column is named differently on each side.
merged = pd.merge(
    isocodes,
    gdppc_wiki,
    left_on='Country name',
    right_on='country_name',
)
merged

Run to view results

# (number of rows, number of columns) of the merged table.
merged.shape

Run to view results

# Collect the distinct country names on each side to see which ones fail to match.
isocodes_names = set(isocodes['Country name'].tolist())
gdppc_wiki_names = set(gdppc_wiki['country_name'].tolist())

Run to view results

# Names present in the ISO table but absent from the GDP table.
isocodes_names.difference(gdppc_wiki_names)

Run to view results

# Names present in the GDP table but absent from the ISO table.
gdppc_wiki_names.difference(isocodes_names)

Run to view results

# Set the size of the figure and get a figure and axis object
fig, ax = plt.subplots(figsize=(10, 6))
# Overlay one kernel-density estimate per data source on the same axes.
merged['gdppc_CIA'].plot.kde(ax=ax, label='CIA')
merged['gdppc_IMF'].plot.kde(ax=ax, label='IMF')
merged['gdppc_WB'].plot.kde(ax=ax, label='WB')
ax.legend()

Run to view results

# Set the size of the figure and get a figure and axis object
fig, ax = plt.subplots(figsize=(10, 6))
# Overlay one histogram per data source; the later ones are drawn with
# decreasing alpha so the earlier ones stay visible underneath.
merged['gdppc_CIA'].plot.hist(ax=ax, label='CIA')
merged['gdppc_IMF'].plot.hist(ax=ax, label='IMF', alpha=0.6)
merged['gdppc_WB'].plot.hist(ax=ax, label='WB', alpha=0.3)
ax.legend()

Run to view results

# Set the size of the figure and get a figure and axis object
fig, ax = plt.subplots(figsize=(10, 6))
# Plot the CIA and IMF figures against the World Bank figure on shared axes.
merged.plot.scatter(x='gdppc_WB', y='gdppc_CIA', ax=ax, label='WB-CIA', c='r')
merged.plot.scatter(x='gdppc_WB', y='gdppc_IMF', ax=ax, label='WB-IMF', c='b')
ax.set_xlabel('World Bank')
ax.set_ylabel('Other Source')
ax.legend(loc='lower right')

Run to view results

# Named Series of five country names with the default 0..4 index.
countries = pd.Series(
    data=['Colombia', 'Turkey', 'United States', 'Germany', 'Chile'],
    name='country',
)
countries

Run to view results

# Report how many entries the Series holds.
n_countries = countries.shape[0]
print('\n', 'There are ', n_countries, 'countries in this series.')

Run to view results

# Length of each country name (len applied element-wise).
countries.apply(len)

Run to view results

# Re-seed so the draw is reproducible, then draw one standard-normal value per country.
np.random.seed(123456)
data = pd.Series(
    np.random.normal(size=countries.shape),
    name='noise',
)
data

Run to view results

# Print the sample mean at full precision, then mean, maximum and standard
# deviation rounded to two decimals.
print('\n', 'The average in this sample is ', data.mean())
print('\n', 'The average in this sample is ', f'{data.mean():.2f}')
print('\n', 'The maximum in this sample is ', f'{data.max():.2f}')
print('\n', 'The standard deviation in this sample is ', f'{data.std():.2f}')

Run to view results

# Apply the exponential function to every element of data.
data.apply(np.exp)

Run to view results

# A list of Series produces one row per Series.
df = pd.DataFrame(data=[countries, data])
df

Run to view results

# Transpose so that each Series becomes a column.
df = df.transpose()
df

Run to view results

# Same table built by placing the two Series side by side.
df = pd.concat(
    [countries, data],
    axis='columns',
)
df

Run to view results

# Same table built from a dict mapping column names to Series.
df = pd.DataFrame(
    {
        'country': countries,
        'noise': data,
    }
)
df

Run to view results

# Derived columns: squared noise, noise plus its square, and country-name length.
df['noise_sq'] = df['noise'] ** 2
df['noise and its square'] = df['noise'] + df['noise_sq']
df['name length'] = df['country'].apply(len)
df

Run to view results

# Country names used by the following cells to flag membership in this list.
south_america = ['Colombia', 'Chile']

Run to view results

# True where the country name appears in the south_america list.
df['South America Logical'] = df['country'].apply(
    lambda name: name in south_america
)
df

Run to view results

# Translate the boolean flag to 1/0 through a lookup dict.
mydict = {
    True: 1,
    False: 0,
}
df['South America Dict'] = df['South America Logical'].map(mydict)
df

Run to view results

# Same flag in one step: membership test, then cast the booleans to integers.
df['South America'] = (
    df['country']
    .apply(lambda name: name in south_america)
    .astype(int)
)
df

Run to view results

import pandas as pd

# Scrape the population table; note that this rebinds the notebook-level `url`.
url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'
tables = pd.read_html(url)

# NOTE(review): the column names 'Country/Territory' and 'Population' are
# assumed to match the first table on the page -- confirm against the live page.
# .copy() makes pop_data an independent frame, so the column assignment below
# cannot trigger SettingWithCopyWarning on the result of dropna.
pop_data = tables[0].dropna(subset=['Country/Territory']).copy()
pop_data['Country/Territory'] = pop_data['Country/Territory'].str.strip()

# Keep only the two columns of interest under shorter names.
pop = pd.DataFrame({
    'Country': pop_data['Country/Territory'],
    'Population': pop_data['Population'],
})
print(pop.head())

Run to view results

import pandas as pd

# isocodes names its country column 'Country name' (it is the key used when
# merging with gdppc_wiki), while pop uses 'Country'. Merging with
# on='Country' raises KeyError because the key is missing on the left side,
# so the key is given separately for each frame.
merged_data = pd.merge(
    isocodes,
    pop,
    left_on='Country name',
    right_on='Country',
    how='inner',
)
print(merged_data.head())

Run to view results

# NOTE(review): `gdp_per_capita` is not defined in any visible cell (the GDP
# table scraped earlier is named `gdppc_wiki`); it is assumed to carry a
# 'Country' column, as the original on='Country' implies -- confirm.
# isocodes has no 'Country' column (its key is 'Country name'), so the first
# merge names the key on each side; the result then carries 'Country' for the
# second merge with pop.
merged_data = pd.merge(
    isocodes,
    gdp_per_capita,
    left_on='Country name',
    right_on='Country',
    how='inner',
)
merged_data = pd.merge(merged_data, pop, on='Country', how='inner')
print(merged_data.head())

Run to view results

import os

# Folder layout: input data, exported tables, saved figures.
path = './data/'
pathout = './output/'
pathgraphs = './graphs/'

# Create each folder if it is missing; exist_ok=True makes re-running harmless.
for folder in (path, pathout, pathgraphs):
    os.makedirs(folder, exist_ok=True)

Run to view results

# Export the merged table as CSV, Excel and Stata files sharing one file stem.
filename = 'Wiki_Data'
merged_data.to_csv(os.path.join(pathout, filename + '.csv'), index=False)
merged_data.to_excel(os.path.join(pathout, filename + '.xlsx'), index=False)
merged_data.to_stata(os.path.join(pathout, filename + '.dta'), write_index=False)

Run to view results

import seaborn as sns
import matplotlib.pyplot as plt


def _save_current_figure(stem):
    """Save the active matplotlib figure as <stem>.png, .pdf and .jpg in pathgraphs."""
    for extension in ('png', 'pdf', 'jpg'):
        plt.savefig(os.path.join(pathgraphs, f'{stem}.{extension}'))


# NOTE(review): no visible cell creates a 'GDP per capita' column in
# merged_data -- confirm the column name before running.

# regplot and residplot draw on the *current* axes, so each one gets a fresh
# figure. Without it the residual plot is drawn on top of the lmplot figure
# and the saved residplot files contain both plots.
plt.figure()
sns.regplot(x='GDP per capita', y='Population', data=merged_data)
_save_current_figure('scatterplot')

# lmplot and jointplot are figure-level functions: each creates its own figure
# and makes it current, so no explicit plt.figure() is needed.
sns.lmplot(x='GDP per capita', y='Population', data=merged_data)
_save_current_figure('lmplot')

plt.figure()
sns.residplot(x='GDP per capita', y='Population', data=merged_data)
_save_current_figure('residplot')

sns.jointplot(x='GDP per capita', y='Population', data=merged_data, kind='reg')
_save_current_figure('jointplot')

Run to view results

def _save_current_figure(stem):
    """Save the active matplotlib figure as <stem>.png, .pdf and .jpg in pathgraphs."""
    for extension in ('png', 'pdf', 'jpg'):
        plt.savefig(os.path.join(pathgraphs, f'{stem}.{extension}'))


# NOTE(review): no visible cell creates the 'GDP per capita' or
# 'Population Growth' columns in merged_data -- confirm before running.

# regplot and residplot draw on the *current* axes, so each one gets a fresh
# figure. Otherwise they are drawn on whatever figure was left active (the
# previous jointplot, or the lmplot created just above the residual plot).
plt.figure()
sns.regplot(x='GDP per capita', y='Population Growth', data=merged_data)
_save_current_figure('scatterplot_growth')

# lmplot and jointplot are figure-level functions: each creates its own figure
# and makes it current.
sns.lmplot(x='GDP per capita', y='Population Growth', data=merged_data)
_save_current_figure('lmplot_growth')

plt.figure()
sns.residplot(x='GDP per capita', y='Population Growth', data=merged_data)
_save_current_figure('residplot_growth')

sns.jointplot(x='GDP per capita', y='Population Growth', data=merged_data, kind='reg')
_save_current_figure('jointplot_growth')

Run to view results