# Let's import pandas and some other basic packages we will use
from __future__ import division
%matplotlib inline
import os
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(123456)
import pandas as pd
# --- Series: one-dimensional labelled arrays -------------------------------
# A Series can be built from a list (index defaults to 0..n-1) ...
first_names = pd.Series(
    ['Taylor', 'John', 'Fela', 'Manu', 'Şebnem'],
    name='First Name',
)
first_names
# ... or from a dict, whose keys become the index labels.
last_names = pd.Series(
    dict(enumerate(['Swift', 'Lennon', 'Kuti', 'Chau', 'Ferrah'])),
    name='Last Name',
)
last_names
# Five random integer ages drawn from [33, 85).
age = pd.Series(np.random.randint(low=33, high=85, size=5), name='Age')
age

# --- DataFrame, variant 1: dict mapping column name -> Series --------------
musicians = pd.DataFrame({s.name: s for s in (first_names, last_names, age)})
musicians
musicians.dtypes
# Cast 'Age' to int and compare the dtypes before and after.
musicians = musicians.astype({'Age': int})
musicians.dtypes

# --- DataFrame, variant 2: list of Series (one per row), then transpose ----
musicians = pd.DataFrame([first_names, last_names, age]).transpose()
musicians
musicians.dtypes
# Cast 'Age' to int again for the frame built this way.
musicians = musicians.astype({'Age': int})
musicians.dtypes

# --- A 5x3 frame of standard-normal draws ----------------------------------
random_data = pd.DataFrame(
    np.random.normal(size=(5, 3)),
    columns=[
        'Random Series 1',
        'Random Series 2',
        'Random Series 3',
    ],
)
random_data
# Round-trip the musicians table through CSV and Excel files.
pathout = './data/'
# to_csv / to_excel do not create missing folders, so make sure it exists.
os.makedirs(pathout, exist_ok=True)
musicians.to_csv(os.path.join(pathout, 'musicians.csv'), encoding='utf8', index=False)
musicians.to_excel(os.path.join(pathout, 'musicians.xlsx'), index=False)
# Read both files back to check that they hold the same table.
musicians_csv = pd.read_csv(os.path.join(pathout, 'musicians.csv'), encoding='utf8')
musicians_csv
musicians_excel = pd.read_excel(os.path.join(pathout, 'musicians.xlsx'))
musicians_excel
# New column from a plain list; its length must match the number of rows.
musicians['Gender'] = ['Female', 'Male', 'Male', 'Male', 'Female']
musicians
# New column as the element-wise product of two existing columns.
random_data['New Series'] = random_data['Random Series 1'] * random_data['Random Series 2']
random_data
# Combine the two conditions with `&`, the element-wise logical AND for
# boolean Series, then derive the integer (0/1) version from the same mask so
# the cast applies to the whole condition rather than to one operand.
young_female = (musicians['Age'] < 35) & (musicians['Gender'] == 'Female')
musicians['Young Female'] = young_female
musicians['Young Female (as integer)'] = young_female.astype(int)
musicians
# Column arithmetic is vectorised; no row-by-row apply is needed.
random_data['Some Transformation'] = random_data['New Series'] - random_data['Random Series 3']
random_data
# --- Selecting data --------------------------------------------------------
# One column label gives a Series; a list of labels gives a DataFrame.
musicians['First Name']
musicians[['First Name', 'Age']]
# First row by position.
musicians.iloc[0]
# First names of everyone younger than 50.
musicians.loc[musicians['Age'] < 50, 'First Name']
# First and last names of women aged 50 or more.
musicians.loc[
    musicians['Age'].ge(50) & musicians['Gender'].eq('Female'),
    ['First Name', 'Last Name'],
]

# --- Summaries -------------------------------------------------------------
musicians.describe()
# A scalar assigned to a column is repeated on every row.
musicians['Mean Age'] = musicians['Age'].mean()
musicians
# Per-gender means of the numeric columns; 'Gender' becomes the index.
by_gender = musicians.groupby(['Gender'])
musicians_gender = by_gender.mean(numeric_only=True)
musicians_gender
musicians_gender.loc['Female']
# Move 'Gender' back from the index into an ordinary column.
musicians_gender = musicians_gender.reset_index()
musicians_gender

# --- Scatter plot of two of the random columns -----------------------------
random_data.plot(
    kind='scatter',
    x='New Series',
    y='Some Transformation',
    color='r',
    s=50,
    label='Very Important Relation!',
)
# Import display options for showing websites
from IPython.display import IFrame
url = 'https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes'
IFrame(url, width=800, height=400)
# read_html parses the tables on the page into a list of DataFrames; keep the first.
isocodes = pd.read_html(url, encoding='utf-8')[0]
isocodes
isocodes.columns
# Drop the first (outer) level of the column index.
isocodes = isocodes.droplevel(0, axis=1)
isocodes.head()
mycols = isocodes.columns
# Strip footnote markers such as '[5]' from the headers. split() leaves a
# header without '[' untouched, whereas slicing with find() would cut off its
# last character (find returns -1 when nothing is found).
mycols = [c.split('[', 1)[0] for c in mycols]
mycols
isocodes.columns = mycols
isocodes.head()
# Keep the scraped Alpha-2 column, then rebuild the code from the text after
# the first ':' in 'Subdivision code links'. The .str accessor passes missing
# values through instead of raising on them.
isocodes['Alpha-2 code original'] = isocodes['Alpha-2 code']
isocodes['Alpha-2 code'] = isocodes['Subdivision code links'].str.split(':', n=1).str[-1]
isocodes.head()
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(PPP)_per_capita'
IFrame(url, width=800, height=400)
# Second table on the page.
gdppc_wiki = pd.read_html(url, encoding='utf-8')[1]
gdppc_wiki
# Replace the scraped header with flat, code-friendly names.
gdppc_wiki.columns = ['Country/Territory', 'UN Region', 'gdppc_IMF', 'year_IMF',
                      'gdppc_WB', 'year_WB', 'gdppc_CIA', 'year_CIA']
gdppc_wiki.head()
# Remove '*' markers and surrounding whitespace from the country names.
# '*' must be matched literally: as a regular expression it is a quantifier
# with nothing to repeat, which is an invalid pattern.
gdppc_wiki['country_name'] = (
    gdppc_wiki['Country/Territory'].str.replace('*', '', regex=False).str.strip()
)
gdppc_wiki.head()
gdppc_wiki.dtypes
# Convert the value/year columns (everything between 'UN Region' and
# 'country_name') to numeric types.
for c in gdppc_wiki.columns[2:-1]:
    if gdppc_wiki[c].dtype == 'O':
        # Text columns: turn the '—' placeholder into NaN, coerce the rest to numbers.
        gdppc_wiki[c] = pd.to_numeric(
            gdppc_wiki[c].str.replace('—', 'nan', regex=False), errors='coerce'
        )
    if c.startswith('year'):
        # Nullable integer dtype keeps missing years as <NA> instead of forcing floats.
        gdppc_wiki[c] = gdppc_wiki[c].astype('Int64')
gdppc_wiki.dtypes
# Peek at both tables to see which columns hold the country names.
isocodes.head(2)
gdppc_wiki.head(1)
# Inner join on the country name: only names present in both tables survive.
merged = pd.merge(
    isocodes,
    gdppc_wiki,
    how='inner',
    left_on='Country name',
    right_on='country_name',
)
merged
merged.shape
# Which names failed to match? Compare the two sets of names in each direction.
isocodes_names = set(isocodes['Country name'])
gdppc_wiki_names = set(gdppc_wiki['country_name'])
isocodes_names - gdppc_wiki_names
gdppc_wiki_names - isocodes_names
# Kernel density estimates of GDP per capita, one curve per source, on one axis.
fig, ax = plt.subplots(figsize=(10, 6))
for source in ('CIA', 'IMF', 'WB'):
    merged[f'gdppc_{source}'].plot.kde(ax=ax, label=source)
ax.legend()
# Overlaid histograms; the later layers are semi-transparent.
fig, ax = plt.subplots(figsize=(10, 6))
for source, extra in (('CIA', {}), ('IMF', {'alpha': 0.6}), ('WB', {'alpha': 0.3})):
    merged[f'gdppc_{source}'].plot.hist(ax=ax, label=source, **extra)
ax.legend()
# World Bank figures on the x-axis against each of the other two sources.
fig, ax = plt.subplots(figsize=(10, 6))
for y_col, tag, colour in (('gdppc_CIA', 'WB-CIA', 'r'), ('gdppc_IMF', 'WB-IMF', 'b')):
    merged.plot.scatter(x='gdppc_WB', y=y_col, ax=ax, label=tag, c=colour)
ax.set_xlabel('World Bank')
ax.set_ylabel('Other Source')
ax.legend(loc='lower right')
# --- A Series of country names ---------------------------------------------
countries = pd.Series(
    ['Colombia', 'Turkey', 'United States', 'Germany', 'Chile'],
    name='country',
)
countries
print('\n', 'There are ', countries.shape[0], 'countries in this series.')
# Length of each name.
countries.map(len)

# --- A Series of reproducible random noise, one draw per country -----------
np.random.seed(123456)
data = pd.Series(np.random.normal(size=len(countries)), name='noise')
data
print('\n', 'The average in this sample is ', data.mean())
# Same statistic, rounded to two decimals, followed by max and std.
print('\n', 'The average in this sample is ', f"{data.mean():.2f}")
print('\n', 'The maximum in this sample is ', f"{data.max():.2f}")
print('\n', 'The standard deviation in this sample is ', f"{data.std():.2f}")
data.apply(np.exp)

# --- Three ways to combine the two Series into a DataFrame -----------------
# 1) A list of Series gives one ROW per Series, so transpose afterwards.
df = pd.DataFrame([countries, data])
df
df = df.transpose()
df
# 2) Concatenate side by side.
df = pd.concat([countries, data], axis=1)
df
# 3) Dict mapping column name -> Series.
df = pd.DataFrame({
    'country': countries,
    'noise': data,
})
df

# --- Derived columns -------------------------------------------------------
df['noise_sq'] = df['noise'] ** 2
df['noise and its square'] = df['noise'] + df['noise_sq']
df['name length'] = df['country'].map(len)
df
# Membership flag, first as booleans ...
south_america = ['Colombia', 'Chile']
df['South America Logical'] = df['country'].isin(south_america)
df
# ... then recoded to 1/0 through a lookup dict ...
mydict = {True: 1, False: 0}
df['South America Dict'] = df['South America Logical'].map(mydict)
df
# ... and finally as integers in a single step.
df['South America'] = df['country'].isin(south_america).astype(int)
df
import pandas as pd
# Scrape the population table from Wikipedia (first table on the page).
# NOTE(review): the column labels used below depend on the live page layout - confirm.
url = 'https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population'
tables = pd.read_html(url)
# Drop rows without a country name. The explicit copy makes pop_data an
# independent frame, so the column assignment below does not write into a
# filtered view (which can raise SettingWithCopyWarning).
pop_data = tables[0].dropna(subset=['Country/Territory']).copy()
pop_data['Country/Territory'] = pop_data['Country/Territory'].str.strip()
# Keep only the two columns needed later, under shorter names.
pop = pd.DataFrame({
    'Country': pop_data['Country/Territory'],
    'Population': pop_data['Population'],
})
print(pop.head())
import pandas as pd
# isocodes stores country names under 'Country name' while pop uses 'Country',
# so the join keys have to be named separately for each side.
merged_data = pd.merge(isocodes, pop, left_on='Country name', right_on='Country', how='inner')
print(merged_data.head())
# NOTE(review): `gdp_per_capita` was not defined anywhere in view. It is derived
# here from the World Bank column of gdppc_wiki - confirm that this is the
# intended source (IMF and CIA figures are also available in gdppc_wiki).
gdp_per_capita = gdppc_wiki[['country_name', 'gdppc_WB']].rename(
    columns={'country_name': 'Country', 'gdppc_WB': 'GDP per capita'}
)
# ISO codes + GDP per capita + population, keeping only countries found in all three.
merged_data = pd.merge(isocodes, gdp_per_capita, left_on='Country name', right_on='Country', how='inner')
merged_data = pd.merge(merged_data, pop, on='Country', how='inner')
print(merged_data.head())
import os
# Folder layout: inputs, exported tables, saved figures.
path = './data/'
pathout = './output/'
pathgraphs = './graphs/'
# Create every folder up front; exist_ok makes this safe to re-run.
for folder in (path, pathout, pathgraphs):
    os.makedirs(folder, exist_ok=True)
# Export the merged table under one base name in CSV, Excel and Stata format.
filename = 'Wiki_Data'
export_base = os.path.join(pathout, filename)
merged_data.to_csv(export_base + '.csv', index=False)
merged_data.to_excel(export_base + '.xlsx', index=False)
merged_data.to_stata(export_base + '.dta', write_index=False)
import seaborn as sns
import matplotlib.pyplot as plt


def _save_current_figure(stem, formats=('png', 'pdf', 'jpg')):
    """Save the current matplotlib figure into `pathgraphs`, once per format.

    stem    -- file name without extension, e.g. 'lmplot_growth'
    formats -- file extensions to write
    """
    fig = plt.gcf()
    for ext in formats:
        fig.savefig(os.path.join(pathgraphs, f'{stem}.{ext}'))


def _plot_against_gdp(y, suffix=''):
    """Draw and save the four regression-style plots of `y` on GDP per capita.

    y      -- name of the merged_data column for the vertical axis
    suffix -- appended to every output file stem (e.g. '_growth')
    """
    x = 'GDP per capita'
    # regplot/residplot are axes-level: they draw on the current axes. Open a
    # fresh figure first so they do not paint over the previous plot when the
    # whole file runs in one go.
    plt.figure()
    sns.regplot(x=x, y=y, data=merged_data)
    _save_current_figure(f'scatterplot{suffix}')
    # lmplot/jointplot are figure-level: each call creates its own figure,
    # which then is the current one.
    sns.lmplot(x=x, y=y, data=merged_data)
    _save_current_figure(f'lmplot{suffix}')
    plt.figure()
    sns.residplot(x=x, y=y, data=merged_data)
    _save_current_figure(f'residplot{suffix}')
    sns.jointplot(x=x, y=y, data=merged_data, kind='reg')
    _save_current_figure(f'jointplot{suffix}')


for y_col, suffix in (('Population', ''), ('Population Growth', '_growth')):
    # NOTE(review): no code in view creates a 'Population Growth' column. Skip
    # with a message instead of failing inside seaborn when a column is absent.
    missing = [c for c in ('GDP per capita', y_col) if c not in merged_data.columns]
    if missing:
        print(f'Skipping plots of {y_col!r}: merged_data has no column(s) {missing}')
        continue
    _plot_against_gdp(y_col, suffix)