# Loading in required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import PercentFormatter
# Read the Nobel Prize dataset into a DataFrame; the path is relative to the
# working directory.
nobel = pd.read_csv("datasets/nobel.csv")
# Show the first six rows. display() is used (as elsewhere in this file) so the
# preview is rendered even when this is not the last statement of a cell.
display(nobel.head(6))
# Display the number of (possibly shared) Nobel Prizes handed
# out between 1901 and 2016: one row of `nobel` per awarded laureate.
num_prizes = len(nobel)
display(num_prizes)
# Display the number of prizes won by male and female recipients.
# value_counts() skips rows whose 'sex' is missing.
sex_counts = nobel['sex'].value_counts()
display(sex_counts)
# Display the ten most frequent birth countries among the laureates.
top_countries = nobel['birth_country'].value_counts().head(10)
# Explicit display(), matching the two results above; a bare expression is
# only rendered when it is the final statement of a notebook cell.
display(top_countries)
# Calculating the proportion of USA born winners per decade.
# Rows with a missing birth_country compare unequal and therefore count as
# not USA born.
nobel['usa_born_winner'] = nobel['birth_country'] == 'United States of America'
# Integer floor division maps e.g. 1987 -> 1980 without a float round-trip.
nobel['decade'] = (nobel['year'] // 10) * 10
# The mean of a boolean column is the fraction of True values in each decade.
prop_usa_winners = nobel.groupby('decade', as_index=False)['usa_born_winner'].mean()
# Display the proportions of USA born winners per decade
display(prop_usa_winners[['decade', 'usa_born_winner']])
# Setting the plotting theme
sns.set()
# and setting the size of all plots (width, height in inches).
# matplotlib.pyplot and PercentFormatter are imported at the top of the file.
plt.rcParams['figure.figsize'] = [11, 7]
# Plotting USA born winners on a figure of its own.
plt.figure()
ax = sns.lineplot(data=prop_usa_winners, x='decade', y='usa_born_winner')
# Adding %-formatting to the y-axis; xmax=1.0 means a value of 1.0 is shown as 100%.
ax.yaxis.set_major_formatter(PercentFormatter(1.0))
# Calculating the proportion of female laureates per decade and prize category.
# Rows with a missing 'sex' compare unequal and count as not female.
nobel['female_winner'] = nobel['sex'] == 'Female'
prop_female_winners = nobel.groupby(['decade', 'category'], as_index=False)['female_winner'].mean()
# Plotting female winners, one line per category, with % winners on the y-axis.
# Open a new figure first: sns.lineplot draws on the current axes, so without
# this the lines would be added to whatever plot was drawn previously.
plt.figure()
ax = sns.lineplot(data=prop_female_winners, x='decade', y='female_winner', hue='category')
ax.yaxis.set_major_formatter(PercentFormatter(1.0))
# Picking out the first woman to win a Nobel Prize: the female laureate row
# with the smallest 'year'.
nobel_female = nobel[nobel['sex'] == 'Female']
display(nobel_female.nsmallest(1, columns='year'))
# Selecting the laureates that appear two or more times, i.e. every row whose
# 'full_name' occurs at least twice in the dataset.
display(nobel.groupby('full_name').filter(lambda group: len(group) >= 2))
# Parse the birth_date strings into datetimes; values that cannot be parsed
# become NaT because of errors='coerce'.
nobel['birth_date'] = pd.to_datetime(arg=nobel['birth_date'], errors='coerce')
# Age at the time of the award: prize year minus birth year (missing where the
# birth date is missing).
nobel['age'] = nobel['year'].sub(nobel['birth_date'].dt.year)
# Plotting the age of Nobel Prize winners against the award year, with a
# LOWESS smoother drawn in black.
sns.lmplot(data=nobel, x='year', y='age', lowess=True, aspect=2, line_kws={'color': 'black'})
# Same plot as above, but separate plots for each type of Nobel Prize.
# The smoother and line styling are repeated so the per-category panels are
# directly comparable with the overall plot.
sns.lmplot(data=nobel, x='year', y='age', row='category', lowess=True, aspect=2,
           line_kws={'color': 'black'})
# The oldest winner of a Nobel Prize as of 2016
display(nobel.nlargest(1, 'age'))
# The youngest winner of a Nobel Prize as of 2016 (computed once and reused below)
youngest_row = nobel.nsmallest(1, 'age')
display(youngest_row)
# The first name of the youngest winner: the first whitespace-separated token
# of 'full_name'.
youngest_winner = youngest_row['full_name'].iloc[0].split()[0]
display(youngest_winner)