Structured data analysis 6: Titanic

# Run this cell to start. import numpy as np import pandas as pd # Safe settings for Pandas. pd.set_option('mode.chained_assignment', 'raise') %matplotlib inline import matplotlib.pyplot as plt plt.style.use('fivethirtyeight') # Load the OKpy test library and tests. from client.api.notebook import Notebook ok = Notebook('titanic.ok')

# Read the Titanic table from the CSV file that ships with the notebook.
titanic = pd.read_csv('titanic_stlearn.csv')
# Preview the first few rows.
titanic.head()

# Test you are on the right track.
_ = ok.grade('q_01_titanic')

# Counts for every gender / survived combination: one row per gender,
# one column per value of 'survived'.
gender_by_survived = pd.crosstab(index=titanic['gender'],
                                 columns=titanic['survived'])
# Show the table in the notebook
gender_by_survived

# Check you are on the right track.
_ = ok.grade('q_02_gender_by_survived')

# The same cross-tabulation, with each row divided by its row total
# (normalize='index'), so the values are proportions within each gender.
gender_by_survived_p = pd.crosstab(index=titanic['gender'],
                                   columns=titanic['survived'],
                                   normalize='index')
# Show the table in the notebook
gender_by_survived_p

# Check you are on the right track.
_ = ok.grade('q_03_gender_by_survived_p')

# Run this cell.
# Start from a copy of the gender labels so the edits below do not touch
# the 'gender' column of the original table.
mwc = titanic['gender'].copy()
mwc.head()

#- Your code here.
# Overwrite the label with "child" wherever age is below 15.  Any missing
# ages compare False here, so those rows keep their gender label.
is_child = titanic['age'] < 15
mwc.loc[is_child] = "child"
# Show the unique values and counts for the "mwc" Series.
mwc.value_counts()

_ = ok.grade('q_04_mwc')

# Proportion surviving within each men / women / children group.
mwc_by_survived_p = pd.crosstab(index=mwc,
                                columns=titanic['survived'],
                                normalize='index')
mwc_by_survived_p

_ = ok.grade('q_05_mwc_p')

# Run this cell.
titanic['class'].value_counts()

# Run this cell to create example row classification function
def classify_mf_child(row):
    """Label one table row as 'adult', 'female child' or 'male child'.

    Parameters
    ----------
    row : pandas Series
        A single row of the table; must have 'age' and 'gender' labels.

    Returns
    -------
    str
        'adult' when age is 15 or more; otherwise 'female child' when the
        gender is exactly 'female', and 'male child' for anything else.
    """
    if row.loc['age'] >= 15:
        return 'adult'
    # Not an adult: split on gender.
    return 'female child' if row.loc['gender'] == 'female' else 'male child'

# Try the example classifier on the first two rows of the table.
classify_mf_child(titanic.iloc[0])

classify_mf_child(titanic.iloc[1])

# Call the classifier once per row (axis='columns' passes each row as a
# Series) and collect the returned labels into a new Series.
mf_child = titanic.apply(classify_mf_child, axis='columns')
mf_child.head()

# `in` on strings tests for a substring.
a = 'Bah humbug'
'humbug' in a

# pd.isna reports whether a value is missing (NaN).
pd.isna(np.nan)

# Substrings of the 'class' value that identify a crew department, paired
# with the role to report.  These are checked first, in this order.
_CREW_CLASS_ROLES = (
    ("victualling", "catering"),
    ("restaurant", "catering"),
    ("engineering", "engineering"),
    ("deck", "deck"),
)

# Name fragments identifying members of the guarantee group.
_GUARANTEE_NAMES = (
    "Andrews, Mr. Thomas",
    "Campbell, Mr. William Henry",
    "Chisholm, Mr.",
    "Cunningham, Mr.",
    "Frost, Mr.",
    "Knight, Mr.",
    "Parkes, Mr.",
    "Parr, Mr.",
    "Watson, Mr.",
)

# Name fragments identifying the musicians.
# The original chain spelled the sixth entry "Kins, Mr.", which can never
# match the bandsman's surname (Krins); corrected here.
_MUSICIAN_NAMES = (
    "Brailey, Mr.",
    "Bricoux, Mr.",
    "Clarke, Mr.",
    "Hartley, Mr.",
    "Hume, Mr.",
    "Krins, Mr.",
    "Taylor, Mr. Percy",
    "Woodward, Mr.",
)


def classify_role(row):
    """Classify one row of the Titanic table into a role.

    Rules are applied in order and the first match wins:

    1. Crew department, from a keyword in the 'class' value
       ('catering', 'engineering' or 'deck').
    2. 'guarantee' when the 'name' value contains one of the guarantee
       group name fragments.
    3. 'musician' when the 'name' value contains one of the musician name
       fragments.
    4. Passenger class: '3rd' if 'class' contains "3", '2nd' if it
       contains "2", otherwise '1st'.

    Parameters
    ----------
    row : pandas Series
        A single row of the table; must have string values under the
        'class' and 'name' labels.

    Returns
    -------
    str
        One of 'catering', 'engineering', 'deck', 'guarantee', 'musician',
        '3rd', '2nd' or '1st'.
    """
    ticket_class = row.loc['class']
    for keyword, role in _CREW_CLASS_ROLES:
        if keyword in ticket_class:
            return role

    # NOTE(review): matching is by substring, so surname-only fragments such
    # as "Clarke, Mr." also match any other non-crew row with that surname
    # and title -- verify against the data.
    name = row.loc["name"]
    if any(fragment in name for fragment in _GUARANTEE_NAMES):
        return "guarantee"
    if any(fragment in name for fragment in _MUSICIAN_NAMES):
        return "musician"

    # Remaining rows are ordinary passengers; "3" is tested before "2" to
    # keep the original precedence.
    if "3" in ticket_class:
        return "3rd"
    if "2" in ticket_class:
        return "2nd"
    return "1st"

# Spot-check classify_role on rows picked by position.
print(classify_role(titanic.iloc[0]))  # Should show '3rd'
print(classify_role(titanic.iloc[6]))  # Should show '2nd'
print(classify_role(titanic.iloc[-1]))  # Should show 'catering'
print(classify_role(titanic.iloc[-3]))  # Should show 'engineering'
print(classify_role(titanic.iloc[-4]))  # Should show 'catering'
print(classify_role(titanic.iloc[-5]))  # Should show 'deck'

# Spot-check rows picked by name: first row whose name starts with 'Brailey'.
is_brailey = titanic['name'].str.startswith('Brailey')
print(classify_role(titanic.loc[is_brailey].iloc[0]))  # Should show 'musician'

# First row whose name is exactly 'Andrews, Mr. Thomas'.
is_andrews = titanic['name'] == 'Andrews, Mr. Thomas'
print(classify_role(titanic.loc[is_andrews].iloc[0]))  # Should show 'guarantee'

# This test runs the tests above, and some extra besides.
_ = ok.grade('q_06_classify_role')

# Role label for every row of the table (despite the name, this Series
# holds roles only; survival is brought in by the crosstab below).
role_by_survived = titanic.apply(classify_role, axis='columns')
# Boolean mask selecting the male rows.
male_p = titanic['gender'] == "male"
survived = titanic['survived']
# Proportion surviving within each role.  Only the male rows of `survived`
# are passed in.
# NOTE(review): this relies on crosstab aligning its Series inputs on their
# shared index labels, which restricts the table to males -- confirm.
role_by_survived_p = pd.crosstab(index=role_by_survived,
                                 columns=survived[male_p],
                                 normalize='index')
role_by_survived_p

# For your convenience, you can run this cell to run all the tests at once!
import os
# Each file in "tests" whose name starts with 'q' is graded under its name
# minus the last three characters (presumably the ".py" extension).
_ = [ok.grade(q[:-3]) for q in os.listdir("tests") if q.startswith('q')]