# Working with Pandas - Titanic Dataset

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the Titanic passenger dataset through seaborn's dataset loader.
df = sns.load_dataset('titanic')

# Preview the first 5 rows.
df.head()

# Preview the last 5 rows.
df.tail()

# Summary statistics (by default, of the numeric columns).
df.describe()

# Data type of each column.
df.dtypes

# Number of missing values in each column.
df.isna().sum()

# Check that `pclass` and `class` agree with each other: for every numeric
# class, count the rows whose `class` label is not the expected one.
# All three counts being 0 means the two columns are redundant.
first = int(((df['pclass'] == 1) & (df['class'] != 'First')).sum())
second = int(((df['pclass'] == 2) & (df['class'] != 'Second')).sum())
third = int(((df['pclass'] == 3) & (df['class'] != 'Third')).sum())
print(f"first: {first}, second: {second}, third: {third}")

# Since `pclass` and `class` give the same information, drop one of them.
df = df.drop(columns='pclass')

# Show the rows where `embarked` and `embark_town` are both missing. If the
# missing values did not belong to the same passengers, each column could
# be filled in from the other column's value.
# `.loc` with a single boolean mask avoids chained indexing.
df.loc[df['embarked'].isna() & df['embark_town'].isna(), 'embarked']

# Check that `embarked` and `embark_town` agree with each other: for every
# port code, count the rows whose town name is not the expected one
# (a missing town counts as a mismatch, since NaN != any string).
S = int(((df['embarked'] == 'S') & (df['embark_town'] != 'Southampton')).sum())
C = int(((df['embarked'] == 'C') & (df['embark_town'] != 'Cherbourg')).sum())
Q = int(((df['embarked'] == 'Q') & (df['embark_town'] != 'Queenstown')).sum())
print(f"S: {S}, C: {C}, Q: {Q}")

# Since `embarked` and `embark_town` give the same information, drop one
# of them.
df = df.drop(columns='embarked')

# Cross-check the `survived` column against the `alive` column, in both
# directions. A row is inconsistent when `survived == 1` coincides with
# `alive == 'no'`, or when neither of the two holds; an empty result means
# the columns agree.
# NOTE(review): assumes `survived` only holds 0/1 - confirm.
survived_mismatch = (df['survived'] == 1) == (df['alive'] == 'no')
df.loc[survived_mismatch, ['survived', 'alive']]

# Drop the `alive` column, as `survived` provides the same information.
df = df.drop(columns='alive')

# Check whether any child has a missing age, so that such values could be
# filled in appropriately. `.loc` with a single boolean mask avoids chained
# indexing.
df.loc[df['age'].isna() & (df['who'] == 'child'), 'age']

# No missing values for the age of a child; now look at the distribution
# of `age`.
sns.displot(df['age'])

# The age distribution is approximately normal, so it is appropriate to
# fill the missing values with the mean.
df['age'] = df['age'].fillna(df['age'].mean())

# Distinct values found in the `deck` column.
df['deck'].unique()

# Highest fare.
df['fare'].max()

# Details of all survivors.
survivors_details = df[df['survived'] == 1]
survivors_details

# Total passengers per class.
df['class'].value_counts()

# Highest fare in each class.
# `observed=False` spells out the long-standing default; it only matters
# for categorical groupers (`class` is presumably categorical as loaded by
# seaborn - confirm) and avoids the pandas FutureWarning about the default
# changing.
df.groupby('class', observed=False)['fare'].max()

# Average age of passengers in each class.
df.groupby('class', observed=False)['age'].mean()

# Sort passengers by fare, then by age among passengers with equal fares.
df = df.sort_values(by=['fare', 'age'])
df