import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
# Load the Titanic passenger dataset through seaborn's dataset loader.
df = sns.load_dataset(name="titanic")
# Peek at the first five rows ...
df.head(n=5)
# ... and at the last five rows.
df.tail(n=5)
# Summary statistics for the numeric columns.
df.describe()
# Data type of every column.
df.dtypes
# Number of missing values in each column.
df.isnull().sum()
# Cross-check 'pclass' against 'class': for each numeric class code, count the
# rows whose textual class label does not match it.
first = np.count_nonzero(
    (df['pclass'].to_numpy() == 1) & (df['class'].to_numpy() != 'First')
)
second = np.count_nonzero(
    (df['pclass'].to_numpy() == 2) & (df['class'].to_numpy() != 'Second')
)
third = np.count_nonzero(
    (df['pclass'].to_numpy() == 3) & (df['class'].to_numpy() != 'Third')
)
print("first: %d, second: %d, third: %d" % (first, second, third))
# When all three mismatch counts are zero, 'pclass' and 'class' carry the same
# information, so one of them ('pclass') is dropped.
df = df.drop(columns='pclass')
# Select the rows where BOTH 'embarked' and 'embark_town' are missing. If the
# gaps did not coincide, each column could be filled from the other one.
df.loc[df['embarked'].isnull() & df['embark_town'].isnull(), 'embarked']
# Cross-check the port code against the town name: for each code, count the
# rows whose town does not match it.
S = np.count_nonzero(
    (df['embarked'].to_numpy() == 'S')
    & (df['embark_town'].to_numpy() != 'Southampton')
)
C = np.count_nonzero(
    (df['embarked'].to_numpy() == 'C')
    & (df['embark_town'].to_numpy() != 'Cherbourg')
)
Q = np.count_nonzero(
    (df['embarked'].to_numpy() == 'Q')
    & (df['embark_town'].to_numpy() != 'Queenstown')
)
print("S: %d, C: %d, Q: %d" % (S, C, Q))
# When all three mismatch counts are zero, 'embarked' and 'embark_town' carry
# the same information, so one of them ('embarked') is dropped.
df = df.drop(columns='embarked')
# Cross-check 'survived' against 'alive': select any rows marked as survived
# (1) whose 'alive' label nevertheless says 'no'.
df.loc[(df['survived'] == 1) & (df['alive'] == 'no'), 'survived']
# Drop 'alive', since 'survived' provides the same information.
df = df.drop(columns='alive')
# Look for passengers labelled 'child' whose age is missing, so that such gaps
# could be filled with a child-appropriate value instead of a global one.
df.loc[df['age'].isna() & (df['who'] == 'child'), 'age']
# Plot the distribution of the known ages (the original analysis reported no
# children with a missing age at this point).
sns.displot(df['age'])
# Working assumption from the plot: ages are roughly normally distributed, so
# the mean of the known ages is used to fill the missing ones.
df['age'] = df['age'].fillna(value=df['age'].mean())
# Distinct values present in the 'deck' column.
df['deck'].unique()
# Highest fare paid by any passenger.
df['fare'].max()
# Details of every passenger who survived.
survivors_details = df[df['survived'] == 1]
survivors_details
# Total number of passengers per class.
df['class'].value_counts()
# NOTE: seaborn loads 'class' as a categorical column. Grouping on a
# categorical without an explicit `observed` argument raises a FutureWarning on
# recent pandas, and the default changes in pandas 3. `observed=False` is passed
# explicitly to keep the existing result (one row per category) on every version.
# Highest fare within each class.
df.groupby('class', observed=False)['fare'].max()
# Average passenger age within each class.
df.groupby('class', observed=False)['age'].mean()
# Sort passengers by fare, breaking ties by age (both ascending).
df = df.sort_values(by=['fare', 'age'])
df