import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw data and take a first look at it.
df = pd.read_csv('travel_insurance.csv')
df.head()
df.info()
df.describe(include='all')

# One-hot encode the categorical columns, dropping the first level of each.
# dtype=int is pinned explicitly: pandas >= 2.0 returns bool dummies by
# default, and DataFrame.describe() skips bool columns, which would silently
# remove the encoded columns from the summaries below.
df_dummy = pd.get_dummies(df, drop_first=True, dtype=int)

# Rename columns for readability, keeping the naming convention already used.
# NOTE(review): this assignment is positional, so it relies on the column
# order produced by get_dummies for this CSV -- confirm if the file changes.
df_dummy.columns = [
    'Age', 'AnnualIncome(k$)', 'FamilyMembers',
    'HealthIssues', 'Insured', 'PrivateSector',
    'Graduate', 'FrequentFlyer', 'TravelledAbroad',
]

# Convert AnnualIncome to thousands (k$) to match the new column name.
df_dummy['AnnualIncome(k$)'] = df_dummy['AnnualIncome(k$)'] / 1000

# Summary of the data after the column rename.
df_dummy.describe().round(3)
# Bar chart of the number of rows per insured status (0 -> No, 1 -> Yes).
count_ax = sns.countplot(data=df_dummy, x='Insured')
count_ax.set_title('Distribution of Insured Status')
count_ax.set_ylabel('Count')
count_ax.set_xticks([0, 1])
count_ax.set_xticklabels(['No', 'Yes'])
# Histogram of customer age, saved to disk.
# The grid is disabled via hist(grid=False) rather than plt.grid(b=None):
# the `b` keyword was renamed to `visible` in newer Matplotlib releases, so
# the old spelling fails there.
df_dummy.hist('Age', grid=False)
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Count')
plt.savefig('age_distribution.png')
# Compare the income distribution of 28-year-olds against every other age.
age_28 = df_dummy[df_dummy['Age'] == 28]
# `not_28` was referenced below without ever being defined (NameError).
not_28 = df_dummy[df_dummy['Age'] != 28]
age_28.describe().round(3)

# Start a fresh figure so these histograms are not drawn on top of whatever
# figure happens to be current when the file runs as a plain script.
plt.figure()
plt.hist('AnnualIncome(k$)', data=not_28, label='Other ages', alpha=0.4)
plt.hist('AnnualIncome(k$)', data=age_28, label='28 year olds', alpha=0.4)
plt.legend()
# Split the data into insured / uninsured subsets. The 'Insured' column is
# constant within each subset, so it is dropped straight away.
insured = df_dummy.loc[df_dummy['Insured'].eq(1)].drop(columns='Insured')
uninsured = df_dummy.loc[df_dummy['Insured'].eq(0)].drop(columns='Insured')
# Overlaid age histograms for uninsured vs. insured customers.
# A new figure is opened first; otherwise, when run as a script, the bars are
# added to the current figure and the saved PNG also contains earlier plots.
plt.figure()
# Reuse the pre-split frames (they still carry the 'Age' column) so this plot
# is built the same way as the income-by-status histogram.
plt.hist('Age', data=uninsured, alpha=0.5, label='Uninsured')
plt.hist('Age', data=insured, alpha=0.5, label='Insured')
plt.title('Distribution of Age by Insured Status')
plt.xlabel('Age')
plt.ylabel('Count')
plt.legend()
plt.savefig('age_distribution_insured.png')
# Overlaid income histograms for uninsured vs. insured customers.
# A new figure is opened first so the bars are not stacked onto the previous
# plot when this file is executed as a script.
plt.figure()
plt.hist('AnnualIncome(k$)', data=uninsured, alpha=0.5, label='Uninsured')
plt.hist('AnnualIncome(k$)', data=insured, alpha=0.5, label='Insured')
plt.title('Distribution of Income by Insured Status')
plt.xlabel('Annual Income (Thousands of $)')
plt.ylabel('Count')
plt.legend()
# NOTE(review): the file name says "age" but this is the income plot. It is
# left unchanged in case something else consumes this path -- confirm and
# rename if safe.
plt.savefig('age_distribution_income.png')
# Age vs. annual income scatterplot, coloured by insured status.
# Open a dedicated figure so the points are not drawn onto an earlier plot.
plt.figure()
# Colour by a readable label instead of relabelling the legend afterwards:
# plt.legend(labels=[...]) pairs the labels with whatever artists are on the
# axes, which does not reliably match seaborn's hue levels.
sns.scatterplot(
    x='Age',
    y='AnnualIncome(k$)',
    hue=df_dummy['Insured'].map({0: 'Uninsured', 1: 'Insured'}).rename('Status'),
    hue_order=['Uninsured', 'Insured'],
    data=df_dummy,
    legend=True,
)
plt.title('Age to Income, Colored by Insured Status')
# plt.savefig('age_income_scatterplot.png')
# Income by travelled-abroad flag, faceted into a grid: one column per
# insured status and one row per frequent-flyer flag.
sns.catplot(
    data=df_dummy,
    x='TravelledAbroad',
    y='AnnualIncome(k$)',
    row='FrequentFlyer',
    col='Insured',
    alpha=0.3,
)
# Split customers at an annual income of 1300 (in k$, per the column name).
# The upper group uses >= so rows sitting exactly on the threshold are no
# longer dropped from both subsets.
# NOTE(review): the variable names say "130k" while the threshold is 1300 k$;
# the names are kept so any later references keep working -- confirm intent.
under_130k = df_dummy[df_dummy['AnnualIncome(k$)'] < 1300]
over_130k = df_dummy[df_dummy['AnnualIncome(k$)'] >= 1300]
under_130k.describe().round(3)
over_130k.describe().round(3)
# Correlation heatmap across every column of the encoded data.
# sns.heatmap draws on the current axes, so open a new figure first; without
# it the heatmap lands on the last axes of whichever plot came before.
plt.figure()
sns.heatmap(df_dummy.corr(), annot=True)
plt.title('Correlation Heatmap - All')
plt.savefig('Correlation Heatmap - All.png')
def _income_boxplot(column, tick_labels, title, filename, xlabel=None):
    """Draw and save an income box plot for one binary column, split by insured status.

    column      -- name of the df_dummy column placed on the x axis
    tick_labels -- labels shown at x positions 0 and 1
    title       -- plot title
    filename    -- path the figure is saved to
    xlabel      -- optional x-axis label; seaborn's default is kept when None
    """
    sns.catplot(x=column, y='AnnualIncome(k$)', hue='Insured', kind='box', data=df_dummy)
    plt.ylabel('Annual Income (Thousands of $)')
    if xlabel is not None:
        plt.xlabel(xlabel)
    plt.xticks([0, 1], tick_labels)
    plt.title(title)
    plt.savefig(filename)


# Income vs. each binary attribute, one saved figure per attribute.
_income_boxplot(
    'TravelledAbroad',
    ['No', 'Yes'],
    'Travelled Abroad vs Income by Insured Status',
    'ta_income.png',
)
_income_boxplot(
    'FrequentFlyer',
    ['No', 'Yes'],
    'Frequent Flyers vs Income by Insured Status',
    'ff_income.png',
)
_income_boxplot(
    'PrivateSector',
    ['Government', 'Private'],
    'Private Sector vs Income by Insured Status',
    'privatesec_income.png',
    xlabel='Sector',
)
# Correlation heatmaps for the insured and uninsured subsets.
# Each heatmap gets its own figure: sns.heatmap draws on the current axes, so
# without this the second heatmap (and its colorbar) would be painted over
# the first, and the first over whatever plot preceded it.
plt.figure()
sns.heatmap(insured.corr(), annot=True)
plt.title('Correlation Heatmap - Insured')
plt.savefig('insured_heatmap.png')

plt.figure()
sns.heatmap(uninsured.corr(), annot=True)
plt.title('Correlation Heatmap - Uninsured')
plt.savefig('uninsured_heatmap.png')

# Summary statistics for each subset.
insured.describe().round(3)
uninsured.describe().round(3)