import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw travel-insurance data and take a first look at it.
df = pd.read_csv('travel_insurance.csv')
df.head()
df.info()

# One-hot encode the non-numeric columns. drop_first=True drops the first
# level of every encoded category, leaving k-1 indicator columns for k levels.
df_dummy = pd.get_dummies(data=df, drop_first=True)

# Replace the generated column labels with shorter, readable ones.
# NOTE(review): the labels are assigned purely by position, so this assumes
# the encoded frame has exactly these nine columns in this order -- confirm
# against the CSV before trusting any per-column result below.
df_dummy.columns = [
    'Age',
    'AnnualIncome(k$)',
    'FamilyMembers',
    'HealthIssues',
    'Insured',
    'PrivateSector',
    'Graduate',
    'FrequentFlyer',
    'TraveledAbroad',
]

# Express annual income in thousands, matching the '(k$)' column suffix.
df_dummy['AnnualIncome(k$)'] = df_dummy['AnnualIncome(k$)'].div(1000)

# Summary statistics of the renamed, rescaled frame.
df_dummy.describe().round(3)
# Share of insured customers among the rows where each binary flag equals 1.
cols = ['HealthIssues', 'PrivateSector', 'Graduate', 'FrequentFlyer', 'TraveledAbroad']
for feature in cols:
    insured_in_group = df_dummy.loc[df_dummy[feature] == 1, 'Insured']
    # .get(1, 0.0) reports 0% instead of raising KeyError when a group
    # contains no insured rows (or no rows at all).
    insured_share = insured_in_group.value_counts(normalize=True).get(1, 0.0)
    # One decimal of a percent is the same precision as the previous
    # 3-decimal fraction, but without float noise such as 33.300000000000004.
    print('{} customer rate: {:.1f}%'.format(feature, insured_share * 100))
# Annotated heatmap of the pairwise correlations between the columns of
# df_dummy.
corr_matrix = df_dummy.corr()
sns.heatmap(data=corr_matrix, annot=True)
plt.title('Correlation Heatmap - All')
plt.tight_layout()
# Split the encoded data by insurance status. 'Insured' is dropped from each
# subset because it is constant there.
cust = df_dummy[df_dummy['Insured'] == 1].drop(columns='Insured')
non_cust = df_dummy[df_dummy['Insured'] == 0].drop(columns='Insured')

# Side-by-side age counts for non-customers and customers.
# Both panels get the same explicit age order: without it each panel derives
# its categories from its own subset, so an age missing from one group would
# shift the bars and make the two x axes incomparable.
age_order = sorted(df_dummy['Age'].dropna().unique())
fig, ax = plt.subplots(1, 2, figsize=(11, 4), sharey=True)
fig.suptitle('Ages of Non-customers vs. Customers')
sns.countplot(x='Age', data=non_cust, order=age_order, ax=ax[0])
ax[0].set(xlabel='Age - Non-Customer')
sns.countplot(x='Age', data=cust, order=age_order, ax=ax[1])
ax[1].set(xlabel='Age - Customer')
plt.tight_layout()
# Insurance-status split, in percent, among travelers aged 28.
age_28_rows = df_dummy[df_dummy['Age'] == 28]
twenty_8 = age_28_rows['Insured'].value_counts(normalize=True).mul(100).round(2)
# Label 0 of 'Insured' is the non-customer share.
print('{}% of 28 year old travelers are not customers'.format(twenty_8[0]))

# Profile of the 28-year-old non-customers; 'Age' is removed because it is
# constant within this subset.
non_cust_28 = non_cust.loc[non_cust['Age'] == 28].drop(columns='Age')
non_cust_28.describe().round(3)
# Overlaid annual-income histograms for customers and non-customers.
# Start a fresh figure: plt.hist draws on the current axes, which would be
# the previous plot when this file runs as a plain script.
plt.figure()
# Compute one set of bin edges from the full data so both histograms share
# identical bins; separate automatic bins per group would make the overlaid
# bars incomparable. 10 bins matches matplotlib's default bin count.
income_bins = np.histogram_bin_edges(df_dummy['AnnualIncome(k$)'].dropna(), bins=10)
plt.hist('AnnualIncome(k$)', data=cust, bins=income_bins, label='Customer', alpha=0.4)
plt.hist('AnnualIncome(k$)', data=non_cust, bins=income_bins, label='Non-customer', alpha=0.4)
plt.title('Annual Income by Customer')
plt.xlabel('Annual Income (Thousands of $)')
plt.ylabel('Count')
plt.legend()
plt.tight_layout()
# Compare insurance uptake above and at-or-below an annual income cutoff.
# The income column is in thousands, so 1300 corresponds to 1.3 million.
# NOTE: the `*_130k` variable names are kept for compatibility with any later
# code, but they are misnomers -- the cutoff is 1.3 million, not 130k.
income_cutoff_k = 1300

over_130k = df_dummy[df_dummy['AnnualIncome(k$)'] > income_cutoff_k]
cust_over = (over_130k.Insured.value_counts(normalize=True) * 100).round(2)

# `<=` (not `<`) so travelers earning exactly the cutoff are counted here
# instead of silently falling out of both groups.
under_130k = df_dummy[df_dummy['AnnualIncome(k$)'] <= income_cutoff_k]
cust_under = (under_130k.Insured.value_counts(normalize=True) * 100).round(2)

print('{}% of travelers that earn over $1.3million annually are customers'.format(cust_over[1]))
print('{}% of travelers that earn $1.3million or less annually are not customers'.format(cust_under[0]))
# Age vs. annual income, colored by insurance status.
# Open a new figure first: sns.scatterplot draws on the current axes, which
# would be the previous plot when this file runs as a plain script.
plt.figure()
ax = sns.scatterplot(x='Age', y='AnnualIncome(k$)', hue='Insured', data=df_dummy, alpha=0.4)
plt.title('Age vs Income by Insured Status')
plt.ylabel('Annual Income (Thousands of $)')
# Optional: place the legend outside the axes. Left disabled; enable only if
# the installed seaborn provides move_legend.
# sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.tight_layout()
# Income spread for each binary feature, split by insurance status.
# Each entry is (x column, figure title, point transparency); one categorical
# plot is drawn per entry, in this order.
catplot_specs = [
    ('TraveledAbroad', 'Income distribution by Travel and Insurance', 0.4),
    ('FrequentFlyer', 'Income distribution by Frequent flyers and Insurance', 0.4),
    ('PrivateSector', 'Income distribution by Private Sector and Insurance', 0.3),
]
for feature, figure_title, point_alpha in catplot_specs:
    sns.catplot(x=feature, y='AnnualIncome(k$)', data=df_dummy, hue='Insured', alpha=point_alpha)
    # Relabel the two category positions (0 and 1) as No / Yes.
    plt.xticks([0, 1], ['No', 'Yes'])
    plt.suptitle(figure_title)
    plt.tight_layout()
# Number of customers and non-customers per employment sector.
# Open a new figure first: sns.countplot draws on the current axes, which
# would be the preceding plot when this file runs as a plain script.
plt.figure()
sns.countplot(x='PrivateSector', data=df_dummy, hue='Insured', alpha=0.4)
# Relabel the two category positions (0 and 1) as No / Yes.
plt.xticks([0, 1], ['No', 'Yes'])
# The previous title was copied from the income plot; this chart shows counts.
plt.title('Customer counts by Private Sector and Insurance')
plt.tight_layout()
# Non-customer share per employment sector.
# `1 - value_counts(normalize=True)` turns each label's share into its
# complement, so the entry at label 1 (insured) holds the NOT-insured share.
# This relies on 'Insured' taking only the values 0 and 1.
# NOTE(review): rows with PrivateSector == 0 are reported as "Government
# sector" -- confirm that this is the only other employment type in the data.
gov_rows = df_dummy[df_dummy['PrivateSector'] == 0]
gov_sec = 1 - gov_rows['Insured'].value_counts(normalize=True).round(3)
# {:.1f} keeps the intended one-decimal percentage and hides float noise
# such as 75.40000000000001.
print('{:.1f}% of Government sector are not customers'.format(gov_sec[1] * 100))

priv_rows = df_dummy[df_dummy['PrivateSector'] == 1]
priv_sec = 1 - priv_rows['Insured'].value_counts(normalize=True).round(3)
print('{:.1f}% of Private sector are not customers'.format(priv_sec[1] * 100))

# Summary statistics for customers and for non-customers.
cust.describe().round(3)
non_cust.describe().round(3)