pip install seaborn
Requirement already satisfied: seaborn in /opt/venv/lib/python3.7/site-packages (0.11.0)
Requirement already satisfied: pandas>=0.23 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.0.5)
Requirement already satisfied: scipy>=1.0 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.5.2)
Requirement already satisfied: numpy>=1.15 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.19.2)
Requirement already satisfied: matplotlib>=2.2 in /opt/venv/lib/python3.7/site-packages (from seaborn) (3.3.2)
Requirement already satisfied: pytz>=2017.2 in /opt/venv/lib/python3.7/site-packages (from pandas>=0.23->seaborn) (2020.1)
Requirement already satisfied: python-dateutil>=2.6.1 in /opt/venv/lib/python3.7/site-packages (from pandas>=0.23->seaborn) (2.8.1)
Requirement already satisfied: cycler>=0.10 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (1.2.0)
Requirement already satisfied: certifi>=2020.06.20 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (2020.6.20)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)
Requirement already satisfied: pillow>=6.2.0 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (7.2.0)
Requirement already satisfied: six>=1.5 in /opt/venv/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas>=0.23->seaborn) (1.15.0)
WARNING: You are using pip version 20.2.3; however, version 20.2.4 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
pip install plotly
Requirement already satisfied: plotly in /opt/venv/lib/python3.7/site-packages (4.12.0)
Requirement already satisfied: six in /opt/venv/lib/python3.7/site-packages (from plotly) (1.15.0)
Requirement already satisfied: retrying>=1.3.3 in /opt/venv/lib/python3.7/site-packages (from plotly) (1.3.3)
WARNING: You are using pip version 20.2.3; however, version 20.2.4 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the insurance dataset from a CSV file in the current working directory.
df = pd.read_csv("insurance.csv")
# Quick structural checks. As bare expressions these only render a result when
# they are the last expression of an interactive/notebook cell.
df.shape  # (rows, columns)
df.head()  # first rows of the frame
df.tail()  # last rows of the frame
df.columns  # column labels
df.info()  # prints dtypes, non-null counts and memory usage
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 age 1338 non-null int64
1 sex 1338 non-null object
2 bmi 1338 non-null float64
3 children 1338 non-null int64
4 smoker 1338 non-null object
5 region 1338 non-null object
6 charges 1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB
# Summary statistics of the frame.
df.describe()
# Missing-value count per column.
df.isna().sum()
# From here on the "sex" column is referred to as "gender".
df = df.rename(columns={"sex": "gender"})
df.head()
# Distinct region labels present in the data.
df.loc[:, "region"].unique()
# Average insurance charge per region; reset_index turns "region" back into
# an ordinary column so it can be used as the x axis below.
region_df = (
    df.loc[:, ["region", "charges"]]
    .groupby("region")
    .agg(Mean_charges=("charges", "mean"))
    .reset_index()
)
region_df.head()
# Distribution of the individuals across the regions of the US.
sns.displot(data=df, x="region")
"""
Visualization of the aggregation of the premium charges with respect to the region
As we can see individuals in the SouthEast have a higher charge.
"""
# Bars are ordered from the lowest to the highest mean charge.
region_df.sort_values("Mean_charges").plot.bar(
    x="region", y="Mean_charges", figsize=(12, 8)
)
"""
We aggregate average charges for each value in the smoker column to see
if smokers or non smokers have higher charges.
"""
# Average insurance charge per smoker status ("smoker" restored as a column).
smoker_df = (
    df.loc[:, ["smoker", "charges"]]
    .groupby("smoker")
    .agg(Mean_charges=("charges", "mean"))
    .reset_index()
)
smoker_df.head()
# Distribution of the smoker status across all individuals.
sns.displot(data=df, x="smoker")
"""
Visualization of the aggregation of the premium charges with respect to smokers
As we can see those who smoke tend to have a higher charge.
"""
# Bars are ordered from the lowest to the highest mean charge.
smoker_df.sort_values("Mean_charges").plot.bar(
    x="smoker", y="Mean_charges", figsize=(12, 8)
)
"""
We aggregate average charges for each value in the column showing
the number of children for each user.
"""
# Average insurance charge per number of children ("children" restored as a column).
children_df = (
    df.loc[:, ["children", "charges"]]
    .groupby("children")
    .agg(Mean_charges=("charges", "mean"))
    .reset_index()
)
children_df.head()
# Distribution of the number of children across all individuals.
sns.displot(data=df, x="children")
"""
Visualization of the aggregation of the premium charges with respect to
how many children each individual has. As we can see those who have
about 5 children seem to be paying the least.
"""
# Bars are ordered from the lowest to the highest mean charge.
children_df.sort_values("Mean_charges").plot.bar(
    x="children", y="Mean_charges", figsize=(12, 8)
)
"""
We aggregate average charges for each value in the column showing
the sex or gender for each of the users.
"""
# Average insurance charge per gender ("gender" restored as a column).
gender_df = (
    df.loc[:, ["gender", "charges"]]
    .groupby("gender")
    .agg(Mean_charges=("charges", "mean"))
    .reset_index()
)
gender_df.head()
# Distribution of gender across all individuals.
sns.displot(data=df, x="gender")
"""
Visualization of the aggregation of the premium charges
with respect to the sex or gender of each individual. As we can
see, males seem to have a higher premium charge..
"""
# Bars are ordered from the lowest to the highest mean charge.
gender_df.sort_values("Mean_charges").plot.bar(
    x="gender", y="Mean_charges", figsize=(12, 8)
)
# Distinct age values that occur in the dataset.
df.loc[:, "age"].unique()
"""
Below is the distribution of the ages of individuals in the dataset,
from which we can see which ages occur more often and which occur
less often.
"""
# Distribution plots of age and of BMI across all individuals.
sns.displot(data=df, x="age")
sns.displot(data=df, x="bmi")
"""
We use aggregates for average charges and BMI. We will be calculating
the mean BMI and mean charges of individuals of each age.
"""
# Per-age averages of BMI and of insurance charges; reset_index turns "age"
# back into an ordinary column.
df2 = (
    df.loc[:, ["age", "bmi", "charges"]]
    .groupby("age")
    .agg(
        Mean_BMI=("bmi", "mean"),
        Mean_Insurance_Charges=("charges", "mean"),
    )
    .reset_index()
)
df2.head()
"""
We rounded each of the mean values to two decimal places to
keep the data in a simpler format and easier to understand.
"""
df2 = df2.round(decimals=2)
df2.head()
"""
Visualization of the aggregation of the premium charges
with respect to the age of the individuals.
"""
df2.sort_values("age").plot.bar(
    x="age", y="Mean_Insurance_Charges", figsize=(12, 8)
)
"""
Visualization of the aggregation of the BMI scores
with respect to the age of the individuals.
"""
df2.sort_values("age").plot.bar(x="age", y="Mean_BMI", figsize=(12, 8))
# NOTE: this changes the pandas plotting backend globally, so every later
# DataFrame.plot call in the session is routed to plotly as well.
pd.set_option("plotting.backend", "plotly")
# Mean BMI against mean charge, one point per age; the age is shown on hover.
df2.plot.scatter(x="Mean_BMI", y="Mean_Insurance_Charges", hover_data=["age"])
"""
We take all individuals who are smokers (smoker = yes) and evaluate their insurance charges.
We use an aggregate to find the average charges: we calculate
the mean charge of all smokers at each age.
"""
# Mean charge per age, computed over smokers only.
df3 = (
    df[df["smoker"].eq("yes")]
    .loc[:, ["age", "charges"]]
    .groupby("age")
    .agg(Mean_Insurance_Charges=("charges", "mean"))
    .reset_index()
)
df3.head()
# Keep ages 20 through 40 (both ends included) and give the averaged column a
# smoker-specific name.
df3 = df3[(df3["age"] >= 20) & (df3["age"] <= 40)]
df3 = df3.rename(columns={"Mean_Insurance_Charges": "smok_avg_charge"})
df3.head()
"""
We take all individuals who are non-smokers (smoker = no) and evaluate their insurance charges.
We use an aggregate to find the average charges: we calculate
the mean charge of all non-smokers at each age.
"""
# Mean charge per age, computed over non-smokers only.
df4 = (
    df[df["smoker"].eq("no")]
    .loc[:, ["age", "charges"]]
    .groupby("age")
    .agg(Mean_Insurance_Charges=("charges", "mean"))
    .reset_index()
)
df4.head()
# Keep ages 20 through 40 (both ends included) and give the averaged column a
# non-smoker-specific name.
df4 = df4[(df4["age"] >= 20) & (df4["age"] <= 40)]
df4 = df4.rename(columns={"Mean_Insurance_Charges": "nosmok_avg_charge"})
df4.head()
# Join the smoker and non-smoker averages on age (default inner join, so only
# ages present in both frames remain), then round to two decimals.
merge_df = pd.merge(df3, df4, on="age").round(2)
merge_df.head()
"""
Data visualization: a bar graph comparing the average charges, by age, of individuals who are smokers
with those of individuals who are non-smokers. Individuals are between ages 20-40.
"""
# Grouped bar chart of average charges by age (20-40) for smokers vs non-smokers.
# Each age gets two bars: the smokers' bar is shifted half a bar-width to the
# left and the non-smokers' bar half a bar-width to the right.
fig, ax = plt.subplots(figsize=(24, 12))
bar_width = 0.40
ax.bar(
    merge_df["age"] - bar_width / 2,
    merge_df["smok_avg_charge"],
    bar_width,
    color="blue",
    label="Smokers",
)
ax.bar(
    merge_df["age"] + bar_width / 2,
    merge_df["nosmok_avg_charge"],
    bar_width,
    color="orange",
    label="Non-smokers",
)
ax.set_title("Smoking status and charges for young adults", fontsize=20)
ax.set_xlabel("Age", fontsize=20)
ax.set_ylabel("Charges", fontsize=20)
# Without a legend the reader cannot tell which colour is which group.
ax.legend(fontsize=20)
# plt.rc updates matplotlib's global rcParams, not only this figure.
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
fig.show()
"""
We take all of those who are smokers (smoker = yes) for all age groups and then evaluate their
average insurance charge using aggregates.
"""
# Mean charge per age over smokers only, across every age in the data.
age_smok_df = (
    df[df["smoker"].eq("yes")]
    .loc[:, ["age", "charges"]]
    .groupby("age")
    .agg(Mean_Insurance_Charges=("charges", "mean"))
    .reset_index()
)
age_smok_df.head()
"""
We take all of those who are non-smokers (smoker = no) for all age groups and then evaluate their
average insurance charge using aggregates.
"""
# Mean charge per age over non-smokers only, across every age in the data.
age_no_smok_df = (
    df[df["smoker"].eq("no")]
    .loc[:, ["age", "charges"]]
    .groupby("age")
    .agg(Mean_Insurance_Charges=("charges", "mean"))
    .reset_index()
)
age_no_smok_df.head()
"""
Bar chart of the aggregation of the premium charges
with respect to the age of the individuals and their smoking
status and we take individuals of all ages in the dataset.
"""
# Grouped bar chart of average charges by age (all ages) for smokers vs
# non-smokers. The smokers' bars are shifted half a bar-width left and the
# non-smokers' bars half a bar-width right so each age shows a side-by-side pair.
fig, ax = plt.subplots(figsize=(50, 30))
bar_width = 0.35
ax.bar(
    age_smok_df["age"] - bar_width / 2,
    age_smok_df["Mean_Insurance_Charges"],
    bar_width,
    color="blue",
    label="Smokers",
)
ax.bar(
    age_no_smok_df["age"] + bar_width / 2,
    age_no_smok_df["Mean_Insurance_Charges"],
    bar_width,
    color="orange",
    label="Non-smokers",
)
ax.set_title("Smoking status and charges for all ages", fontsize=40)
ax.set_xlabel("Age", fontsize=40)
ax.set_ylabel("Charges", fontsize=40)
# Without a legend the reader cannot tell which colour is which group.
ax.legend(fontsize=40)
# plt.rc updates matplotlib's global rcParams, not only this figure.
plt.rc('xtick', labelsize=40)
plt.rc('ytick', labelsize=40)
fig.show()
"""
Scatter Plot of the aggregation of the premium charges
with respect to the age of the individuals and their smoking
status and we take individuals of all ages in the dataset.
"""
# Scatter plot of every individual's charges against age, coloured by smoking status.
fig, ax = plt.subplots(figsize=(12, 8))
# `s` sets the marker size. The original passed `sizes=17`, which seaborn only
# consults when a `size=` variable is mapped, so it had no effect here.
# `ax=ax` makes the target axes explicit instead of relying on the current axes.
sns.scatterplot(data=df, x="age", y="charges", hue="smoker", s=17, ax=ax)
ax.set_title("Smoking status and charges for different age groups", fontsize=15)
ax.set_xlabel("Age", fontsize=15)
ax.set_ylabel("Charges", fontsize=15)
# plt.rc updates matplotlib's global rcParams, not only this figure.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
"""
We take all of those who are smokers (smoker = yes) for
each of the four regions and evaluate their average
insurance premiums based on the region.
"""
# Mean charge per region over smokers only; the aggregate is named
# "reg_smok_avg_charge" directly and "region" is restored as a column.
region_smok_df = (
    df[df["smoker"].eq("yes")]
    .loc[:, ["region", "charges"]]
    .groupby("region")
    .agg(reg_smok_avg_charge=("charges", "mean"))
    .reset_index()
)
region_smok_df
"""
We take all of those who are non-smokers (smoker = no) for
each of the four regions and evaluate their average
insurance premiums based on the region.
"""
# Mean charge per region over non-smokers only; the aggregate is named
# "reg_no_smok_avg_charge" directly and "region" is restored as a column.
region_no_smok_df = (
    df[df["smoker"].eq("no")]
    .loc[:, ["region", "charges"]]
    .groupby("region")
    .agg(reg_no_smok_avg_charge=("charges", "mean"))
    .reset_index()
)
region_no_smok_df
# Join the smoker and non-smoker regional averages on region and round to
# two decimals.
reg_merg_df = pd.merge(region_smok_df, region_no_smok_df, on="region").round(2)
reg_merg_df
# Bar chart of the average charges of smokers in each region.
fig, ax = plt.subplots(figsize=(12, 8))
# `ax=ax` makes the target axes explicit instead of relying on the current axes.
sns.barplot(x="region", y="reg_smok_avg_charge", data=reg_merg_df, ax=ax)
# The original title was copied from the age scatter plot ("... for different
# age groups"); this chart is grouped by region and shows smokers only.
ax.set_title("Average charges for smokers by region", fontsize=15)
ax.set_xlabel("Region", fontsize=15)
ax.set_ylabel("Charges", fontsize=15)
# plt.rc updates matplotlib's global rcParams, not only this figure.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
# Bar chart of the average charges of non-smokers in each region.
fig, ax = plt.subplots(figsize=(12, 8))
# `ax=ax` makes the target axes explicit instead of relying on the current axes.
sns.barplot(x="region", y="reg_no_smok_avg_charge", data=reg_merg_df, ax=ax)
# The original title was copied from the age scatter plot ("... for different
# age groups"); this chart is grouped by region and shows non-smokers only.
ax.set_title("Average charges for non-smokers by region", fontsize=15)
ax.set_xlabel("Region", fontsize=15)
ax.set_ylabel("Charges", fontsize=15)
# plt.rc updates matplotlib's global rcParams, not only this figure.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)
# Line chart comparing the average charges of smokers and non-smokers in each region.
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(
    reg_merg_df["region"],
    reg_merg_df["reg_smok_avg_charge"],
    linewidth=5,
    color="blue",
    label="Smokers",
)
ax.plot(
    reg_merg_df["region"],
    reg_merg_df["reg_no_smok_avg_charge"],
    linewidth=5,
    color="red",
    label="Non-smokers",
)
ax.set_title("Smoking status and charges for different regions", fontsize=20)
ax.set_xlabel("Region", fontsize=20)
ax.set_ylabel("Charges", fontsize=20)
# Without a legend the reader cannot tell which line is which group.
ax.legend(fontsize=20)
# plt.rc updates matplotlib's global rcParams, not only this figure.
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)
fig.show()
# Average insurance charge per gender over smokers only; the aggregate is
# named "gender_smok_avg_charge" directly and "gender" is restored as a column.
gender_smok_df = (
    df[df["smoker"].eq("yes")]
    .loc[:, ["gender", "charges"]]
    .groupby("gender")
    .agg(gender_smok_avg_charge=("charges", "mean"))
    .reset_index()
)
gender_smok_df
# Average insurance charge per gender over non-smokers only; the aggregate is
# named "gender_no_smok_avg_charge" directly.
gender_no_smok_df = (
    df[df["smoker"].eq("no")]
    .loc[:, ["gender", "charges"]]
    .groupby("gender")
    .agg(gender_no_smok_avg_charge=("charges", "mean"))
    .reset_index()
)
gender_no_smok_df
"""
We are merging the data for both of the dataframes with the average
charges per gender for smokers and non-smokers.
"""
# Join on gender, then round to two decimals.
gender_merg_df = pd.merge(gender_smok_df, gender_no_smok_df, on="gender").round(2)
gender_merg_df
# Bar chart: average insurance charge of smokers, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=gender_merg_df, x="gender", y="gender_smok_avg_charge", ax=ax)
ax.set_title("Charges for smokers based on gender", fontsize=15)
ax.set_xlabel("Gender", fontsize=15)
ax.set_ylabel("Charges", fontsize=15)
# Global matplotlib defaults for tick-label size.
plt.rcParams["xtick.labelsize"] = 15
plt.rcParams["ytick.labelsize"] = 15
# Bar chart: average insurance charge of non-smokers, split by gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=gender_merg_df, x="gender", y="gender_no_smok_avg_charge", ax=ax)
ax.set_title("Charges for non-smokers based on gender", fontsize=15)
ax.set_xlabel("Gender", fontsize=15)
ax.set_ylabel("Charges", fontsize=15)
# Global matplotlib defaults for tick-label size.
plt.rcParams["xtick.labelsize"] = 15
plt.rcParams["ytick.labelsize"] = 15
# Scatter plot of every individual's charges against age, coloured by gender.
fig, ax = plt.subplots(figsize=(12, 8))
# `s` sets the marker size. The original passed `sizes=17`, which seaborn only
# consults when a `size=` variable is mapped, so it had no effect here.
# `ax=ax` makes the target axes explicit instead of relying on the current axes.
sns.scatterplot(data=df, x="age", y="charges", hue="gender", s=17, ax=ax)
ax.set_title("Age and charges for different genders", fontsize=15)
ax.set_xlabel("Age", fontsize=15)
ax.set_ylabel("Charges", fontsize=15)
# plt.rc updates matplotlib's global rcParams, not only this figure.
plt.rc('xtick', labelsize=15)
plt.rc('ytick', labelsize=15)