# Visualizing data with seaborn and plotly

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from scipy import stats

# Apply seaborn's "whitegrid" style to the seaborn figures drawn below.
sns.set_style("whitegrid")

# Load the raw admissions file for a first look at its contents.
# NOTE: the bare expressions below only display a result in an interactive
# session (notebook / REPL); run as a plain script they are evaluated and
# discarded.
df = pd.read_csv("data/Admission_Predict_Ver1.1.csv")

# First five rows, column labels, and (rows, columns) dimensions.
df.head(5)

df.columns

df.shape

# Number of missing values per column, then the dtype of each column.
df.isna().sum()

df.dtypes

# Column labels once more before renaming: the raw labels "LOR " and
# "Chance of Admit " carry a trailing space.
df.columns

def read_data(path="data/Admission_Predict_Ver1.1.csv"):
    """Load the admissions data set from a CSV source.

    Args:
        path: Anything ``pd.read_csv`` accepts as its first argument (a file
            path or a file-like object). Defaults to the data file used by
            this analysis, so a bare ``read_data()`` call keeps working.

    Returns:
        A ``pandas.DataFrame`` with the parsed contents; no cleaning is
        applied here.
    """
    return pd.read_csv(path)

def normalize_column_names(temp_df):
    """Return a new frame with the two space-padded column labels fixed.

    ``"LOR "`` is renamed to ``"LOR"`` and ``"Chance of Admit "`` to
    ``"Chance of Admit"``; all other labels are left as they are.
    """
    trimmed_labels = {
        "LOR ": "LOR",
        "Chance of Admit ": "Chance of Admit",
    }
    return temp_df.rename(columns=trimmed_labels)

def drop_noisy_columns(temp_df):
    """Return a new frame without the ``"Serial No."`` column."""
    noisy_labels = ["Serial No."]
    return temp_df.drop(labels=noisy_labels, axis="columns")

def normalize_dtypes(temp_df):
    """Return a new frame with ``Research`` as bool and ``University Rating`` as str."""
    target_types = {"Research": bool, "University Rating": str}
    return temp_df.astype(target_types)

def sort_uni_ranking(temp_df):
    """Return the rows of ``temp_df`` in ascending ``University Rating`` order."""
    rating_column = "University Rating"
    return temp_df.sort_values(rating_column)

# Rebuild ``df`` from the raw file, applying each cleaning step in order:
# fix padded labels, drop the serial number, cast dtypes, sort by rating.
cleaned = read_data()
for cleaning_step in (
    normalize_column_names,
    drop_noisy_columns,
    normalize_dtypes,
    sort_uni_ranking,
):
    cleaned = cleaning_step(cleaned)
df = cleaned

# Re-inspect the cleaned frame: labels, dimensions, dtypes, summary stats.
df.columns

df.shape

df.dtypes

df.describe()

# --- GRE Score: summary statistics, mode, skewness, and distribution ---
gre_scores = df["GRE Score"]

gre_scores.describe()

print(gre_scores.mode())

print(stats.skew(gre_scores))

# Plotly histogram (20 bins) with a box plot in the margin.
px.histogram(data_frame=df, x="GRE Score", marginal="box", nbins=20)

# Seaborn histogram with a KDE curve overlaid.
sns.displot(data=df, x="GRE Score", kde=True, kind="hist")

# --- TOEFL Score: summary statistics, mode, skewness, and distribution ---
df["TOEFL Score"].describe()

print(df["TOEFL Score"].mode())

print(stats.skew(df["TOEFL Score"]))

# Plotly histogram (15 bins) with a box plot in the margin.
px.histogram(df, x="TOEFL Score", marginal="box", nbins=15)

# Seaborn histogram with a KDE curve. This previously plotted "GRE Score"
# (copy-paste slip from the GRE section); it belongs to the TOEFL analysis.
sns.displot(df, x="TOEFL Score", kind="hist", kde=True)

# Four most frequent TOEFL scores. value_counts() is indexed by the score
# values themselves (integers), so take the first four rows positionally
# with head() rather than slicing with [:4].
df["TOEFL Score"].value_counts().head(4)

# --- University Rating: frequency of each rating ---
df["University Rating"].value_counts()

sns.catplot(x="University Rating", kind="count", data=df)

# One row per rating with the number of applicants in a "counts" column.
rating_groups = df.groupby(by="University Rating", as_index=False)
temp_df = rating_groups.agg(counts=("University Rating", "count"))

# Keep the rating as text so plotly colours it as a category.
temp_df["University Rating"] = temp_df["University Rating"].astype(str)

px.bar(
    data_frame=temp_df,
    x="University Rating",
    y="counts",
    color="University Rating",
    color_discrete_sequence=px.colors.qualitative.D3,
)

# --- SOP: frequency of each SOP value ---
df["SOP"].value_counts()

# One row per SOP value with the number of applicants in "counts".
sop_groups = df.groupby(by="SOP", as_index=False)
temp_df = sop_groups.agg(counts=("SOP", "count"))

# Cast to text so plotly treats the SOP value as a category when colouring.
temp_df["SOP"] = temp_df["SOP"].astype(str)

px.bar(
    data_frame=temp_df,
    x="SOP",
    y="counts",
    color="SOP",
    color_discrete_sequence=px.colors.qualitative.Prism,
)

# --- LOR: frequency of each LOR value ---
df["LOR"].value_counts()

# One row per LOR value with the number of applicants in "counts".
lor_groups = df.groupby(by="LOR", as_index=False)
temp_df = lor_groups.agg(counts=("LOR", "count"))

# Cast to text so plotly treats the LOR value as a category when colouring.
temp_df["LOR"] = temp_df["LOR"].astype(str)

px.bar(
    data_frame=temp_df,
    x="LOR",
    y="counts",
    color="LOR",
    color_discrete_sequence=px.colors.qualitative.Prism,
)

# --- CGPA: summary statistics, skewness, and distribution ---
cgpa_values = df["CGPA"]

cgpa_values.describe()

print(stats.skew(cgpa_values))

# Plotly histogram (12 bins) with a box plot in the margin.
px.histogram(df, x="CGPA", nbins=12, marginal="box")

# --- Research: how many applicants fall in each Research group ---
df["Research"].value_counts()

sns.catplot(data=df, x="Research", kind="count")

# One row per Research value with the number of applicants in "counts".
temp_df = df.groupby(by="Research", as_index=False).agg(
    counts=pd.NamedAgg(column="Research", aggfunc="count")
)

# "Research" is cast to bool by normalize_dtypes, so plotly colours it as a
# category. A qualitative palette therefore has to be passed as
# color_discrete_sequence; as color_continuous_scale it has no effect on a
# categorical colour column.
px.bar(
    data_frame=temp_df,
    x="Research",
    y="counts",
    color="Research",
    color_discrete_sequence=px.colors.qualitative.D3,
)

# --- Chance of Admit: summary statistics, skewness, and distribution ---
admit_chance = df["Chance of Admit"]

admit_chance.describe()

print(stats.skew(admit_chance))

sns.displot(x="Chance of Admit", data=df)

# Plotly histogram with a box plot in the margin.
px.histogram(df, x="Chance of Admit", marginal="box")

# Rows whose chance of admission is below 0.36.
low_chance_mask = admit_chance < 0.36
df[low_chance_mask]

# --- Relationships between the three score columns ---
numeric_cols = ["GRE Score", "TOEFL Score", "CGPA"]

# Pairwise correlation matrix of the score columns (DataFrame.corr defaults).
score_frame = df[numeric_cols]
corr = score_frame.corr()

# Heatmap of the correlation matrix, colour scale centred on 0.6.
px.imshow(
    corr,
    title="Correlation matrix",
    color_continuous_scale="PuBu",
    color_continuous_midpoint=0.6,
)

# Plotly scatter matrix of the same columns, shown explicitly.
fig = px.scatter_matrix(
    data_frame=df,
    dimensions=numeric_cols,
    title="Scatter matrix of student's TOEFL Score, GRE Score, and CGPA",
)
fig.show()

# Seaborn counterpart of the scatter matrix.
sns.pairplot(df, vars=numeric_cols)

# Scatter plot per pair of score columns, each with marginal histograms and
# a red OLS trendline; the title reports the pair's correlation.
trend_scatter_options = dict(
    marginal_x="histogram",
    marginal_y="histogram",
    trendline="ols",
    trendline_color_override="red",
)

# TOEFL Score vs. GRE Score
corr_value = df["TOEFL Score"].corr(df["GRE Score"])
fig = px.scatter(
    data_frame=df,
    x="TOEFL Score",
    y="GRE Score",
    title=f"Correlation between TOEFL Score and GRE Score is: {corr_value:.2f}",
    **trend_scatter_options,
)
fig.show()

# TOEFL Score vs. CGPA
corr_value = df["TOEFL Score"].corr(df["CGPA"])
fig = px.scatter(
    data_frame=df,
    x="TOEFL Score",
    y="CGPA",
    title=f"Correlation between TOEFL Score and CGPA is: {corr_value:.2f}",
    **trend_scatter_options,
)
fig.show()

# GRE Score vs. CGPA
corr_value = df["GRE Score"].corr(df["CGPA"])
fig = px.scatter(
    data_frame=df,
    x="GRE Score",
    y="CGPA",
    title=f"Correlation between GRE Score and CGPA is: {corr_value:.2f}",
    **trend_scatter_options,
)
fig.show()

# 2-D density heatmap for each pair of score columns, shown one after another.
score_pairs = (
    ("TOEFL Score", "GRE Score"),
    ("TOEFL Score", "CGPA"),
    ("GRE Score", "CGPA"),
)
for x_column, y_column in score_pairs:
    fig = px.density_heatmap(
        data_frame=df,
        x=x_column,
        y=y_column,
        color_continuous_scale="PuBu",
    )
    fig.show()

# Score distributions split by Research, bars of the groups side by side.
px.histogram(df, x="TOEFL Score", barmode="group", color="Research")

px.histogram(df, x="GRE Score", barmode="group", color="Research")

px.histogram(df, x="CGPA", barmode="group", color="Research")

# Score distributions split by University Rating, bars of the groups side
# by side, coloured from the Blugrn palette.
px.histogram(
    df,
    x="TOEFL Score",
    color="University Rating",
    color_discrete_sequence=px.colors.sequential.Blugrn,
    barmode="group",
)

px.histogram(
    df,
    x="GRE Score",
    color="University Rating",
    barmode="group",
    color_discrete_sequence=px.colors.sequential.Blugrn,
)

px.histogram(
    df,
    x="CGPA",
    color="University Rating",
    barmode="group",
    color_discrete_sequence=px.colors.sequential.Blugrn,
)

# Scatter matrix of the score columns, points coloured by Research.
px.scatter_matrix(
    df,
    dimensions=numeric_cols,
    title="Scatter matrix for TOEFL Score, GRE Score, and CGPA conditioning on Research variable",
    color="Research",
)

# Density heatmaps for each pair of score columns, one facet column per
# University Rating value.
px.density_heatmap(
    df,
    x="TOEFL Score",
    y="GRE Score",
    facet_col="University Rating",
    title="TOEFL Score vs. GRE Score for different university ranking values",
    color_continuous_scale="PuBu",
)

px.density_heatmap(
    df,
    x="TOEFL Score",
    y="CGPA",
    facet_col="University Rating",
    title="TOEFL Score vs. CGPA for different university ranking values",
    color_continuous_scale="PuBu",
)

px.density_heatmap(
    df,
    x="GRE Score",
    y="CGPA",
    facet_col="University Rating",
    title="GRE Score vs. CGPA for different university ranking values",
    color_continuous_scale="PuBu",
)

# Scatter plots for each pair of score columns, one facet column per
# University Rating value; colour and marker symbol both encode Research,
# and an OLS trendline is requested.
px.scatter(
    df,
    x="TOEFL Score",
    y="GRE Score",
    facet_col="University Rating",
    color="Research",
    symbol="Research",
    trendline="ols",
)

px.scatter(
    df,
    x="TOEFL Score",
    y="CGPA",
    facet_col="University Rating",
    color="Research",
    symbol="Research",
    trendline="ols",
)

px.scatter(
    df,
    x="GRE Score",
    y="CGPA",
    facet_col="University Rating",
    color="Research",
    symbol="Research",
    trendline="ols",
)