import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from scipy import stats
# Use seaborn's "whitegrid" style for the seaborn figures drawn below.
sns.set_style(style="whitegrid")
# First look at the CSV exactly as stored on disk, before any cleaning.
# The bare expressions below have no side effects; they only produce
# visible output in an interactive session (REPL / notebook cell).
df = pd.read_csv("data/Admission_Predict_Ver1.1.csv")
df.head()  # first rows
df.columns  # column labels as parsed from the file header
df.shape  # (row count, column count)
df.isnull().sum()  # number of missing values per column
df.dtypes  # dtype pandas inferred for each column
df.columns  # NOTE(review): repeats the column lookup made above
def read_data(path="data/Admission_Predict_Ver1.1.csv") -> pd.DataFrame:
    """Load the admission dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to the file this analysis
            has always read, so a bare ``read_data()`` call is unchanged.

    Returns:
        A new ``DataFrame`` with the file contents exactly as parsed by
        ``pandas.read_csv``; no renaming, dropping, or dtype changes.
    """
    return pd.read_csv(path)
def normalize_column_names(temp_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *temp_df* with the trailing-space labels fixed.

    ``"LOR "`` is renamed to ``"LOR"`` and ``"Chance of Admit "`` to
    ``"Chance of Admit"``. All other columns keep their labels, and the
    input frame is not modified.

    Args:
        temp_df: Frame whose column labels should be cleaned.

    Returns:
        A new ``DataFrame`` with the two labels renamed.
    """
    return temp_df.rename(
        columns={"LOR ": "LOR", "Chance of Admit ": "Chance of Admit"}
    )
def drop_noisy_columns(temp_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *temp_df* without the ``"Serial No."`` column.

    Args:
        temp_df: Frame that contains a ``"Serial No."`` column.

    Returns:
        A new ``DataFrame`` with that column removed; the input is left
        untouched.

    Raises:
        KeyError: If ``"Serial No."`` is not present (pandas' default
            behaviour for ``DataFrame.drop``).
    """
    return temp_df.drop(columns=["Serial No."])
def normalize_dtypes(temp_df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *temp_df* with categorical-like columns recast.

    ``"Research"`` is cast to ``bool`` and ``"University Rating"`` to
    ``str``. Every other column keeps its dtype.

    Args:
        temp_df: Frame containing ``"Research"`` and ``"University Rating"``.

    Returns:
        A new ``DataFrame`` with the two columns converted.
    """
    return temp_df.astype({"Research": bool, "University Rating": str})
def sort_uni_ranking(temp_df: pd.DataFrame) -> pd.DataFrame:
    """Return *temp_df* sorted in ascending order of ``"University Rating"``.

    Args:
        temp_df: Frame containing a ``"University Rating"`` column.

    Returns:
        A new, sorted ``DataFrame``. Rows keep their original index labels
        and the input frame is not modified.
    """
    return temp_df.sort_values(by="University Rating")
# Build the cleaned analysis frame. Read the nesting from the inside out:
# load the CSV, fix the column labels, drop the serial-number column,
# recast the dtypes, and finally sort by university rating.
df = sort_uni_ranking(
    normalize_dtypes(
        drop_noisy_columns(
            normalize_column_names(read_data())
        )
    )
)
# Re-inspect the frame after cleaning. These bare expressions only produce
# visible output in an interactive session (REPL / notebook cell).
df.columns  # labels after renaming and dropping
df.shape  # (row count, column count)
df.dtypes  # dtypes after the casts
df.describe()  # summary statistics
# Univariate look at GRE Score: summary statistics, mode and skewness,
# followed by the distribution drawn with plotly and with seaborn.
gre_scores = df["GRE Score"]
gre_scores.describe()
print(gre_scores.mode())
print(stats.skew(gre_scores))
px.histogram(data_frame=df, x="GRE Score", nbins=20, marginal="box")
sns.displot(data=df, x="GRE Score", kind="hist", kde=True)
# Univariate look at TOEFL Score: summary statistics, mode and skewness,
# followed by the distribution drawn with plotly and with seaborn.
df["TOEFL Score"].describe()
print(df["TOEFL Score"].mode())
print(stats.skew(df["TOEFL Score"]))
px.histogram(df, x="TOEFL Score", marginal="box", nbins=15)
# The seaborn view must show the TOEFL column as well; plotting
# "GRE Score" here would only repeat the figure from the GRE section.
sns.displot(df, x="TOEFL Score", kind="hist", kde=True)
# First four entries of the frequency table (value_counts orders by count,
# most frequent first).
df["TOEFL Score"].value_counts()[:4]
# How many applicants fall under each University Rating: frequency table,
# seaborn count plot, then a plotly bar chart built from grouped counts.
df["University Rating"].value_counts()
sns.catplot(data=df, x="University Rating", kind="count")
rating_groups = df.groupby(by="University Rating", as_index=False)
# A plain (column, aggfunc) tuple is the shorthand spelling of pd.NamedAgg.
temp_df = rating_groups.agg(counts=("University Rating", "count"))
# Plotly gets the rating as text so it is drawn as discrete categories.
temp_df["University Rating"] = temp_df["University Rating"].astype(str)
px.bar(
    temp_df,
    x="University Rating",
    y="counts",
    color="University Rating",
    color_discrete_sequence=px.colors.qualitative.D3,
)
# How many applicants share each SOP value: frequency table, then a plotly
# bar chart built from grouped counts.
df["SOP"].value_counts()
sop_groups = df.groupby(by="SOP", as_index=False)
# A plain (column, aggfunc) tuple is the shorthand spelling of pd.NamedAgg.
temp_df = sop_groups.agg(counts=("SOP", "count"))
# Cast to text so plotly colours the bars as discrete categories.
temp_df["SOP"] = temp_df["SOP"].astype(str)
px.bar(
    temp_df,
    x="SOP",
    y="counts",
    color="SOP",
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# How many applicants share each LOR value: frequency table, then a plotly
# bar chart built from grouped counts.
df["LOR"].value_counts()
lor_groups = df.groupby(by="LOR", as_index=False)
# A plain (column, aggfunc) tuple is the shorthand spelling of pd.NamedAgg.
temp_df = lor_groups.agg(counts=("LOR", "count"))
# Cast to text so plotly colours the bars as discrete categories.
temp_df["LOR"] = temp_df["LOR"].astype(str)
px.bar(
    temp_df,
    x="LOR",
    y="counts",
    color="LOR",
    color_discrete_sequence=px.colors.qualitative.Prism,
)
# Univariate look at CGPA: summary statistics, skewness, and a histogram
# with a box-plot margin.
cgpa = df["CGPA"]
cgpa.describe()
print(stats.skew(cgpa))
px.histogram(df, x="CGPA", marginal="box", nbins=12)
# How many applicants have research experience: frequency table, seaborn
# count plot, then a plotly bar chart built from grouped counts.
df["Research"].value_counts()
sns.catplot(data=df, x="Research", kind="count")
temp_df = df.groupby(by="Research", as_index=False).agg(
    counts=pd.NamedAgg(column="Research", aggfunc="count")
)
px.bar(
    data_frame=temp_df,
    x="Research",
    y="counts",
    color="Research",
    # "Research" is cast to bool by normalize_dtypes, so plotly colours it as
    # a discrete category. A qualitative palette such as D3 therefore has to
    # be passed as the discrete sequence; as a continuous scale it would not
    # be applied to these bars.
    color_discrete_sequence=px.colors.qualitative.D3,
)
# Univariate look at the target, Chance of Admit: summary statistics,
# skewness, and its distribution in seaborn and plotly.
admit_chance = df["Chance of Admit"]
admit_chance.describe()
print(stats.skew(admit_chance))
sns.displot(data=df, x="Chance of Admit")
px.histogram(df, x="Chance of Admit", marginal="box")
# Rows whose admission chance lies below 0.36.
df[admit_chance < 0.36]
# Overview of how the three score columns relate to one another: a
# correlation heatmap, a plotly scatter matrix and a seaborn pair plot.
numeric_cols = ["GRE Score", "TOEFL Score", "CGPA"]
score_frame = df[numeric_cols]
corr = score_frame.corr()
px.imshow(
    img=corr,
    title="Correlation matrix",
    color_continuous_scale="PuBu",
    color_continuous_midpoint=0.6,
)
scatter_matrix_title = "Scatter matrix of student's TOEFL Score, GRE Score, and CGPA"
fig = px.scatter_matrix(
    data_frame=df,
    dimensions=numeric_cols,
    title=scatter_matrix_title,
)
fig.show()
sns.pairplot(data=df, vars=numeric_cols)
# One scatter plot per pair of score columns, each with marginal histograms,
# a red OLS trendline and the pair's correlation quoted in the title.
for x_col, y_col in (
    ("TOEFL Score", "GRE Score"),
    ("TOEFL Score", "CGPA"),
    ("GRE Score", "CGPA"),
):
    corr_value = df[x_col].corr(df[y_col])
    fig = px.scatter(
        data_frame=df,
        x=x_col,
        y=y_col,
        marginal_x="histogram",
        marginal_y="histogram",
        trendline="ols",
        trendline_color_override="red",
        title=f"Correlation between {x_col} and {y_col} is: {corr_value:.2f}",
    )
    fig.show()
# One 2-D density heatmap per pair of score columns, all on the same
# "PuBu" colour scale.
for x_col, y_col in (
    ("TOEFL Score", "GRE Score"),
    ("TOEFL Score", "CGPA"),
    ("GRE Score", "CGPA"),
):
    fig = px.density_heatmap(
        data_frame=df,
        x=x_col,
        y=y_col,
        color_continuous_scale="PuBu",
    )
    fig.show()
# Score distributions split by research experience, with the two groups
# drawn side by side. The shared options are collected once.
by_research = {"data_frame": df, "color": "Research", "barmode": "group"}
px.histogram(x="TOEFL Score", **by_research)
px.histogram(x="GRE Score", **by_research)
px.histogram(x="CGPA", **by_research)
# Score distributions split by University Rating, with the rating groups
# drawn side by side in the Blugrn palette. Shared options are collected once.
by_rating = {
    "data_frame": df,
    "color": "University Rating",
    "barmode": "group",
    "color_discrete_sequence": px.colors.sequential.Blugrn,
}
px.histogram(x="TOEFL Score", **by_rating)
px.histogram(x="GRE Score", **by_rating)
px.histogram(x="CGPA", **by_rating)
# Scatter matrix of the three score columns, coloured by research experience.
research_matrix_title = (
    "Scatter matrix for TOEFL Score, GRE Score, and CGPA conditioning on Research variable"
)
px.scatter_matrix(
    df,
    dimensions=numeric_cols,
    color="Research",
    title=research_matrix_title,
)
# Density heatmap for each pair of score columns, with one facet column per
# University Rating value. Shared options are collected once.
rating_facets = {
    "data_frame": df,
    "facet_col": "University Rating",
    "color_continuous_scale": "PuBu",
}
px.density_heatmap(
    x="TOEFL Score",
    y="GRE Score",
    title="TOEFL Score vs. GRE Score for different university ranking values",
    **rating_facets,
)
px.density_heatmap(
    x="TOEFL Score",
    y="CGPA",
    title="TOEFL Score vs. CGPA for different university ranking values",
    **rating_facets,
)
px.density_heatmap(
    x="GRE Score",
    y="CGPA",
    title="GRE Score vs. CGPA for different university ranking values",
    **rating_facets,
)
# Scatter plot for each pair of score columns, faceted by University Rating,
# with colour and marker symbol both keyed on research experience and an OLS
# trendline. Shared options are collected once.
research_by_rating = {
    "data_frame": df,
    "color": "Research",
    "symbol": "Research",
    "facet_col": "University Rating",
    "trendline": "ols",
}
px.scatter(x="TOEFL Score", y="GRE Score", **research_by_rating)
px.scatter(x="TOEFL Score", y="CGPA", **research_by_rating)
px.scatter(x="GRE Score", y="CGPA", **research_by_rating)