import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Load the exam results. The statements that follow are exploratory checks:
# their return values are only shown when run interactively (e.g. a notebook).
df = pd.read_csv('exams.csv')

# Preview the first rows of the table.
df.head()

# Summary statistics for every column, rounded to two decimals.
df.describe(include='all').round(2)

# Print column dtypes and non-null counts.
df.info()

# Count missing values per column.
df.isna().sum()
# Mean reading score for each test_prep_course group, rounded to four decimals.
read_prep = (
    df.groupby('test_prep_course')['reading']
    .mean()
    .round(4)
    .to_frame(name='avg_reading')
)

# Fractional change of each group's mean relative to the row above it
# (the first row has no predecessor, so it is NaN).
read_prep['pct_change'] = read_prep['avg_reading'].pct_change().round(4)

read_prep
# Box plot of reading scores: scores on the x-axis, one box per
# test_prep_course value on the y-axis, coloured by the same column.
fig = px.box(
    data_frame=df,
    x='reading',
    y='test_prep_course',
    color='test_prep_course',
)

# Set the figure title and axis labels; the legend is hidden because the
# y-axis already names each group.
fig.update_layout(
    title='Average reading scores by test prep course completion',
    xaxis_title='<b>reading</b> scores',
    yaxis_title='test preparation course',
    showlegend=False,
)

# Render the figure.
fig.show()
# Mean math, reading and writing score per parent_education_level, ordered
# from the highest to the lowest mean writing score and rounded to two
# decimals. The column axis is named 'subject'.
parent_avg = (
    df.groupby('parent_education_level')[['math', 'reading', 'writing']]
    .mean()
    .sort_values(by='writing', ascending=False)
    .round(2)
    .rename_axis(columns='subject')
)

parent_avg
# Line chart of the mean scores in parent_avg, with one facet per value of
# the 'subject' column axis and a marker at every data point.
fig = px.line(data_frame=parent_avg, facet_col='subject', markers=True)


def _strip_facet_prefix(annotation):
    """Keep only the text after the last '=' in an annotation's label."""
    return annotation.update(text=annotation.text.split("=")[-1])


# Facet titles are rendered as "<name>=<value>"; show just the value.
fig.for_each_annotation(_strip_facet_prefix)

# Label every facet's x-axis, then set the figure title and y-axis label and
# hide the legend.
fig.update_xaxes(title_text='parent education level')
fig.update_layout(
    showlegend=False,
    yaxis_title='<b>average</b> score',
    title='Average subject scores by parent education level',
)

# Render the figure.
fig.show()
# Mean math, reading and writing score for every combination of
# parent_education_level and test_prep_course. unstack() moves
# test_prep_course into the columns, giving (subject, test_prep_course)
# column pairs. Rows are ordered by the mean writing score of the
# 'completed' group, highest first, and values are rounded to two decimals.
parent_edu = (
    df.groupby(['parent_education_level', 'test_prep_course'])[
        ['math', 'reading', 'writing']
    ]
    .mean()
    .unstack()
    .sort_values(by=('writing', 'completed'), ascending=False)
    .round(2)
)

parent_edu
# For each subject, report which parent education level shows the smallest
# and the largest score difference between the two test_prep_course groups.
subject_list = ['math', 'reading', 'writing']
for subject in subject_list:
    # Percent change across the test_prep_course columns, sign-flipped so a
    # higher 'completed' score reads as a positive number.
    # NOTE(review): assumes the columns are ordered 'completed', 'none', so
    # the 'none' column holds the change relative to 'completed' - confirm.
    pct_diff = (parent_edu[subject].pct_change(axis=1) * -100).round(2)
    none_diff = pct_diff['none']

    min_level, max_level = none_diff.idxmin(), none_diff.idxmax()
    min_improve, max_improve = none_diff.min(), none_diff.max()

    print('{}: {} has lowest improvement {}%'.format(
        subject, min_level, min_improve))
    print('{}: {} has highest improvement {}%\n'.format(
        subject, max_level, max_improve))
# Two side-by-side panels sharing one y-axis: the left panel shows the
# 'completed' group, the right panel the 'none' group.
fig = make_subplots(
    rows=1,
    cols=2,
    shared_yaxes=True,
    horizontal_spacing=0.01,
    subplot_titles=('course completed', 'not completed'),
)

# One entry per panel: (test_prep_course value, subplot column, extra trace
# styling). Only the 'completed' traces get an explicit line width; each
# panel has its own legend group.
panels = (
    ('completed', 1, dict(legendgroup='a', line=dict(width=3))),
    ('none', 2, dict(legendgroup='b')),
)

# Add one line per subject to each panel, in the order math, reading,
# writing, plotting the mean scores against parent education level.
for prep_status, panel_col, trace_style in panels:
    for subject in ('math', 'reading', 'writing'):
        fig.add_trace(
            go.Scatter(
                x=parent_edu.index,
                y=parent_edu[subject, prep_status],
                name=subject,
                **trace_style,
            ),
            row=1,
            col=panel_col,
        )

# Figure title, y-axis label and spacing between the two legend groups.
fig.update_layout(
    title_text='Average scores by category and test prep',
    yaxis_title='<b>average</b> score',
    legend_tracegroupgap=50,
)
# Same x-axis label on both panels.
fig.update_xaxes(title_text='parent education level')

# Render the figure.
fig.show()
# Heatmap of the pairwise correlations between the numeric columns of df.
# numeric_only=True is passed explicitly: from pandas 2.0 on, DataFrame.corr()
# no longer drops non-numeric columns by default and raises on string columns
# such as test_prep_course.
fig = px.imshow(df.corr(numeric_only=True))

# Render the figure.
fig.show()