import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Load the exam results from exams.csv into a DataFrame and preview its first rows.
df = pd.read_csv('exams.csv')
df.head()
genderobject
race/ethnicityobject
0
female
group B
1
female
group C
2
female
group B
3
male
group A
4
male
group C
# Summary statistics for all columns (include='all'), rounded to 2 decimals.
df.describe(include='all').round(2)
genderobject
10009.1%
3 others27.3%
Missing63.6%
race/ethnicityobject
10009.1%
3 others27.3%
Missing63.6%
count
1000
1000
unique
2
5
top
female
group C
freq
518
319
mean
nan
nan
std
nan
nan
min
nan
nan
25%
nan
nan
50%
nan
nan
75%
nan
nan
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 1000 non-null object
1 race/ethnicity 1000 non-null object
2 parent_education_level 1000 non-null object
3 lunch 1000 non-null object
4 test_prep_course 1000 non-null object
5 math 1000 non-null int64
6 reading 1000 non-null int64
7 writing 1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
# Count the missing values in each column.
df.isnull().sum()
# Mean reading score for each test_prep_course status, kept to 4 decimals
prep_groups = df.groupby('test_prep_course')
read_prep = prep_groups.agg(avg_reading=('reading', 'mean')).round(4)
# Relative change of each row's average versus the row above it
# (the first row has no predecessor, so its value is NaN)
read_prep['pct_change'] = read_prep['avg_reading'].pct_change().round(4)
read_prep
avg_readingfloat64
pct_changefloat64
completed
73.8939
nan
none
66.5343
-0.0996
# Visualize reading scores by test_prep_course status as box plots:
# reading on the x-axis, one box (and colour) per status on the y-axis.
# NOTE(review): the title says "Average", but a box plot shows the
# distribution (median/quartiles) rather than the mean.
fig = px.box(df, x='reading', y='test_prep_course', color='test_prep_course')
# Titles and legend visibility, gathered in one place
box_layout = dict(
    showlegend=False,
    title='Average reading scores by test prep course completion',
    xaxis_title='<b>reading</b> scores',
    yaxis_title='test preparation course',
)
fig.update_layout(**box_layout)
# Render the figure
fig.show()
# Mean math, reading and writing score for each parent_education_level,
# ordered from the highest to the lowest writing average, to 2 decimals
parent_avg = (
    df.groupby('parent_education_level')[['math', 'reading', 'writing']]
    .mean()
    .sort_values('writing', ascending=False)
    .round(2)
)
# Name the column axis so the columns can be referred to as 'subject'
parent_avg.columns.name = 'subject'
parent_avg
mathfloat64
readingfloat64
master's degree
69.75
75.37
bachelor's degree
69.39
73.0
associate's degree
67.88
70.93
some college
67.13
69.46
some high school
63.5
66.94
high school
62.14
64.7
def _strip_facet_prefix(annotation):
    """Keep only the text after the last '=' in a facet annotation."""
    annotation.update(text=annotation.text.split("=")[-1])


# One line-chart panel per subject, with a marker at every education level
fig = px.line(parent_avg, facet_col='subject', markers=True)
# Facet titles read 'subject=<name>'; reduce them to '<name>'
fig.for_each_annotation(_strip_facet_prefix)
# Axis titles, figure title and legend visibility
fig.update_xaxes(title_text='parent education level')
fig.update_layout(
    title='Average subject scores by parent education level',
    yaxis_title='<b>average</b> score',
    showlegend=False,
)
# Render the figure
fig.show()
# Mean subject scores for every (parent_education_level, test_prep_course) pair
subject_means = df.groupby(
    ['parent_education_level', 'test_prep_course']
)[['math', 'reading', 'writing']].mean()
# Move test_prep_course into the columns: one (subject, status) column each
parent_edu = subject_means.unstack()
# Order the education levels by the writing average of students who
# completed the course, highest first, and keep 2 decimals
parent_edu = parent_edu.sort_values(('writing', 'completed'), ascending=False)
parent_edu = parent_edu.round(2)
parent_edu
math completedfloat64
math nonefloat64
master's degree
70.6
69.31
bachelor's degree
73.28
66.9
associate's degree
71.83
65.57
some college
71.45
64.89
some high school
66.7
61.08
high school
65.0
60.99
# For each subject, report which parent education levels show the smallest
# and the largest score gap between the two test_prep_course statuses.
subject_list = ['math', 'reading', 'writing']
for subject in subject_list:
    # pct_change(axis=1) fills the 'none' column with
    # (none - completed) / completed; multiplying by -100 turns that into a
    # positive percentage expressed relative to the 'completed' average.
    change = (parent_edu[subject].pct_change(axis=1) * -100).round(2)
    no_course = change['none']
    print('{}: {} has lowest improvement {}%'.format(
        subject, no_course.idxmin(), no_course.min()))
    print('{}: {} has highest improvement {}%\n'.format(
        subject, no_course.idxmax(), no_course.max()))
math: master's degree has lowest improvement 1.83%
math: some college has highest improvement 9.18%
reading: master's degree has lowest improvement 5.56%
reading: some college has highest improvement 13.03%
writing: master's degree has lowest improvement 8.35%
writing: some college has highest improvement 15.22%
# Two side-by-side panels sharing one y-axis: left for students who completed
# the test prep course, right for those who did not
fig = make_subplots(
    rows=1,
    cols=2,
    horizontal_spacing=0.01,
    shared_yaxes=True,
    subplot_titles=('course completed', 'not completed'),
)
# (test_prep_course value, legend group, subplot column, extra trace options);
# only the 'completed' traces get the thicker line
panels = (
    ('completed', 'a', 1, {'line': dict(width=3)}),
    ('none', 'b', 2, {}),
)
# Add one trace per subject to each panel, left panel first
for status, legend_group, col_idx, trace_opts in panels:
    for subject in ('math', 'reading', 'writing'):
        trace = go.Scatter(
            x=parent_edu.index,
            y=parent_edu[subject, status],
            name=subject,
            legendgroup=legend_group,
            **trace_opts,
        )
        fig.add_trace(trace, row=1, col=col_idx)
# Legend spacing, y-axis title and figure title
fig.update_layout(
    legend_tracegroupgap=50,
    yaxis_title='<b>average</b> score',
    title_text='Average scores by category and test prep',
)
# Same x-axis title on both panels
fig.update_xaxes(title_text='parent education level')
# Render the figure
fig.show()
# Heatmap of the pairwise correlations between the numeric columns.
# The frame also holds object (string) columns, and DataFrame.corr() raises a
# ValueError on those in pandas >= 2.0, where numeric_only defaults to False.
# Selecting the numeric columns first works on old and new pandas alike.
fig = px.imshow(df.select_dtypes(include='number').corr())
# Render the figure
fig.show()