import numpy as np
import pandas as pd
import math
import glob
import os
import altair as alt
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
def LearnPlatorm_theme(*args, **kwargs):
return {
"width": 500,
"height": 300,
"config": {
"style": {
"bar": {
"size": 15
},
"guide-title": {
"fontSize": 15
},
"guide-label": {
"fontSize": 15
},
},
"scale": {
"bandPaddingInner": 0.5,
"bandPaddingOuter": 0.5
},
"legend": {
"symbolSize": 15,
"titleFontSize": 15,
"labelFontSize": 9
},
"axis": {
"titleFontSize": 15,
"labelFontSize": 18
},
"title": {"fontSize": 20},
}
}
alt.themes.register('LearnPlatorm_theme', LearnPlatorm_theme)
alt.themes.enable('LearnPlatorm_theme');
products_data = pd.read_csv(
"./learnplatform-covid19-impact-on-digital-learning/products_info.csv"
)
products_data.head()
districts_data = pd.read_csv(
"./learnplatform-covid19-impact-on-digital-learning/districts_info.csv"
)
districts_data.head()
def engagement_data_extract(file_path):
dir_path = os.path.dirname(file_path)
ap = []
for root, dirs, files in os.walk(dir_path):
for file in files:
if file.endswith('.csv'):
# print (root+'/'+str(file))
df = pd.read_csv(root+'/'+str(file), index_col=None, header=0)
district_id = file.split(".")[0]
df["district_id"] = district_id
ap.append(df)
engagement = pd.concat(ap)
engagement = engagement.reset_index(drop=True)
return engagement
%%time
engagement_data = engagement_data_extract(
"./learnplatform-covid19-impact-on-digital-learning/engagement_data/"
)
engagement_data.head()
def plot_missing(df):
data = df.isnull().sum().to_frame().reset_index()\
.rename(columns={'index':'Columns',0:'Counts'})
return alt.Chart(data).mark_bar().encode(
x='Columns',
y='Counts',
tooltip='Counts'
)
NA = pd.DataFrame(data=[districts_data.isna().sum().tolist(), ["{:.2f}".format(i)+'%' \
for i in (districts_data.isna().sum()/districts_data.shape[0]*100).tolist()]],
columns=districts_data.columns, index=['NA Count', 'NA Percent']).transpose().\
sort_values(by='NA Count',ascending=False)
NA.style.background_gradient(cmap="vlag", subset=['NA Count'])
plot_eng = plot_missing(engagement_data).\
properties(title='Engagement', width=200, height=200)
plot_pro = plot_missing(products_data).\
properties(title='Products', width=200, height=200)
plot_dis = plot_missing(districts_data).\
properties(title='Districts', width=200, height=200)
alt.hconcat(plot_pro, plot_dis, plot_eng )
districts_data.dropna(inplace = True)
for i in ['pct_black/hispanic', 'pct_free/reduced','county_connections_ratio']:
districts_data[i] = districts_data[i].apply(lambda x: float(str(x).split(',')[0][1:])+0.1)
districts_data['pp_total_raw'] = districts_data['pp_total_raw'].apply(lambda x: int(x.split(',')[0][1:]) + 1000)
districts_data.drop('county_connections_ratio', axis = 1, inplace = True)
districts_data.head()
alt.Chart(districts_data.dropna()).mark_bar(
cornerRadiusTopLeft=3,
cornerRadiusTopRight=3).encode(
x=alt.X('state:O',sort=alt.EncodingSortField(field="state", op="count", order='descending')),
y=alt.Y('count(state):O'),
tooltip='count(state):O',
color=alt.Color('state:O',
sort=alt.EncodingSortField(field="state",op="count", order='descending'),
legend=None,scale=alt.Scale(scheme = 'blues'))
).properties(title='Number of Districts per states')
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Locale Distribution', size = 20, color = "black")
explode = ( 0.03, 0.03, 0.3, 0.03)
labels = list(districts_data.locale.value_counts().index)
sizes = districts_data["locale"].dropna().value_counts()
ax.pie(sizes,
explode = explode,
colors = sns.color_palette("Set2"),
startangle = 60,
labels = labels,
autopct = '%1.0f%%',
pctdistance = 0.9
)
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()
alt.Chart(products_data.dropna()).transform_aggregate(
count='count()',
groupby=['Provider/Company Name']
).transform_window(
rank='rank(count)',
sort=[alt.SortField('count', order='descending')]
).transform_filter(
alt.datum.rank < 10
).mark_bar(
cornerRadiusTopLeft=3,
cornerRadiusTopRight=3).encode(
y=alt.Y('Provider/Company Name:N', sort='-x'),
x='count:Q',
tooltip='count:Q',
color=alt.Color('Provider/Company Name:O',
sort=alt.EncodingSortField(field="Provider/Company Name",op="count", order='descending'),
legend=None,scale=alt.Scale(scheme = 'set2'))
).properties(title='Top Educational Product Providers')
products_data[products_data['Provider/Company Name']=='Google LLC']['Product Name'].value_counts().head(10).to_frame()
sector = products_data["Sector(s)"].value_counts().to_frame().reset_index()
sector.style.background_gradient(cmap="Set2", subset=['Sector(s)'])
c1,c2,c3 = 0, 0, 0
for s in products_data["Sector(s)"]:
if(not pd.isnull(s)):
s = s.split(";")
for i in range(len(s)):
sub = s[i].strip()
if(sub == 'PreK-12'): c1+=1
if(sub == 'Higher Ed'): c2+=1
if(sub == 'Corporate'): c3+=1
fig, ax = plt.subplots(figsize=(16, 8))
fig.suptitle('Educational Sector Distribution', size = 20, color = "black")
explode = ( 0.03, 0.03, 0.3)
labels = ['PreK-12','Higher Ed','Corporate']
sizes = [c1,c2,c3]
ax.pie(sizes,
explode = explode,
colors = sns.color_palette("Set2"),
startangle = 60,
labels = labels,
autopct = '%1.0f%%',
pctdistance = 0.9
)
ax.add_artist(plt.Circle((0,0),0.4,fc='white'))
plt.show()
PEF_cat_main = []
PEF_cat_sub = []
for p in products_data["Primary Essential Function"]:
if (not pd.isnull(p)):
cat1 = p.split("-",1)[0].strip()
PEF_cat_main.append(cat1)
cat2 = p.split("-",1)[1].strip()
PEF_cat_sub.append(cat2)
else:
PEF_cat_main.append(np.nan)
PEF_cat_sub.append(np.nan)
products_data["Essential_Function_main"] = PEF_cat_main
products_data["Essential_Function_sub"] = PEF_cat_sub
DE = (
products_data[["Essential_Function_main", "Essential_Function_sub"]]
.value_counts()
.rename_axis(["Essential_Function_main", "Essential_Function_sub"])
.reset_index(name="counts")
)
fig = px.sunburst(
DE,
path=["Essential_Function_main", "Essential_Function_sub"],
values="counts",
title="Sunburst Primary Essential Functions",
)
fig.show()
fig = px.histogram(
DE,
x = "counts",
y = "Essential_Function_sub",
title="Primary Essential Function Sub-Categories",
)
fig.update_traces(marker = dict(
color='mediumpurple'
)),
fig.show()
engagement_data['time'] = pd.to_datetime(engagement_data['time'])
engagement_data["district_id"] = engagement_data["district_id"].astype(str).astype(int)
complete_data = products_data.merge(
engagement_data,
left_on='LP ID',
right_on='lp_id'
).merge(
districts_data,
left_on='district_id',
right_on='district_id'
) # merging 3 datasets triggers memory warning
complete_data.head()
complete_data['Product Name'].value_counts().head(20).to_frame()
def mean_state_data(df,col):
return df[[col,'state']].\
groupby(by='state').\
mean().\
reset_index().\
sort_values(by= col, ascending = False).\
style.background_gradient(cmap="Set2_r", subset=[col])
mean_state_data(complete_data,'pct_black/hispanic')
mean_state_data(complete_data,'pct_free/reduced')
mean_state_data(complete_data,'pp_total_raw')
locale = complete_data[["locale","pct_access","time"]].\
groupby(by=["time","locale"])\
.mean()\
.dropna()\
.reset_index()
fig = px.line(locale, x="time", y="pct_access", facet_col='locale',facet_col_wrap=1,color = 'locale')
fig.update_layout(
title=("Educational product Access per Locale").title(),
)
fig.show()
df = complete_data[["state","pct_access","time"]].groupby(by=["time","state"]).mean()
df.dropna(inplace =True)
df.reset_index(inplace = True)
top_5 = df.groupby(by='state')\
.mean()\
.sort_values(by='pct_access',ascending=False)\
.head(5)\
.reset_index()['state']\
.to_list()
top_states = df[df.state.isin(top_5)]
fig = px.line(top_states, x="time", y="pct_access", facet_col='state',facet_col_wrap=1,color = 'state')
fig.update_layout(
title=("Educational product Access on Top 5 States").title(),
)
fig.show()
bottom_5 = df.groupby(by='state')\
.mean()\
.sort_values(by='pct_access',ascending=False)\
.tail(5)\
.reset_index()['state']\
.to_list()
top_states = df[df.state.isin(bottom_5)]
fig = px.line(top_states, x="time", y="pct_access", facet_col='state',facet_col_wrap=1,color = 'state')
fig.update_layout(
title=("Educational product Access on Bottom 5 States").title(),
)
fig.show()
geo_pct = complete_data[["state","pct_access"]]\
.groupby(by=["state"])\
.mean()\
.dropna()\
.reset_index()
geo = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv')
geo_pct = pd.merge(geo_pct,geo,on = 'state', how = 'left')[['state','pct_access','code']]
geo_pct.head()
fig = go.Figure(data=go.Choropleth(
locations=geo_pct['code'], # Spatial coordinates
z = geo_pct['pct_access'].astype(float), # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = 'plasma',
text = geo_pct['state'],
colorbar_title = "product access",
))
fig.update_layout(
title_text = 'Education Products access per state',
geo_scope='usa', # limite map scope to USA
)
fig.show()
Top_state = ['New York','Wisconson', 'Illinois','Indiana']
Race_state = complete_data[['state','pct_access','pct_black/hispanic']]\
.groupby('state')\
.mean()\
.dropna()\
.reset_index()\
.sort_values(by=['pct_black/hispanic','pct_access'])
Race_state
geo_engag = complete_data[["state","engagement_index"]]\
.groupby(by=["state"])\
.mean()\
.dropna()\
.reset_index()
geo_engag = pd.merge(geo_engag,geo,on = 'state', how = 'left')[['state','engagement_index','code']]
geo_engag.head()
fig = go.Figure(data=go.Choropleth(
locations=geo_engag['code'], # Spatial coordinates
z = geo_engag['engagement_index'].astype(float), # Data to be color-coded
locationmode = 'USA-states', # set of locations match entries in `locations`
colorscale = 'plasma',
text = geo_engag['state'],
colorbar_title = "Engagement Index",
))
fig.update_layout(
title_text = 'Education Products Engagement Index per state',
geo_scope='usa', # limite map scope to USA
)
fig.show()
sns.heatmap(complete_data.drop(['LP ID','lp_id','district_id'],axis=1).corr(), cmap="YlGnBu", annot=True);
Imp_neigh= complete_data[complete_data['pct_black/hispanic']<0.2][['time','pct_access','pct_black/hispanic']].groupby('time').mean().reset_index()
fig = px.line(Imp_neigh, x="time", y="pct_access")
fig.update_layout(
title=("Black/Hispanic community access to Online Education").title(),
)
fig.show()
Imp_neigh= complete_data[complete_data['pct_black/hispanic']>=0.2][['time','pct_access','pct_black/hispanic']].groupby('time').mean().reset_index()
fig = px.line(Imp_neigh, x="time", y="pct_access")
fig.update_layout(
title=("Black/Hispanic community access to Online Education").title(),
)
fig.show()