Digital Learning During COVID19 EDA

import numpy as np import pandas as pd import math import glob import os import altair as alt import seaborn as sns import matplotlib.pyplot as plt import plotly.express as px import plotly.graph_objects as go

def LearnPlatorm_theme(*args, **kwargs): return { "width": 500, "height": 300, "config": { "style": { "bar": { "size": 15 }, "guide-title": { "fontSize": 15 }, "guide-label": { "fontSize": 15 }, }, "scale": { "bandPaddingInner": 0.5, "bandPaddingOuter": 0.5 }, "legend": { "symbolSize": 15, "titleFontSize": 15, "labelFontSize": 9 }, "axis": { "titleFontSize": 15, "labelFontSize": 18 }, "title": {"fontSize": 20}, } } alt.themes.register('LearnPlatorm_theme', LearnPlatorm_theme) alt.themes.enable('LearnPlatorm_theme');

products_data = pd.read_csv( "./learnplatform-covid19-impact-on-digital-learning/products_info.csv" ) products_data.head()

districts_data = pd.read_csv( "./learnplatform-covid19-impact-on-digital-learning/districts_info.csv" ) districts_data.head()

def engagement_data_extract(file_path): dir_path = os.path.dirname(file_path) ap = [] for root, dirs, files in os.walk(dir_path): for file in files: if file.endswith('.csv'): # print (root+'/'+str(file)) df = pd.read_csv(root+'/'+str(file), index_col=None, header=0) district_id = file.split(".")[0] df["district_id"] = district_id ap.append(df) engagement = pd.concat(ap) engagement = engagement.reset_index(drop=True) return engagement

%%time engagement_data = engagement_data_extract( "./learnplatform-covid19-impact-on-digital-learning/engagement_data/" ) engagement_data.head()

def plot_missing(df): data = df.isnull().sum().to_frame().reset_index()\ .rename(columns={'index':'Columns',0:'Counts'}) return alt.Chart(data).mark_bar().encode( x='Columns', y='Counts', tooltip='Counts' )

NA = pd.DataFrame(data=[districts_data.isna().sum().tolist(), ["{:.2f}".format(i)+'%' \ for i in (districts_data.isna().sum()/districts_data.shape[0]*100).tolist()]], columns=districts_data.columns, index=['NA Count', 'NA Percent']).transpose().\ sort_values(by='NA Count',ascending=False) NA.style.background_gradient(cmap="vlag", subset=['NA Count'])

plot_eng = plot_missing(engagement_data).\ properties(title='Engagement', width=200, height=200) plot_pro = plot_missing(products_data).\ properties(title='Products', width=200, height=200) plot_dis = plot_missing(districts_data).\ properties(title='Districts', width=200, height=200) alt.hconcat(plot_pro, plot_dis, plot_eng )

districts_data.dropna(inplace = True) for i in ['pct_black/hispanic', 'pct_free/reduced','county_connections_ratio']: districts_data[i] = districts_data[i].apply(lambda x: float(str(x).split(',')[0][1:])+0.1) districts_data['pp_total_raw'] = districts_data['pp_total_raw'].apply(lambda x: int(x.split(',')[0][1:]) + 1000) districts_data.drop('county_connections_ratio', axis = 1, inplace = True) districts_data.head()

alt.Chart(districts_data.dropna()).mark_bar( cornerRadiusTopLeft=3, cornerRadiusTopRight=3).encode( x=alt.X('state:O',sort=alt.EncodingSortField(field="state", op="count", order='descending')), y=alt.Y('count(state):O'), tooltip='count(state):O', color=alt.Color('state:O', sort=alt.EncodingSortField(field="state",op="count", order='descending'), legend=None,scale=alt.Scale(scheme = 'blues')) ).properties(title='Number of Districts per states')

fig, ax = plt.subplots(figsize=(16, 8)) fig.suptitle('Locale Distribution', size = 20, color = "black") explode = ( 0.03, 0.03, 0.3, 0.03) labels = list(districts_data.locale.value_counts().index) sizes = districts_data["locale"].dropna().value_counts() ax.pie(sizes, explode = explode, colors = sns.color_palette("Set2"), startangle = 60, labels = labels, autopct = '%1.0f%%', pctdistance = 0.9 ) ax.add_artist(plt.Circle((0,0),0.4,fc='white')) plt.show()

alt.Chart(products_data.dropna()).transform_aggregate( count='count()', groupby=['Provider/Company Name'] ).transform_window( rank='rank(count)', sort=[alt.SortField('count', order='descending')] ).transform_filter( alt.datum.rank < 10 ).mark_bar( cornerRadiusTopLeft=3, cornerRadiusTopRight=3).encode( y=alt.Y('Provider/Company Name:N', sort='-x'), x='count:Q', tooltip='count:Q', color=alt.Color('Provider/Company Name:O', sort=alt.EncodingSortField(field="Provider/Company Name",op="count", order='descending'), legend=None,scale=alt.Scale(scheme = 'set2')) ).properties(title='Top Educational Product Providers')

products_data[products_data['Provider/Company Name']=='Google LLC']['Product Name'].value_counts().head(10).to_frame()

sector = products_data["Sector(s)"].value_counts().to_frame().reset_index() sector.style.background_gradient(cmap="Set2", subset=['Sector(s)'])

c1,c2,c3 = 0, 0, 0 for s in products_data["Sector(s)"]: if(not pd.isnull(s)): s = s.split(";") for i in range(len(s)): sub = s[i].strip() if(sub == 'PreK-12'): c1+=1 if(sub == 'Higher Ed'): c2+=1 if(sub == 'Corporate'): c3+=1 fig, ax = plt.subplots(figsize=(16, 8)) fig.suptitle('Educational Sector Distribution', size = 20, color = "black") explode = ( 0.03, 0.03, 0.3) labels = ['PreK-12','Higher Ed','Corporate'] sizes = [c1,c2,c3] ax.pie(sizes, explode = explode, colors = sns.color_palette("Set2"), startangle = 60, labels = labels, autopct = '%1.0f%%', pctdistance = 0.9 ) ax.add_artist(plt.Circle((0,0),0.4,fc='white')) plt.show()

PEF_cat_main = [] PEF_cat_sub = [] for p in products_data["Primary Essential Function"]: if (not pd.isnull(p)): cat1 = p.split("-",1)[0].strip() PEF_cat_main.append(cat1) cat2 = p.split("-",1)[1].strip() PEF_cat_sub.append(cat2) else: PEF_cat_main.append(np.nan) PEF_cat_sub.append(np.nan) products_data["Essential_Function_main"] = PEF_cat_main products_data["Essential_Function_sub"] = PEF_cat_sub DE = ( products_data[["Essential_Function_main", "Essential_Function_sub"]] .value_counts() .rename_axis(["Essential_Function_main", "Essential_Function_sub"]) .reset_index(name="counts") ) fig = px.sunburst( DE, path=["Essential_Function_main", "Essential_Function_sub"], values="counts", title="Sunburst Primary Essential Functions", ) fig.show()

fig = px.histogram( DE, x = "counts", y = "Essential_Function_sub", title="Primary Essential Function Sub-Categories", ) fig.update_traces(marker = dict( color='mediumpurple' )), fig.show()

engagement_data['time'] = pd.to_datetime(engagement_data['time']) engagement_data["district_id"] = engagement_data["district_id"].astype(str).astype(int)

complete_data = products_data.merge( engagement_data, left_on='LP ID', right_on='lp_id' ).merge( districts_data, left_on='district_id', right_on='district_id' ) # merging 3 datasets triggers memory warning

complete_data.head()

complete_data['Product Name'].value_counts().head(20).to_frame()

def mean_state_data(df,col): return df[[col,'state']].\ groupby(by='state').\ mean().\ reset_index().\ sort_values(by= col, ascending = False).\ style.background_gradient(cmap="Set2_r", subset=[col])

mean_state_data(complete_data,'pct_black/hispanic')

mean_state_data(complete_data,'pct_free/reduced')

mean_state_data(complete_data,'pp_total_raw')

locale = complete_data[["locale","pct_access","time"]].\ groupby(by=["time","locale"])\ .mean()\ .dropna()\ .reset_index() fig = px.line(locale, x="time", y="pct_access", facet_col='locale',facet_col_wrap=1,color = 'locale') fig.update_layout( title=("Educational product Access per Locale").title(), ) fig.show()

df = complete_data[["state","pct_access","time"]].groupby(by=["time","state"]).mean() df.dropna(inplace =True) df.reset_index(inplace = True) top_5 = df.groupby(by='state')\ .mean()\ .sort_values(by='pct_access',ascending=False)\ .head(5)\ .reset_index()['state']\ .to_list() top_states = df[df.state.isin(top_5)] fig = px.line(top_states, x="time", y="pct_access", facet_col='state',facet_col_wrap=1,color = 'state') fig.update_layout( title=("Educational product Access on Top 5 States").title(), ) fig.show()

bottom_5 = df.groupby(by='state')\ .mean()\ .sort_values(by='pct_access',ascending=False)\ .tail(5)\ .reset_index()['state']\ .to_list() top_states = df[df.state.isin(bottom_5)] fig = px.line(top_states, x="time", y="pct_access", facet_col='state',facet_col_wrap=1,color = 'state') fig.update_layout( title=("Educational product Access on Bottom 5 States").title(), ) fig.show()

geo_pct = complete_data[["state","pct_access"]]\ .groupby(by=["state"])\ .mean()\ .dropna()\ .reset_index() geo = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/2011_us_ag_exports.csv') geo_pct = pd.merge(geo_pct,geo,on = 'state', how = 'left')[['state','pct_access','code']] geo_pct.head()

fig = go.Figure(data=go.Choropleth( locations=geo_pct['code'], # Spatial coordinates z = geo_pct['pct_access'].astype(float), # Data to be color-coded locationmode = 'USA-states', # set of locations match entries in `locations` colorscale = 'plasma', text = geo_pct['state'], colorbar_title = "product access", )) fig.update_layout( title_text = 'Education Products access per state', geo_scope='usa', # limite map scope to USA ) fig.show()

Top_state = ['New York','Wisconson', 'Illinois','Indiana'] Race_state = complete_data[['state','pct_access','pct_black/hispanic']]\ .groupby('state')\ .mean()\ .dropna()\ .reset_index()\ .sort_values(by=['pct_black/hispanic','pct_access']) Race_state

geo_engag = complete_data[["state","engagement_index"]]\ .groupby(by=["state"])\ .mean()\ .dropna()\ .reset_index() geo_engag = pd.merge(geo_engag,geo,on = 'state', how = 'left')[['state','engagement_index','code']] geo_engag.head()

fig = go.Figure(data=go.Choropleth( locations=geo_engag['code'], # Spatial coordinates z = geo_engag['engagement_index'].astype(float), # Data to be color-coded locationmode = 'USA-states', # set of locations match entries in `locations` colorscale = 'plasma', text = geo_engag['state'], colorbar_title = "Engagement Index", )) fig.update_layout( title_text = 'Education Products Engagement Index per state', geo_scope='usa', # limite map scope to USA ) fig.show()

sns.heatmap(complete_data.drop(['LP ID','lp_id','district_id'],axis=1).corr(), cmap="YlGnBu", annot=True);

Imp_neigh= complete_data[complete_data['pct_black/hispanic']<0.2][['time','pct_access','pct_black/hispanic']].groupby('time').mean().reset_index() fig = px.line(Imp_neigh, x="time", y="pct_access") fig.update_layout( title=("Black/Hispanic community access to Online Education").title(), ) fig.show()

Imp_neigh= complete_data[complete_data['pct_black/hispanic']>=0.2][['time','pct_access','pct_black/hispanic']].groupby('time').mean().reset_index() fig = px.line(Imp_neigh, x="time", y="pct_access") fig.update_layout( title=("Black/Hispanic community access to Online Education").title(), ) fig.show()