from datetime import datetime
from modelhub import ModelHub
from bach import display_sql_as_markdown
# Instantiate the model hub; the '%Y-%m-%d' format makes time-based
# aggregations default to daily buckets unless a model is given another format.
modelhub = ModelHub(time_aggregation='%Y-%m-%d')
# Derive convenience columns from the context series so they can be used for
# grouping and filtering later on:
# - 'application': taken from the global contexts via the gc accessor
df['application'] = df.global_contexts.gc.application
# - 'feature_nice_name': readable name for the location stack (ls accessor)
df['feature_nice_name'] = df.location_stack.ls.nice_name
# - 'root_location': the 'id' of the RootLocationContext in the location stack
df['root_location'] = df.location_stack.ls.get_from_context_with_type_series(type='RootLocationContext', key='id')
# Have a look at the data, most recent hits first.
df.sort_values(['session_id', 'session_hit_number'], ascending=False).head()
# Model hub: unique users per month — override the default daily time
# aggregation with a '%Y-%m' format via time_agg.
monthly_users = modelhub.aggregate.unique_users(df, groupby=modelhub.time_agg(df, '%Y-%m'))
monthly_users.sort_index(ascending=False).head()
# Model hub: unique users per day (uses the ModelHub default '%Y-%m-%d').
# NOTE: daily_users is reused further down to compute the daily conversion rate.
daily_users = modelhub.aggregate.unique_users(df)
daily_users.sort_index(ascending=False).head(10)
# Model hub: unique users per main product section, grouped by the
# 'application' and 'root_location' columns created earlier.
users_root = modelhub.aggregate.unique_users(df, groupby=['application', 'root_location'])
users_root.sort_index(ascending=False).head(10)
# Model hub: monthly retention matrix, expressed as percentages per cohort.
# NOTE(review): display=True presumably also renders the matrix visually —
# confirm against the modelhub retention_matrix documentation.
retention_matrix = modelhub.aggregate.retention_matrix(df, time_period='monthly', percentage=True, display=True)
retention_matrix.head()
# Calculate the first cohort: a user's cohort is defined by the earliest
# 'moment' recorded for that user.
cohorts = df[['user_id', 'moment']].groupby('user_id')['moment'].min().reset_index()
cohorts = cohorts.rename(columns={'moment': 'first_cohort'})
# Add the first cohort of the users back onto our DataFrame.
df_with_cohorts = df.merge(cohorts, on='user_id')
# Filter data where users belong to the #0 cohort (first seen in Feb 2022).
# NOTE(review): both bounds are strict (> and <), so a first_cohort falling
# exactly at midnight on a boundary date lands in neither cohort — confirm intended.
cohort0_filter = (df_with_cohorts['first_cohort'] > datetime(2022, 2, 1)) & (df_with_cohorts['first_cohort'] < datetime(2022, 3, 1))
df_with_cohorts[cohort0_filter]['event_type'].value_counts().head()
# Filter data where users belong to the #1 cohort (first seen in Mar 2022) —
# the problematic one.
cohort1_filter = (df_with_cohorts['first_cohort'] > datetime(2022, 3, 1)) & (df_with_cohorts['first_cohort'] < datetime(2022, 4, 1))
df_with_cohorts[cohort1_filter]['event_type'].value_counts().head()
# Model hub: session duration, monthly average.
duration_monthly = modelhub.aggregate.session_duration(df, groupby=modelhub.time_agg(df, '%Y-%m'))
duration_monthly.sort_index(ascending=False).head()
# Model hub: session duration, daily average (default time aggregation).
duration_daily = modelhub.aggregate.session_duration(df)
duration_daily.sort_index(ascending=False).head()
# Model hub: session duration, monthly average per application and root location.
duration_root_month = modelhub.aggregate.session_duration(df, groupby=['application', 'root_location', modelhub.time_agg(df, '%Y-%m')]).sort_index()
duration_root_month.head(10)
# How is this time spent distributed? Compute the duration of every single
# session (including bounces) and inspect the quartiles.
session_duration = modelhub.aggregate.session_duration(df, groupby='session_id', exclude_bounces=False)
# Materialization is needed because the expression of the created series contains
# aggregated data, and it is not allowed to aggregate that again directly.
session_duration.materialize().quantile(q=[0.25, 0.50, 0.75]).head()
# Model hub: top product features — by default this selects only user actions.
top_product_features = modelhub.aggregate.top_product_features(df)
top_product_features.head()
# Select only user actions: stack_event_types must contain 'InteractiveEvent'.
interactive_events = df[df.stack_event_types.json.array_contains('InteractiveEvent')]
# Unique users per interactive feature, per application and event type.
top_interactions = modelhub.agg.unique_users(interactive_events, groupby=['application','root_location','feature_nice_name', 'event_type'])
top_interactions = top_interactions.reset_index()
# Top interactions on the website's home page...
home_users = top_interactions[(top_interactions.application == 'objectiv-website') &
(top_interactions.root_location == 'home')]
home_users.sort_values('unique_users', ascending=False).head()
# ...and across the whole docs application.
docs_users = top_interactions[top_interactions.application == 'objectiv-docs']
docs_users.sort_values('unique_users', ascending=False).head()
# Work on a copy for the acquisition analysis, leaving df untouched.
df_acquisition = df.copy()
# Extract the referrer (from the HttpContext) and the five UTM fields (from the
# MarketingContext) out of the global contexts into their own columns.
df_acquisition['referrer'] = df_acquisition.global_contexts.gc.get_from_context_with_type_series(type='HttpContext', key='referrer')
df_acquisition['utm_source'] = df_acquisition.global_contexts.gc.get_from_context_with_type_series(type='MarketingContext', key='source')
df_acquisition['utm_medium'] = df_acquisition.global_contexts.gc.get_from_context_with_type_series(type='MarketingContext', key='medium')
df_acquisition['utm_campaign'] = df_acquisition.global_contexts.gc.get_from_context_with_type_series(type='MarketingContext', key='campaign')
df_acquisition['utm_content'] = df_acquisition.global_contexts.gc.get_from_context_with_type_series(type='MarketingContext', key='content')
df_acquisition['utm_term'] = df_acquisition.global_contexts.gc.get_from_context_with_type_series(type='MarketingContext', key='term')
# Unique users by referrer.
modelhub.agg.unique_users(df_acquisition, groupby='referrer').sort_values(ascending=False).head()
# Unique users by marketing campaign; rows without a utm_source are dropped,
# i.e. keep only traffic that was attributed to a campaign.
campaign_users = modelhub.agg.unique_users(df_acquisition, groupby=['utm_source', 'utm_medium', 'utm_campaign', 'utm_content', 'utm_term'])
campaign_users = campaign_users.reset_index().dropna(axis=0, how='any', subset='utm_source')
campaign_users.sort_values('utm_source', ascending=True).head()
# Unique users by feature, per campaign source & term — restricted to
# interactive events only, again dropping rows without a utm_source.
users_feature_campaign = modelhub.agg.unique_users(df_acquisition[
df_acquisition.stack_event_types.json.array_contains('InteractiveEvent')],
groupby=['utm_source',
'utm_term',
'feature_nice_name',
'event_type'])
users_feature_campaign = users_feature_campaign.reset_index().dropna(axis=0, how='any', subset='utm_source')
users_feature_campaign.sort_values(['utm_source', 'utm_term', 'unique_users'], ascending=[True, True, False]).head()
# Create a column holding the location stack of hits that lead to our GitHub:
# first match on a LinkContext with id 'objectiv-on-github', then also fill it
# where a LinkContext with id 'github' is present anywhere in the stack.
df_acquisition['github_press'] = df_acquisition.location_stack.json[{'id': 'objectiv-on-github', '_type': 'LinkContext'}:]
df_acquisition.loc[df_acquisition.location_stack.json[{'id': 'github', '_type': 'LinkContext'}:]!=[],'github_press'] = df_acquisition.location_stack
# Define which events to use as conversion events:
# PressEvents on the 'github_press' location stacks built above.
modelhub.add_conversion_event(location_stack=df_acquisition.github_press,
event_type='PressEvent',
name='github_press')
# Model hub: flag each hit as conversion/non-conversion, then count the
# unique converting users per day.
df_acquisition['is_conversion_event'] = modelhub.map.is_conversion_event(df_acquisition, 'github_press')
conversions = modelhub.aggregate.unique_users(df_acquisition[df_acquisition.is_conversion_event])
conversions.to_frame().sort_index(ascending=False).head(10)
# Daily conversion rate: converting users divided by all daily users
# (daily_users was computed earlier in this file).
conversion_rate = conversions / daily_users
conversion_rate.sort_index(ascending=False).head(10)
# Model hub: calculate conversions per marketing campaign based on the UTM data
# in the MarketingContext; rows without a utm_source are dropped.
campaign_conversions = modelhub.aggregate.unique_users(df_acquisition[df_acquisition.is_conversion_event],
groupby=['utm_source', 'utm_medium', 'utm_campaign'])
campaign_conversions.reset_index().dropna(axis=0, how='any', subset='utm_source').head()
# Where do users convert? Unique converting users per application, feature and event type.
conversion_locations = modelhub.agg.unique_users(df_acquisition[df_acquisition.is_conversion_event],
groupby=['application', 'feature_nice_name', 'event_type'])
# Calling .to_frame() for nicer formatting of the output.
conversion_locations.sort_values(ascending=False).to_frame().head()
# Which product features do users engage with most before converting?
top_features_before_conversion = modelhub.agg.top_product_features_before_conversion(df_acquisition, name='github_press')
top_features_before_conversion.head()
# Label sessions that contain at least one conversion.
df_acquisition['converted_users'] = modelhub.map.conversions_counter(df_acquisition, name='github_press') >= 1
# Label hits where, at that point in time, there are still 0 conversions in the session.
df_acquisition['zero_conversions_at_moment'] = modelhub.map.conversions_in_time(df_acquisition, 'github_press') == 0
# Combine both labels: hits from users who eventually convert, taken from
# before the moment they converted.
converted_users = df_acquisition[(df_acquisition.converted_users & df_acquisition.zero_conversions_at_moment)]
# How much time do users spend before they convert?
# NOTE(review): assuming groupby=None yields one overall average rather than the
# default time aggregation — confirm in the modelhub session_duration docs.
modelhub.aggregate.session_duration(converted_users, groupby=None).to_frame().head()
# Show the SQL generated for one of the models above — this works for anything
# you do with Objectiv Bach.
display_sql_as_markdown(conversions)