from modelhub import ModelHub
from bach import display_sql_as_markdown
# Instantiate the model hub. The time aggregation format '%Y-%m-%d'
# (year-month-day) is passed at construction time.
modelhub = ModelHub(time_aggregation='%Y-%m-%d')
# Build the feature set: per user, the number of press events per root location.
# NOTE(review): `df` is not defined in the lines visible here; it has to be
# created before this point.
# Extract the 'id' of the RootLocationContext from the location stack.
df['root'] = df.location_stack.ls.get_from_context_with_type_series(type='RootLocationContext', key='id')
# The root series is unstacked below, so its values end up as column names.
# Those values might contain dashes, which are not allowed in BigQuery column
# names, so replace them with underscores.
df['root'] = df['root'].str.replace('-', '_')
# Only look at press events and count the root locations per user.
features = df[(df.event_type=='PressEvent')].groupby('user_id').root.value_counts()
# Unstack the series to create a DataFrame with the number of clicks per root
# location as columns; fill_value=0 is passed for combinations without a count.
features_unstacked = features.unstack(fill_value=0)
# Take a 10 percent sample, stored under the name 'test_lr_sample'.
# For BigQuery the table name should be 'YOUR_PROJECT.YOUR_WRITABLE_DATASET.YOUR_TABLE_NAME'.
# NOTE(review): overwrite=True presumably replaces an existing table with the
# same name -- confirm against the Bach documentation.
features_set_sample = features_unstacked.get_sample('test_lr_sample', sample_percentage=10, overwrite=True)
# Define the target and the features from the sampled feature set.
# NOTE(review): assumes 'modeling' is one of the root location values that
# became a column after unstacking -- confirm against the data.
y_column = 'modeling'
# Target: whether the value in the target column is greater than zero.
y = features_set_sample[y_column] > 0
# Features: the sampled feature set with the target column dropped.
X = features_set_sample.drop(columns=[y_column])
# Preview the features and the target. The results are not assigned, so these
# lines are presumably meant for notebook display.
X.head()
y.head()
# Get a logistic regression model from the model hub, configured with
# fit_intercept=False (no intercept term).
lr = modelhub.get_logistic_regression(fit_intercept=False)
# Fit the model on the sampled features and target.
lr.fit(X, y)
# Score the model on the same data it was fitted on; the result is not
# assigned, so this is presumably meant for notebook display.
lr.score(X, y)
# Show the coefficients of the fitted model (result not assigned either).
lr.coef_
# Add the model output for the sampled rows as two new columns: the result of
# predict_proba() and the result of predict().
features_set_sample['predicted_values'] = lr.predict_proba(X)
features_set_sample['predicted_labels'] = lr.predict(X)
# Show the first 20 rows of the sampled data set, including predictions.
features_set_sample.head(20)
# NOTE(review): get_unsampled() presumably yields the same operations applied
# to the full, unsampled data -- confirm against the Bach documentation.
features_set_full = features_set_sample.get_unsampled()
# NOTE(review): presumably renders the SQL behind features_set_full as
# markdown -- confirm against the Bach documentation.
display_sql_as_markdown(features_set_full)