from modelhub import ModelHub
from bach import display_sql_as_markdown
# Set up the model hub, using '%Y-%m-%d' as the time aggregation format.
modelhub = ModelHub(time_aggregation='%Y-%m-%d')

# Pull the 'id' of the RootLocationContext out of each location stack.
df['root'] = df.location_stack.ls.get_from_context_with_type_series(
    type='RootLocationContext',
    key='id',
)
# The root values end up as column names once the series is unstacked.
# BigQuery does not allow dashes in column names, so swap them for underscores.
df['root'] = df['root'].str.replace('-', '_')

# Restrict to press events, then count per user how often each root occurs.
press_events = df[df.event_type == 'PressEvent']
features = press_events.groupby('user_id').root.value_counts()

# Turn the counts into a DataFrame with one column per root location
# (number of clicks), filling missing combinations with 0.
features_unstacked = features.unstack(fill_value=0)

# Work on a 10% sample of the features.
# For BigQuery the table name should be
# 'YOUR_PROJECT.YOUR_WRITABLE_DATASET.YOUR_TABLE_NAME'.
features_set_sample = features_unstacked.get_sample(
    'test_lr_sample',
    sample_percentage=10,
    overwrite=True,
)

# The 'modeling' column is the target: True when it was clicked at least once.
# All remaining columns are the explanatory features.
y_column = 'modeling'
X = features_set_sample.drop(columns=[y_column])
y = features_set_sample[y_column] > 0
X.head()
about int64
blog int64
04afad3c-e525-41b8-b541-753e8a9117a1
0
0
0d94e986-3340-425a-a4b9-6cea1508fcc3
0
0
0eb6636c-9a85-46ff-9f40-62d77ea2683d
0
0
1151d90e-35f6-41da-bf70-31df71673c79
0
0
1673e2bf-ea79-4885-b1ba-2de34bec56f1
0
0
# Peek at the target labels.
y.head()

# Fit a logistic regression without an intercept term on the sampled data
# and score it on that same sample.
lr = modelhub.get_logistic_regression(fit_intercept=False)
lr.fit(X, y)
lr.score(X, y)

# Coefficients of the fitted model.
lr.coef_

# Attach the model output to the sample: first the result of predict_proba,
# then the result of predict.
predicted_values = lr.predict_proba(X)
features_set_sample['predicted_values'] = predicted_values
predicted_labels = lr.predict(X)
features_set_sample['predicted_labels'] = predicted_labels

# Show the first 20 rows of the sampled data set, predictions included.
features_set_sample.head(20)
about int64
0 - 5
blog int64
0 - 2
04afad3c-e525-41b8-b541-753e8a9117a1
0
0
0d94e986-3340-425a-a4b9-6cea1508fcc3
0
0
0eb6636c-9a85-46ff-9f40-62d77ea2683d
0
0
1151d90e-35f6-41da-bf70-31df71673c79
0
0
1673e2bf-ea79-4885-b1ba-2de34bec56f1
0
0
21e014a5-0609-4abe-b065-63dff30260c0
0
0
26baee41-9683-4b43-97bf-c70a9aa43078
0
0
29516f59-2734-47ad-8cc8-85b2476afb9e
0
0
2d15aa42-411c-481d-88e6-720efcd6483c
1
0
2de8e2f5-31d7-4ea5-9197-c4fd61bf9424
5
0
# Go from the sample back to the full data set.
# NOTE(review): presumably the columns added on the sample (including the
# predictions) carry over to the unsampled data — confirm against the Bach
# get_unsampled documentation.
features_set_full = features_set_sample.get_unsampled()
# Show the SQL behind the full data set, rendered as markdown
# (going by the helper's name — confirm against the Bach documentation).
display_sql_as_markdown(features_set_full)