BigQueryでGA4/Firebaseのログを使って機械学習

使うデータセット

Firebaseのパブリックデータが以下にあるのでこれを使う。

firebase-public-project.analytics_153293282.events_*

0612~1003の114日分のデータ

基本集計(EDA)

イベントの数の種類別カウント

select
  event_name,
  count(0)
from `firebase-public-project.analytics_153293282.events_20181003`
group by 1
order by 2 desc;

スクリーンビューのスクリーン別カウント

select
  (select ep.value.string_value from unnest(event_params) ep where ep.key = 'firebase_screen_class'),
  count(0) screen_views
from `firebase-public-project.analytics_153293282.events_20181003`
where event_name = 'screen_view'
group by 1
order by 1;

ユーザ別イベント数カウント

行がユーザ、列がイベント名。
イベント名が動的なのでBigQuery Scriptingを使う。

declare event_pivot_columns string;
set event_pivot_columns = (
select string_agg(distinct 'sum(case when event_name = "' || event_name || '" then 1 else 0 end) ' || event_name) from `firebase-public-project.analytics_153293282.events_*`
);

execute immediate format("""
create or replace table `my-project.test.user_stats` as
select
  user_pseudo_id,
  min(timestamp_micros(event_timestamp)) first_open_timestamp,
  max(timestamp_micros(event_timestamp)) last_open_timestamp,
  count(0) total_events,
  max(user_ltv.revenue) total_revenue,
  %s
from `firebase-public-project.analytics_153293282.events_*`
group by 1
order by 4 desc;
""", event_pivot_columns)

対象データを抽出してフラット化

1行は1イベントで、生ログからイベントパラメータやユーザープロパティをフラット化(UNNEST())し、列にする。パラメータやプロパティは動的なのでBigQuery Scriptingを使う。

declare ep_columns string;
declare up_columns string;

set ep_columns = (
with t1 as (
  select
    p.key key,
    sum(case when p.value.string_value is not null then 1 else 0 end) cnt_string,
    sum(case when p.value.float_value is not null or p.value.double_value is not null then 1 else 0 end) cnt_float64,
    sum(case when p.value.int_value is not null then 1 else 0 end) cnt_int64
  from `firebase-public-project.analytics_153293282.events_*`, unnest(event_params) p
  group by 1
), t2 as (
  select
    key,
    case
      when cnt_string > 0 then 'string'
      when cnt_float64 > 0 and cnt_int64 > 0 then 'numeric'
      when cnt_float64 > 0 then 'float64'
      when cnt_int64 > 0 then 'int64'
      else 'string'
    end type
  from t1
)
select
string_agg(
  '(select case when p.key = "'
  || key
  || '" then case when p.value.string_value is not null then safe_cast(p.value.string_value as '
  || type 
  || ') when p.value.int_value is not null then safe_cast(p.value.int_value as '
  || type 
  || ') when p.value.float_value is not null then safe_cast(p.value.float_value as ' 
  || type 
  || ') when p.value.double_value is not null then safe_cast(p.value.double_value as ' 
  || type 
  || ') else null end else null end from unnest(event_params) p where p.key = "'
  || key
  || '") e_'
  || key
  order by key
)
from t2
);

set up_columns = (
with t1 as (
  select
    p.key key,
    sum(case when p.value.string_value is not null then 1 else 0 end) cnt_string,
    sum(case when p.value.float_value is not null or p.value.double_value is not null then 1 else 0 end) cnt_float64,
    sum(case when p.value.int_value is not null then 1 else 0 end) cnt_int64
  from `firebase-public-project.analytics_153293282.events_*`, unnest(user_properties) p
  group by 1
), t2 as(
  select
    key,
    case
      when cnt_string > 0 then 'string'
      when cnt_float64 > 0 and cnt_int64 > 0 then 'numeric'
      when cnt_float64 > 0 then 'float64'
      when cnt_int64 > 0 then 'int64'
      else 'string'
    end type
  from t1
)
select
string_agg(
  '(select case when p.key = "'
  || key
  || '" then case when p.value.string_value is not null then safe_cast(p.value.string_value as '
  || type 
  || ') when p.value.int_value is not null then safe_cast(p.value.int_value as '
  || type 
  || ') when p.value.float_value is not null then safe_cast(p.value.float_value as ' 
  || type 
  || ') when p.value.double_value is not null then safe_cast(p.value.double_value as ' 
  || type 
  || ') else null end else null end from unnest(user_properties) p where p.key = "'
  || key
  || '") u_'
  || key
  order by key
)
from t2
);

execute immediate format("""
create or replace table `my-project.test.log_filtered` partition by date(event_timestamp) as
with t1 as (
  select
    user_pseudo_id,
    min(timestamp_micros(event_timestamp)) over(partition by user_pseudo_id) first_open_timestamp,
    timestamp_micros(event_timestamp) event_timestamp,
    event_name,
    %s, %s,
    device.category device_category,
    device.operating_system,
    device.operating_system_version,
    case when device.browser is not null then device.browser else device.web_info.browser end browser,
    case when device.browser_version is not null then device.browser_version else device.web_info.browser_version end browser_version,
    platform,
    app_info.install_source
  from `firebase-public-project.analytics_153293282.events_*`
  where user_pseudo_id in (select distinct user_pseudo_id from `firebase-public-project.analytics_153293282.events_*` where event_name = 'first_open')
), t2 as (
  select *, timestamp_diff(event_timestamp, first_open_timestamp, second) seconds_since_first_open from t1
)
select * from t2
""", ep_columns, up_columns);

機械学習用データセットを作る

1か月後に継続しているか
→利用開始から29日目~35日目にスクリーンビューがあれば継続とみなして、利用開始から7日間のアクティビティに基づいて継続の有無を予測する

1行1ユーザで、利用開始から7日間のアクティビティを集計(イベント名やスクリーンクラス別カウント、エンゲージメント時間など)。特徴量化するイベントやスクリーンクラスは出現頻度1%以上に絞り込む。

-- screen_classごとのスクリーンビュー数
declare screen_class_columns string;
declare event_name_columns string;

create temp table screen_classes as
with t1 as (
  select
    e_firebase_screen_class, count(0) cnt
  from `my-project.test.log_filtered`
  where event_name = 'screen_view'
  group by 1
  order by 1
)
select *, cnt/sum(cnt) over() ratio from t1 order by 2 desc
;

-- 出現頻度が1%より大きいものをカウント対象にする
set screen_class_columns = (select
string_agg('sum(case when e_firebase_screen_class = "' || e_firebase_screen_class || '" and event_name = "screen_view" then 1 else 0 end) screen_views_' || e_firebase_screen_class)
from screen_classes where ratio > 0.01
);

-- event_nameごとのイベント数
create temp table event_names as
with t1 as (
  select
    event_name, count(0) cnt
  from `my-project.test.log_filtered`
  where event_name not in ('screen_view', 'user_engagement', 'first_open')
  group by 1
  order by 1
)
select *, cnt/sum(cnt) over() ratio from t1 order by 2 desc
;

-- 出現頻度が1%より大きいものをカウント対象にする
set event_name_columns = (select
string_agg('sum(case when event_name = "' || event_name || '" then 1 else 0 end) event_count_' || event_name)
from event_names where ratio > 0.01
);

execute immediate format("""
create or replace table `my-project.test.sum_by_user` as
with t1 as (
  select
    user_pseudo_id,
    min(extract(dayofweek from first_open_timestamp)) first_open_dow,
    min(platform) platform,
    min(install_source) install_source,
    sum(case when seconds_since_first_open <= 1*24*60*60 and event_name = 'screen_view' then 1 else 0 end) screen_views_first_1d,
    sum(case when seconds_since_first_open <= 1*24*60*60 and e_engagement_time_msec is not null then e_engagement_time_msec else 0 end)/1000 engagement_time_seconds_first_1d,
    sum(case when event_name = 'screen_view' then 1 else 0 end) screen_views,
    sum(case when e_engagement_time_msec is not null then e_engagement_time_msec else 0 end)/1000 engagement_time_seconds,
    %s, %s
  from `my-project.test.log_filtered`
  where seconds_since_first_open <= 7*24*60*60
  group by 1
), t2 as (
  select
    user_pseudo_id,
    sum(case when event_name = 'screen_view' then 1 else 0 end) screen_views_29d_35d
  from `my-project.test.log_filtered`
  where seconds_since_first_open > 28*24*60*60 and seconds_since_first_open <= 35*24*60*60
  group by 1
), t3 as (
  select
    t1.*,
    ifnull(screen_views_29d_35d, 0) screen_views_29d_35d,
    rand() rnd
  from t1 left join t2 using(user_pseudo_id)
), t4 as (
  select
    * except(screen_views_29d_35d, rnd),
    screen_views_29d_35d > 0 active_after_4w,
    case
      when rnd < 0.8 then 'TRAIN'
      when rnd < 0.9 then 'EVALUATE'
      else 'TEST'
    end subset
  from t3
)
select * from t4
""", screen_class_columns, event_name_columns);

BQML

-- 学習
CREATE OR REPLACE MODEL `my-project.test.model_predict_active_users_xgb`
OPTIONS(
  MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
  MAX_ITERATIONS=100,
  LEARN_RATE=0.1,
  SUBSAMPLE=0.8,
  INPUT_LABEL_COLS=['active_after_4w']
) AS
select * except(user_pseudo_id, subset)
from `my-project.test.sum_by_user`
where subset = 'TRAIN';

-- 評価
SELECT * FROM ML.EVALUATE(MODEL `my-project.test.model_predict_active_users_xgb`, (
  select * except(user_pseudo_id, subset)
  from `my-project.test.sum_by_user`
  where subset = 'EVALUATE'
));

-- 予測
with predictions as (
  SELECT * FROM ML.PREDICT(MODEL `my-project.test.model_predict_active_users_xgb`, (
    select * except(subset)
    from `my-project.test.sum_by_user`
    where subset = 'TEST'
  ))
)
select
  user_pseudo_id
  active_after_4w,
  (select p.prob from unnest(predictions.predicted_active_after_4w_probs) p where p.label = true) prob
from predictions
order by prob desc;

データ分析 の記事一覧