diff --git a/sql_generators/glean_usage/__init__.py b/sql_generators/glean_usage/__init__.py index 3adead7d4bc..46fa6920c77 100644 --- a/sql_generators/glean_usage/__init__.py +++ b/sql_generators/glean_usage/__init__.py @@ -26,6 +26,10 @@ glean_app_ping_views, metrics_clients_daily, metrics_clients_last_seen, + usage_reporting_clients_daily, + usage_reporting_clients_first_seen, + usage_reporting_clients_last_seen, + usage_reporting_active_users_aggregates, ) from sql_generators.glean_usage.common import get_app_info, list_tables @@ -43,6 +47,10 @@ event_error_monitoring.EventErrorMonitoring(), event_flow_monitoring.EventFlowMonitoring(), events_stream.EventsStreamTable(), + usage_reporting_clients_daily.UsageReportingClientsDailyTable(), + usage_reporting_clients_first_seen.UsageReportingClientsFirstSeenTable(), + usage_reporting_clients_last_seen.UsageReportingClientsLastSeenTable(), + usage_reporting_active_users_aggregates.UsageReportingActiveUsersAggregatesTable(), ] @@ -136,7 +144,7 @@ def get_tables(table_name="baseline_v1"): not in ConfigLoader.get("generate", "glean_usage", "skip_apps", fallback=[]) ] - id_token=get_id_token() + id_token = get_id_token() # Prepare parameters so that generation of all Glean datasets can be done in parallel @@ -151,7 +159,7 @@ def get_tables(table_name="baseline_v1"): use_cloud_function=use_cloud_function, app_info=app_info, parallelism=parallelism, - id_token=id_token + id_token=id_token, ), baseline_table, ) @@ -169,7 +177,7 @@ def get_tables(table_name="baseline_v1"): output_dir=output_dir, use_cloud_function=use_cloud_function, parallelism=parallelism, - id_token=id_token + id_token=id_token, ), info, ) diff --git a/sql_generators/glean_usage/common.py b/sql_generators/glean_usage/common.py index 0d1bb444abd..82484b46437 100644 --- a/sql_generators/glean_usage/common.py +++ b/sql_generators/glean_usage/common.py @@ -151,6 +151,17 @@ def table_names_from_baseline(baseline_table, include_project_id=True): 
events_view=f"{prefix}.events", events_stream_table=f"{prefix}_derived.events_stream_v1", events_stream_view=f"{prefix}.events_stream", + dau_reporting_stable_table=f"{prefix}_stable.dau_reporting_v1", + usage_reporting_stable_table=f"{prefix}_stable.usage_reporting_v1", + usage_reporting_clients_daily_table=f"{prefix}_derived.usage_reporting_clients_daily_v1", + usage_reporting_clients_first_seen_table=f"{prefix}_derived.usage_reporting_clients_first_seen_v1", + usage_reporting_clients_last_seen_table=f"{prefix}_derived.usage_reporting_clients_last_seen_v1", + usage_reporting_active_users_aggregates_table=f"{prefix}_derived.usage_reporting_active_users_aggregates_v1", + usage_reporting_clients_daily_view=f"{prefix}.usage_reporting_clients_daily", + usage_reporting_clients_first_seen_view=f"{prefix}.usage_reporting_clients_first_seen", + usage_reporting_clients_last_seen_view=f"{prefix}.usage_reporting_clients_last_seen", + usage_reporting_active_users_view=f"{prefix}.usage_reporting_active_users", + usage_reporting_active_users_aggregates_view=f"{prefix}.usage_reporting_active_users_aggregates", ) @@ -234,7 +245,7 @@ def generate_per_app_id( use_cloud_function=True, app_info=[], parallelism=8, - id_token=None + id_token=None, ): """Generate the baseline table query per app_id.""" if not self.per_app_id_enabled: @@ -268,7 +279,7 @@ def generate_per_app_id( derived_dataset=derived_dataset, app_name=app_name, has_distribution_id=app_name in APPS_WITH_DISTRIBUTION_ID, - has_profile_group_id= app_name in APPS_WITH_PROFILE_GROUP_ID, + has_profile_group_id=app_name in APPS_WITH_PROFILE_GROUP_ID, ) render_kwargs.update(self.custom_render_kwargs) @@ -364,7 +375,7 @@ def generate_per_app( output_dir=None, use_cloud_function=True, parallelism=8, - id_token=None + id_token=None, ): """Generate the baseline table query per app_name.""" if not self.per_app_enabled: diff --git a/sql_generators/glean_usage/templates/cross_channel.view.sql 
b/sql_generators/glean_usage/templates/cross_channel.view.sql index 108750f1400..3fb991b2f8c 100644 --- a/sql_generators/glean_usage/templates/cross_channel.view.sql +++ b/sql_generators/glean_usage/templates/cross_channel.view.sql @@ -7,11 +7,11 @@ AS UNION ALL {% endif -%} {% if app_name == "fenix" -%} -SELECT +SELECT "{{ dataset }}" AS normalized_app_id, * REPLACE(mozfun.norm.fenix_app_info("{{ dataset }}", app_build).channel AS normalized_channel), {% else -%} -SELECT +SELECT "{{ dataset }}" AS normalized_app_id, * REPLACE("{{ channel }}" AS normalized_channel) {% endif -%} diff --git a/sql_generators/glean_usage/templates/usage_reporting_active_users.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_active_users.metadata.yaml new file mode 100644 index 00000000000..e0ebf572483 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_active_users.metadata.yaml @@ -0,0 +1,15 @@ +{{ header_yaml }} +friendly_name: Usage Reporting Active Users +description: |- + A daily client aggregation view for usage_reporting ping. Merges the computations for client first seen + and last seen metrics + +owners: + - gkatre@mozilla.com +labels: {} +bigquery: null +workgroup_access: +- role: roles/bigquery.dataViewer + members: + - workgroup:dataops-managed/taar + - workgroup:mozilla-confidential diff --git a/sql_generators/glean_usage/templates/usage_reporting_active_users.schema.yaml b/sql_generators/glean_usage/templates/usage_reporting_active_users.schema.yaml new file mode 100644 index 00000000000..2fe21a702c8 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_active_users.schema.yaml @@ -0,0 +1,143 @@ +fields: +- mode: NULLABLE + name: submission_date + type: DATE + description: | + Logical date used for processing and partitioning. + +- mode: NULLABLE + name: usage_profile_id + type: STRING + description: + +# - mode: NULLABLE +# name: first_run_date +# type: DATE +# description: | +# The date of the first run of the application. 
+ +- mode: NULLABLE + name: normalized_channel + type: STRING + description: | + The channel the application is being distributed on. + +- mode: NULLABLE + name: normalized_os + type: STRING + description: | + The name of the operating system. + +- mode: NULLABLE + name: normalized_os_version + type: STRING + description: | + The user-visible version of the operating system (e.g. "1.2.3"). + If the version detection fails, this metric gets set to Unknown. + +# - mode: NULLABLE +# name: locale +# type: STRING +# description: | +# The locale of the application during initialization (e.g. "es-ES"). +# If the locale can't be determined on the system, the value is "und", to indicate "undetermined". + +- mode: NULLABLE + name: normalized_country_code + type: STRING + description: | + Country code + +# - mode: NULLABLE +# name: app_build +# type: STRING +# description: | +# The build identifier generated by the CI system (e.g. "1234/A"). +# If the value was not provided through configuration, this metric gets set to Unknown. + +# - mode: NULLABLE +# name: app_display_version +# type: STRING +# description: | +# The user visible version string (e.g. "1.0.3"). +# If the value was not provided through configuration, this metric gets set to Unknown. + +- mode: NULLABLE + name: distribution_id + type: STRING + description: | + A string containing the distribution identifier. This was used to identify installs + from Mozilla Online, but now also identifies partnership deal distributions. + +- mode: NULLABLE + name: is_active + type: BOOLEAN + description: | + A flag field indicating whether the specific client was active. + +- mode: NULLABLE + name: first_seen_date + type: DATE + description: | + Logical date of when we observed the client for the first time in our warehouse. + +- mode: NULLABLE + name: days_seen_bits + type: INTEGER + description: | + Bit field shows on which of the last 28 days a client sent us the usage_reporting ping. 
+ +- mode: NULLABLE + name: days_active_bits + type: INTEGER + description: | + Bit field shows on which of the last 28 days a client fulfilled the active criteria. + +- mode: NULLABLE + name: days_created_profile_bits + type: INTEGER + description: | + bit field indicating how many days lapsed since profile creation. + +- mode: NULLABLE + name: activity_segment + type: STRING + description: | + categorizing activity days into segments + +- mode: NULLABLE + name: is_dau + type: BOOLEAN + description: | + A flag field indicating whether the specific client was active on the submission_date. + +- mode: NULLABLE + name: is_wau + type: BOOLEAN + description: | + A flag field indicating whether the specific client was active on any of the 7 days prior to the submission_date. + +- mode: NULLABLE + name: is_mau + type: BOOLEAN + description: | + A flag field indicating whether the specific client was active on any of the 28 days prior to the submission_date. + +- mode: NULLABLE + name: is_daily_user + type: BOOLEAN + description: | + A flag field indicating whether the specific client sent the dau_reporting ping on the submission_date. + +- mode: NULLABLE + name: is_weekly_user + type: BOOLEAN + description: | + A flag field indicating whether the specific client sent the dau_reporting ping on any of the 7 days prior to the submission_date. + +- mode: NULLABLE + name: is_monthly_user + type: BOOLEAN + description: | + A flag field indicating whether the specific client sent the dau_reporting ping on any of the 28 days prior to the + submission_date. 
diff --git a/sql_generators/glean_usage/templates/usage_reporting_active_users.view.sql b/sql_generators/glean_usage/templates/usage_reporting_active_users.view.sql new file mode 100644 index 00000000000..7b2e3bd1489 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_active_users.view.sql @@ -0,0 +1,87 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ usage_reporting_active_users_view }}` +AS +SELECT + submission_date, + usage_profile_id, +-- first_run_date, + normalized_channel, + normalized_os, + normalized_os_version, +-- locale, + normalized_country_code, +-- app_build, +-- app_display_version, + distribution_id, + is_active, + first_seen_date, + days_seen_bits, + days_active_bits, + days_created_profile_bits, + CASE + WHEN BIT_COUNT(days_active_bits) + BETWEEN 1 + AND 6 + THEN 'infrequent_user' + WHEN BIT_COUNT(days_active_bits) + BETWEEN 7 + AND 13 + THEN 'casual_user' + WHEN BIT_COUNT(days_active_bits) + BETWEEN 14 + AND 20 + THEN 'regular_user' + WHEN BIT_COUNT(days_active_bits) >= 21 + THEN 'core_user' + ELSE 'other' + END AS activity_segment, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) = 0, FALSE) AS is_dau, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 7, FALSE) AS is_wau, + IFNULL(mozfun.bits28.days_since_seen(days_active_bits) < 28, FALSE) AS is_mau, + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) = 0, FALSE) AS is_daily_user, + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 7, FALSE) AS is_weekly_user, + IFNULL(mozfun.bits28.days_since_seen(days_seen_bits) < 28, FALSE) AS is_monthly_user + +-- +-- TODO: uncomment once duration is added to the usage_reporting ping +-- +-- -- Bit patterns capturing activity dates relative to the submission date. 
+-- days_seen_session_start_bits, +-- days_seen_session_end_bits, +-- + +-- -- TODO: verify if these fields are needed +-- app_version, +-- country, +-- city, +-- locale, +-- os, +-- windows_build_number, +-- scalar_parent_browser_engagement_total_uri_count_normal_and_private_mode_sum, +-- scalar_parent_browser_engagement_total_uri_count_sum, +-- is_default_browser, +-- isp_name, +-- CASE +-- WHEN isp_name = 'BrowserStack' +-- THEN CONCAT('Firefox Desktop', ' ', isp_name) +-- WHEN distribution_id = 'MozillaOnline' +-- THEN CONCAT('Firefox Desktop', ' ', distribution_id) +-- ELSE 'Firefox Desktop' +-- END AS app_name, +-- IF( +-- LOWER(IFNULL(isp_name, '')) <> "browserstack" +-- AND LOWER(IFNULL(distribution_id, '')) <> "mozillaonline", +-- TRUE, +-- FALSE +-- ) AS is_desktop + + +FROM + `{{ usage_reporting_clients_daily_table }}` +LEFT JOIN + `{{ usage_reporting_clients_first_seen_table }}` + USING (usage_profile_id) +LEFT JOIN + `{{ usage_reporting_clients_last_seen_table }}` + USING (submission_date, usage_profile_id) -- match on the day as well: last_seen holds one row per profile per submission_date diff --git a/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates.metadata.yaml new file mode 100644 index 00000000000..9ddbfd058cd --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates.metadata.yaml @@ -0,0 +1,14 @@ +{{ header_yaml }} +friendly_name: Usage Reporting Active Users Aggregates +description: |- + A daily aggregate of the usage_reporting ping representing user activity. 
+ +owners: + - gkatre@mozilla.com +labels: {} +bigquery: null +workgroup_access: +- role: roles/bigquery.dataViewer + members: + - workgroup:dataops-managed/taar + - workgroup:mozilla-confidential diff --git a/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates.view.sql b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates.view.sql new file mode 100644 index 00000000000..1f1f47e4e48 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates.view.sql @@ -0,0 +1,8 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ usage_reporting_active_users_aggregates_view }}` +AS +SELECT + * +FROM + `{{ project_id }}.{{ usage_reporting_active_users_aggregates_table }}` diff --git a/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.metadata.yaml new file mode 100644 index 00000000000..9df38d12310 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.metadata.yaml @@ -0,0 +1,22 @@ +{{ header_yaml }} +friendly_name: Usage Reporting Active Users Aggregates +description: |- + A daily aggregate of the usage_reporting ping representing user activity. 
+ +owners: + - gkatre@mozilla.com +labels: + incremental: true + schedule: daily +scheduling: + dag_name: bqetl_glean_usage + task_group: {{ app_name }} +bigquery: + time_partitioning: + type: day + field: submission_date + require_partition_filter: true + clustering: + fields: + - channel + - locale diff --git a/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.query.sql b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.query.sql new file mode 100644 index 00000000000..5fceedcd925 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.query.sql @@ -0,0 +1,71 @@ +{{ header }} + +--- Query generated via sql_generators.glean_usage. +WITH todays_metrics AS ( + SELECT + submission_date, + + usage_profile_id, + normalized_channel AS channel, + EXTRACT(YEAR FROM first_seen_date) AS first_seen_year, + -- The active users view exposes neither `os` nor `windows_build_number`, + -- so the normalized OS version is reported as-is. + normalized_os_version AS os_version, + COALESCE( + CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(0)], "") AS INTEGER), + 0 + ) AS os_version_major, + COALESCE( + CAST(NULLIF(SPLIT(normalized_os_version, ".")[SAFE_OFFSET(1)], "") AS INTEGER), + 0 + ) AS os_version_minor, + -- TODO: derive from the ping locale once the active users view exposes it. + CAST(NULL AS STRING) AS locale, + distribution_id, + is_active, + activity_segment AS segment, + is_daily_user, + is_weekly_user, + is_monthly_user, + is_dau, + is_wau, + is_mau + +-- -- TODO: verify if these fields are needed +-- app_name, +-- app_version AS app_version, +-- IFNULL(country, '??') country, +-- city, +-- os, +-- COALESCE( +-- scalar_parent_browser_engagement_total_uri_count_normal_and_private_mode_sum, +-- scalar_parent_browser_engagement_total_uri_count_sum +-- ) AS uri_count, +-- is_default_browser, + + FROM + `{{ usage_reporting_active_users_view }}` + WHERE + submission_date = @submission_date +) +SELECT + todays_metrics.* EXCEPT ( 
+ usage_profile_id, + is_daily_user, + is_weekly_user, + is_monthly_user, + is_dau, + is_wau, + is_mau, + is_active + ), + COUNTIF(is_daily_user) AS daily_users, + COUNTIF(is_weekly_user) AS weekly_users, + COUNTIF(is_monthly_user) AS monthly_users, + COUNTIF(is_dau) AS dau, + COUNTIF(is_wau) AS wau, + COUNTIF(is_mau) AS mau +FROM + todays_metrics +GROUP BY + ALL diff --git a/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.schema.yaml b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.schema.yaml new file mode 100644 index 00000000000..a51d0ebb6bb --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_active_users_aggregates_v1.schema.yaml @@ -0,0 +1,93 @@ +fields: +- mode: NULLABLE + name: submission_date + type: DATE + description: | + Logical date used for processing and partitioning. + +- mode: NULLABLE + name: channel + type: STRING + description: | + The channel the application is being distributed on. + +- mode: NULLABLE + name: os_version + type: STRING + description: | + The user-visible version of the operating system (e.g. "1.2.3"). + If the version detection fails, this metric gets set to Unknown. + +- mode: NULLABLE + name: os_version_major + type: INTEGER + description: | + The operating system major version. + +- mode: NULLABLE + name: os_version_minor + type: INTEGER + description: | + The operating system minor version. + +- mode: NULLABLE + name: locale + type: STRING + description: | + The locale of the application during initialization (e.g. "es-ES"). + If the locale can't be determined on the system, the value is "und", to indicate "undetermined". + +- mode: NULLABLE + name: distribution_id + type: STRING + description: | + A string containing the distribution identifier. This was used to identify installs + from Mozilla Online, but now also identifies partnership deal distributions. 
+ +- mode: NULLABLE + name: first_seen_year + type: INTEGER + description: | + Year when we observed the client for the first time in our warehouse. + +- mode: NULLABLE + name: segment + type: STRING + description: | + categorizing activity days into segments + +- mode: NULLABLE + name: daily_users + type: INTEGER + description: | + Number of profiles for which is_daily_user is true. + +- mode: NULLABLE + name: weekly_users + type: INTEGER + description: | + Number of profiles for which is_weekly_user is true. + +- mode: NULLABLE + name: monthly_users + type: INTEGER + description: | + Number of profiles for which is_monthly_user is true. + +- mode: NULLABLE + name: dau + type: INTEGER + description: | + Number of profiles for which is_dau is true. + +- mode: NULLABLE + name: wau + type: INTEGER + description: | + Number of profiles for which is_wau is true. + +- mode: NULLABLE + name: mau + type: INTEGER + description: | + Number of profiles for which is_mau is true. diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_daily.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_daily.metadata.yaml new file mode 100644 index 00000000000..7103838e424 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_daily.metadata.yaml @@ -0,0 +1,12 @@ +{{ header_yaml }} +friendly_name: Clients Daily Based on the DAU Reporting Ping. +description: |- + A daily aggregate of usage_reporting pings per `usage_profile_id`. + + Cluster by: `normalized_channel`, `normalized_country_code` + +owners: + - kik@mozilla.com +labels: + incremental: true + schedule: daily diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_daily.view.sql b/sql_generators/glean_usage/templates/usage_reporting_clients_daily.view.sql new file mode 100644 index 00000000000..1fad984f004 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_daily.view.sql @@ -0,0 +1,8 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ usage_reporting_clients_daily_view }}` +AS +SELECT + * +FROM + `{{ project_id }}.{{ usage_reporting_clients_daily_table }}` diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.metadata.yaml new file mode 100644 index 00000000000..88c3d1d9957 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.metadata.yaml @@ -0,0 +1,24 @@ +{{ header_yaml }} +friendly_name: Clients Daily Based on the DAU Reporting Ping. +description: |- + A daily aggregate of usage_reporting pings per `usage_profile_id`. 
+ + Cluster by: `normalized_channel`, `normalized_country_code` + +owners: + - kik@mozilla.com +labels: + incremental: true + schedule: daily +scheduling: + dag_name: bqetl_glean_usage + task_group: {{ app_name }} +bigquery: + time_partitioning: + type: day + field: submission_date + require_partition_filter: true + clustering: + fields: + - normalized_channel + - normalized_country_code diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.query.sql b/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.query.sql new file mode 100644 index 00000000000..84cddf4d1f2 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.query.sql @@ -0,0 +1,108 @@ +{{ header }} + +WITH usage_reporting_base AS ( + SELECT + submission_timestamp, + DATE(submission_timestamp) AS submission_date, + metrics.uuid.usage_profile_id, + normalized_channel, + -- client_info.app_display_version, + -- client_info.app_build, + normalized_os, + normalized_os_version, + -- client_info.locale, + normalized_country_code, + {% if has_distribution_id %} + metrics.string.metrics_distribution_id AS distribution_id, + {% else %} + CAST(NULL AS STRING) AS distribution_id, + {% endif %} + {% if "_desktop" in app_name %} + COALESCE(metrics.counter.browser_engagement_uri_count, 0) AS browser_engagement_uri_count, + COALESCE(metrics.counter.browser_engagement_active_ticks, 0) AS browser_engagement_active_ticks, + {% endif %} + CAST(NULL AS BOOLEAN) AS is_active, + -- SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10)) AS first_run_date, + FROM + `{{ project_id }}.{{ usage_reporting_stable_table }}` + WHERE + {% raw %} + {% if is_init() %} + DATE(submission_timestamp) >= '2024-10-10' + {% else %} + DATE(submission_timestamp) = @submission_date + {% endif %} + {% endraw %} + AND metrics.uuid.usage_profile_id IS NOT NULL +), +dau_reporting_base AS ( + SELECT + submission_timestamp, + DATE(submission_timestamp) AS submission_date, + 
metrics.uuid.usage_profile_id, + normalized_channel, + -- client_info.app_display_version, + -- client_info.app_build, + normalized_os, + normalized_os_version, + -- client_info.locale, + normalized_country_code, + {% if has_distribution_id %} + metrics.string.metrics_distribution_id AS distribution_id, + {% else %} + CAST(NULL AS STRING) AS distribution_id, + {% endif %} + {% if "_desktop" in app_name %} + COALESCE(metrics.counter.browser_engagement_uri_count, 0) AS browser_engagement_uri_count, + COALESCE(metrics.counter.browser_engagement_active_ticks, 0) AS browser_engagement_active_ticks, + {% endif %} + CAST(NULL AS BOOLEAN) AS is_active, + -- SAFE.PARSE_DATE('%F', SUBSTR(client_info.first_run_date, 1, 10)) AS first_run_date, + FROM + `{{ project_id }}.{{ dau_reporting_stable_table }}` + WHERE + {% raw %} + {% if is_init() %} + DATE(submission_timestamp) >= '2024-10-10' + {% else %} + DATE(submission_timestamp) = @submission_date + {% endif %} + {% endraw %} + AND metrics.uuid.usage_profile_id IS NOT NULL +), +-- We need to union with the old dau_reporting ping here as we want to make sure we include data +-- from clients that take longer to update. 
+reporting_pings_union AS ( + SELECT + *, + "usage_reporting" AS source_ping_name, + FROM usage_reporting_base + UNION ALL + SELECT + *, + "dau_reporting" AS source_ping_name, + FROM dau_reporting_base +) +SELECT + submission_date, + usage_profile_id, + udf.mode_last(ARRAY_AGG(normalized_channel IGNORE NULLS ORDER BY submission_timestamp ASC)) AS normalized_channel, + udf.mode_last(ARRAY_AGG(normalized_country_code IGNORE NULLS ORDER BY submission_timestamp ASC)) AS normalized_country_code, + udf.mode_last(ARRAY_AGG(normalized_os IGNORE NULLS ORDER BY submission_timestamp ASC)) AS normalized_os, + udf.mode_last(ARRAY_AGG(normalized_os_version IGNORE NULLS ORDER BY submission_timestamp ASC)) AS normalized_os_version, + udf.mode_last(ARRAY_AGG(distribution_id IGNORE NULLS ORDER BY submission_timestamp ASC)) AS distribution_id, + {% if "_desktop" in app_name %} + COALESCE(LOGICAL_OR(is_active), SUM(browser_engagement_uri_count) > 0 AND SUM(browser_engagement_active_ticks) > 0, FALSE) AS is_active, + {% else %} + -- At the moment we do not have duration, default to True. + -- Eventually is_active value will come from the client. + COALESCE(LOGICAL_OR(is_active), TRUE) AS is_active, + {% endif %} + STRUCT( + ARRAY_AGG(DISTINCT source_ping_name) AS source_pings + ), +FROM + reporting_pings_union +GROUP BY + submission_date, + usage_profile_id diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.schema.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.schema.yaml new file mode 100644 index 00000000000..f38a1ba99c7 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_daily_v1.schema.yaml @@ -0,0 +1,76 @@ +fields: +- mode: NULLABLE + name: submission_date + type: DATE + description: | + Logical date used for processing and partitioning. 
+ +- mode: NULLABLE + name: usage_profile_id + type: STRING + description: + +# - mode: NULLABLE +# name: first_run_date +# type: DATE +# description: | +# The date of the first run of the application. + +- mode: NULLABLE + name: normalized_channel + type: STRING + description: | + The channel the application is being distributed on. + +- mode: NULLABLE + name: normalized_os + type: STRING + description: | + The name of the operating system. + +- mode: NULLABLE + name: normalized_os_version + type: STRING + description: | + The user-visible version of the operating system (e.g. "1.2.3"). + If the version detection fails, this metric gets set to Unknown. + +# - mode: NULLABLE +# name: locale +# type: STRING +# description: | +# The locale of the application during initialization (e.g. "es-ES"). +# If the locale can't be determined on the system, the value is "und", to indicate "undetermined". + +- mode: NULLABLE + name: normalized_country_code + type: STRING + description: | + Country code + +# - mode: NULLABLE +# name: app_build +# type: STRING +# description: | +# The build identifier generated by the CI system (e.g. "1234/A"). +# If the value was not provided through configuration, this metric gets set to Unknown. + +# - mode: NULLABLE +# name: app_display_version +# type: STRING +# description: | +# The user visible version string (e.g. "1.0.3"). +# If the value was not provided through configuration, this metric gets set to Unknown. + +- mode: NULLABLE + name: distribution_id + type: STRING + description: | + A string containing the distribution identifier. This was used to identify installs + from Mozilla Online, but now also identifies partnership deal distributions. + +- mode: NULLABLE + name: is_active + type: BOOLEAN + description: | + A flag field indicating whether the specific client was active. 
diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen.metadata.yaml new file mode 100644 index 00000000000..7025098f9a7 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen.metadata.yaml @@ -0,0 +1,11 @@ +{{ header_yaml }} +friendly_name: Clients First Seen Based on the DAU Reporting Ping. +description: |- + A representation of when we saw each `usage_profile_id` + for the first time based on the usage_reporting ping. + +owners: + - kik@mozilla.com +labels: + incremental: true + schedule: daily diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen.view.sql b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen.view.sql new file mode 100644 index 00000000000..436d4c04ae7 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen.view.sql @@ -0,0 +1,8 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ usage_reporting_clients_first_seen_view }}` +AS +SELECT + * +FROM + `{{ project_id }}.{{ usage_reporting_clients_first_seen_table }}` diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.metadata.yaml new file mode 100644 index 00000000000..d12aa656798 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.metadata.yaml @@ -0,0 +1,19 @@ +{{ header_yaml }} +friendly_name: Clients First Seen Based on the DAU Reporting Ping. +description: |- + A representation of when we saw each `usage_profile_id` + for the first time based on the usage_reporting ping. 
+ +owners: + - kik@mozilla.com +labels: + incremental: true + schedule: daily +scheduling: + dag_name: bqetl_glean_usage + task_group: {{ app_name }} +bigquery: + time_partitioning: + type: day + field: first_seen_date + require_partition_filter: false diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.query.sql b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.query.sql new file mode 100644 index 00000000000..589aea3608c --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.query.sql @@ -0,0 +1,58 @@ +{{ header }} + +WITH + _current AS ( + SELECT + usage_profile_id, + {% raw %} + {% if is_init() %} + MIN(submission_date) AS first_seen_date, -- clients_daily outputs submission_date, not submission_timestamp + {% else %} + @submission_date AS first_seen_date, + {% endif %} + {% endraw %} + FROM + `{{ usage_reporting_clients_daily_table }}` + WHERE + usage_profile_id IS NOT NULL + {% raw %} + {% if is_init() %} + AND submission_date > "2014-10-10" + {% else %} + AND submission_date = @submission_date + {% endif %} + {% endraw %} + GROUP BY + usage_profile_id + ), +_previous AS ( + SELECT + usage_profile_id, + FROM + `{{ usage_reporting_clients_first_seen_table }}` + WHERE + {% raw %} + {% if is_init() %} + False + {% else %} + first_seen_date < @submission_date + {% endif %} + {% endraw %} +) + +SELECT + first_seen_date, + usage_profile_id, +FROM + _current +LEFT JOIN + _previous + USING (usage_profile_id) +WHERE + _previous.usage_profile_id IS NULL +QUALIFY + IF( + COUNT(*) OVER (PARTITION BY usage_profile_id) > 1, + ERROR("Duplicate usage_profile_id combination detected."), + TRUE + ) diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.schema.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.schema.yaml new file mode 100644 index 00000000000..7e8f77c4745 --- /dev/null +++ 
b/sql_generators/glean_usage/templates/usage_reporting_clients_first_seen_v1.schema.yaml @@ -0,0 +1,13 @@ +fields: + +- mode: NULLABLE + name: usage_profile_id + type: STRING + description: | + A UUID of the usage_profile. + +- mode: NULLABLE + name: first_seen_date + type: DATE + description: | + Logical date of when we observed the client for the first time in our warehouse. diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen.metadata.yaml new file mode 100644 index 00000000000..46dd6b29302 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen.metadata.yaml @@ -0,0 +1,11 @@ +{{ header_yaml }} +friendly_name: Clients Last Seen Based on the DAU Reporting Ping. +description: |- + A daily aggregate of the usage_reporting ping for each `usage_profile_id` + representing their activity. + +owners: + - kik@mozilla.com +labels: + incremental: true + schedule: daily diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen.view.sql b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen.view.sql new file mode 100644 index 00000000000..08930c6076e --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen.view.sql @@ -0,0 +1,8 @@ +{{ header }} +CREATE OR REPLACE VIEW + `{{ project_id }}.{{ usage_reporting_clients_last_seen_view }}` +AS +SELECT + * +FROM + `{{ project_id }}.{{ usage_reporting_clients_last_seen_table }}` diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.metadata.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.metadata.yaml new file mode 100644 index 00000000000..9c01b7bafbb --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.metadata.yaml @@ -0,0 +1,19 @@ +{{ header_yaml }} +friendly_name: Clients Last Seen Based on the 
Usage Reporting Ping. +description: |- + A daily aggregate of the usage_reporting ping for each `usage_profile_id` + representing their activity. + +owners: + - kik@mozilla.com +labels: + incremental: true + schedule: daily +scheduling: + dag_name: bqetl_glean_usage + task_group: {{ app_name }} +bigquery: + time_partitioning: + type: day + field: submission_date + require_partition_filter: true diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.query.sql b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.query.sql new file mode 100644 index 00000000000..8207bcf4061 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.query.sql @@ -0,0 +1,54 @@ +{{ header }} + +WITH _current AS ( + SELECT + usage_profile_id, + -- In this raw table, we capture the history of activity over the past + -- 28 days for each usage criterion as a single 64-bit integer. The + -- rightmost bit in 'days_seen_bits' represents whether the user sent a + -- usage_reporting ping on the submission_date and similarly, the rightmost bit in + -- days_active_bits represents whether the user counts as active on that date. + CAST(TRUE AS INT64) AS days_seen_bits, + CAST(TRUE AS INT64) & CAST(is_active AS INT64) AS days_active_bits, + udf.days_since_created_profile_as_28_bits( + DATE_DIFF(submission_date, first_run_date, DAY) + ) AS days_created_profile_bits, + FROM + `{{ usage_reporting_clients_daily_table }}` + WHERE + submission_date = @submission_date +), +_previous AS ( + SELECT + usage_profile_id, + days_seen_bits, + days_active_bits, + days_created_profile_bits, + FROM + `{{ usage_reporting_clients_last_seen_table }}` + WHERE + submission_date = DATE_SUB(@submission_date, INTERVAL 1 DAY) + -- Filter out rows from yesterday that have now fallen outside the 28-day window. 
+ AND udf.shift_28_bits_one_day(days_seen_bits) > 0 +) +SELECT + @submission_date AS submission_date, + IF(_current.usage_profile_id IS NOT NULL, _current, _previous).* REPLACE ( + udf.combine_adjacent_days_28_bits( + _previous.days_seen_bits, + _current.days_seen_bits + ) AS days_seen_bits, + udf.combine_adjacent_days_28_bits( + _previous.days_active_bits, + _current.days_active_bits + ) AS days_active_bits, + udf.combine_adjacent_days_28_bits( + _previous.days_created_profile_bits, + _current.days_created_profile_bits + ) AS days_created_profile_bits + ) +FROM + _current +FULL JOIN + _previous + USING (usage_profile_id) diff --git a/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.schema.yaml b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.schema.yaml new file mode 100644 index 00000000000..f1c5da406f9 --- /dev/null +++ b/sql_generators/glean_usage/templates/usage_reporting_clients_last_seen_v1.schema.yaml @@ -0,0 +1,30 @@ +fields: +- mode: NULLABLE + name: submission_date + type: DATE + description: | + Logical date used for processing and partitioning. + +- mode: NULLABLE + name: usage_profile_id + type: STRING + description: | + A UUID of the usage_profile. + +- mode: NULLABLE + name: days_seen_bits + type: INTEGER + description: | + Bit field showing on which of the last 28 days a client sent us the usage_reporting ping. + +- mode: NULLABLE + name: days_active_bits + type: INTEGER + description: | + Bit field showing on which of the last 28 days a client fulfilled the active criteria. + +- mode: NULLABLE + name: days_created_profile_bits + type: INTEGER + description: | + Bit field indicating how many days have elapsed since profile creation. 
diff --git a/sql_generators/glean_usage/usage_reporting_active_users_aggregates.py b/sql_generators/glean_usage/usage_reporting_active_users_aggregates.py new file mode 100644 index 00000000000..942bbec7c2f --- /dev/null +++ b/sql_generators/glean_usage/usage_reporting_active_users_aggregates.py @@ -0,0 +1,17 @@ +"""Generating and run usage_reporting_active_users_aggregates queries for Glean apps.""" + +from sql_generators.glean_usage.common import GleanTable + +TARGET_TABLE_ID = "usage_reporting_active_users_aggregates_v1" +PREFIX = "usage_reporting_active_users_aggregates" + + +class UsageReportingActiveUsersAggregatesTable(GleanTable): + """Represents generated usage_reporting_active_users_aggregates table.""" + + def __init__(self): + """Initialize usage_reporting_active_users_aggregates table.""" + GleanTable.__init__(self) + self.target_table_id = TARGET_TABLE_ID + self.prefix = PREFIX + self.base_table_name = "usage_reporting_v1" diff --git a/sql_generators/glean_usage/usage_reporting_clients_daily.py b/sql_generators/glean_usage/usage_reporting_clients_daily.py new file mode 100644 index 00000000000..0824d63005a --- /dev/null +++ b/sql_generators/glean_usage/usage_reporting_clients_daily.py @@ -0,0 +1,19 @@ +"""Generate and run usage_reporting_clients_daily queries for Glean apps.""" + +from sql_generators.glean_usage.common import GleanTable + +# Named after the usage_reporting_clients_daily_{table,view} entries in +# common.table_names_from_baseline. +TARGET_TABLE_ID = "usage_reporting_clients_daily_v1" +PREFIX = "usage_reporting_clients_daily" + + +class UsageReportingClientsDailyTable(GleanTable): + """Represents generated usage_reporting_clients_daily table.""" + + def __init__(self): + """Initialize usage_reporting_clients_daily table.""" + GleanTable.__init__(self) + self.target_table_id = TARGET_TABLE_ID + self.prefix = PREFIX + self.base_table_name = "usage_reporting_v1" diff --git a/sql_generators/glean_usage/usage_reporting_clients_first_seen.py b/sql_generators/glean_usage/usage_reporting_clients_first_seen.py new file mode 100644 index 00000000000..7af65a56eb5
--- /dev/null +++ b/sql_generators/glean_usage/usage_reporting_clients_first_seen.py @@ -0,0 +1,19 @@ +"""Generate and run usage_reporting_clients_first_seen queries for Glean apps.""" + +from sql_generators.glean_usage.common import GleanTable + +# Named after the usage_reporting_clients_first_seen_v1.* templates and the +# usage_reporting_clients_first_seen_{table,view} entries in common.py. +TARGET_TABLE_ID = "usage_reporting_clients_first_seen_v1" +PREFIX = "usage_reporting_clients_first_seen" + + +class UsageReportingClientsFirstSeenTable(GleanTable): + """Represents generated usage_reporting_clients_first_seen table.""" + + def __init__(self): + """Initialize usage_reporting_clients_first_seen table.""" + GleanTable.__init__(self) + self.target_table_id = TARGET_TABLE_ID + self.prefix = PREFIX + self.base_table_name = "usage_reporting_v1" diff --git a/sql_generators/glean_usage/usage_reporting_clients_last_seen.py b/sql_generators/glean_usage/usage_reporting_clients_last_seen.py new file mode 100644 index 00000000000..eb8ffc741cd --- /dev/null +++ b/sql_generators/glean_usage/usage_reporting_clients_last_seen.py @@ -0,0 +1,19 @@ +"""Generate and run usage_reporting_clients_last_seen queries for Glean apps.""" + +from sql_generators.glean_usage.common import GleanTable + +# Named after the usage_reporting_clients_last_seen_v1.* templates and the +# usage_reporting_clients_last_seen_{table,view} entries in common.py. +TARGET_TABLE_ID = "usage_reporting_clients_last_seen_v1" +PREFIX = "usage_reporting_clients_last_seen" + + +class UsageReportingClientsLastSeenTable(GleanTable): + """Represents generated usage_reporting_clients_last_seen table.""" + + def __init__(self): + """Initialize usage_reporting_clients_last_seen table.""" + GleanTable.__init__(self) + self.target_table_id = TARGET_TABLE_ID + self.prefix = PREFIX + self.base_table_name = "usage_reporting_v1"