discourse/app/jobs/scheduled/maintain_browser_pageview_rollups.rb
Alan Guo Xiang Tan 31ff0071f8
FIX: Backfill normalized referrer for historical browser pageviews (#40357)
Previously, 437ab337d2 added the normalized_referrer column to the
browser_pageview_events table. That column stores the cleaned referrer
value used by the admin Top Referrers report, but existing
browser_pageview_events rows were not backfilled. Older rows still have
NULL normalized_referrer values, so historical dates can show little or
no referrer data even though the original referrer was recorded.

This PR backfills normalized_referrer for existing
browser_pageview_events rows and adds a mechanism to repeat that work if
the normalization rules change later.

Key technical changes:

1. Add the normalized_referrer_version column to the
browser_pageview_events table, which stores raw browser pageview events.
The new column records which version of the referrer normalization rules
processed each row. This gives the backfill a clear stopping condition
and lets a future rules change reprocess older rows by bumping the
version.

2. Replace the existing AggregateBrowserPageviewDailyRollups scheduled
job, which rebuilt browser pageview country and referrer daily rollups,
with the new MaintainBrowserPageviewRollups scheduled job. The new job
still keeps those rollups current, and it also backfills older
normalized referrers from the same execution path so separate jobs do
not rebuild the same rollups at the same time.

3. Backfill browser_pageview_events rows in batches so large sites do
not need to update all historical pageview events in one run. A day’s
referrer rollup is only rebuilt once every stale referrer row for that
day has been processed, so the Top Referrers report does not show
partial counts between batches.

4. Skip dates that no longer have source rows in the
browser_pageview_events table. Browser pageview events can be pruned by
cleanup, but daily rollups are the permanent report data. Before
rebuilding a date, the backfill checks that browser_pageview_events
still has events for that date. If no events remain, the existing rollup
is left untouched because the job can no longer safely reconstruct it.
2026-06-09 09:04:57 +08:00

141 lines
4.1 KiB
Ruby
Vendored

# frozen_string_literal: true
module Jobs
class MaintainBrowserPageviewRollups < ::Jobs::Scheduled
every 10.minutes
cluster_concurrency 1
def execute(_args)
return if !SiteSetting.persist_browser_pageview_events
aggregate
backfill
end
private
def aggregate
start_date, end_date = aggregation_window
return if start_date.nil?
BrowserPageviewCountryDailyRollup.aggregate(start_date: start_date, end_date: end_date)
BrowserPageviewReferrerDailyRollup.aggregate(start_date: start_date, end_date: end_date)
end
def aggregation_window
end_date = Time.zone.today
if BrowserPageviewCountryDailyRollup.none? && BrowserPageviewReferrerDailyRollup.none?
earliest_event_date = BrowserPageviewEvent.minimum(:created_at)&.to_date
[earliest_event_date, end_date]
else
[1.day.ago.to_date, end_date]
end
end
def backfill
rows = next_batch
return if rows.empty?
ids = rows.map(&:id)
store_normalized_referrers(rows)
BrowserPageviewReferrerDailyRollup.recompute(recomputable_dates(ids))
stamp_version(ids)
end
def next_batch
params = { version: BrowserPageviewReferrerInspector::VERSION, limit: batch_size }
retention_clause = ""
if SiteSetting.clean_up_browser_pageview_events
retention_clause = "AND created_at >= :retention_cutoff"
# CleanUpBrowserPageviewEvents computes its own cutoff, so around
# midnight the two cutoffs can differ by a day. The extra day ensures
# the backfill never rebuilds a day that cleanup may be deleting.
params[:retention_cutoff] = BrowserPageviewEvent.retention_cutoff + 1.day
end
DB.query(<<~SQL, params)
SELECT id, referrer
FROM browser_pageview_events
WHERE referrer IS NOT NULL
AND (
normalized_referrer_version IS NULL
OR normalized_referrer_version < :version
)
#{retention_clause}
LIMIT :limit
SQL
end
def store_normalized_referrers(rows)
ids = rows.map(&:id)
normalized = rows.map { |row| BrowserPageviewReferrerInspector.normalize(row.referrer) }
DB.exec(<<~SQL, ids: ids, normalized: normalized)
UPDATE browser_pageview_events AS e
SET normalized_referrer = data.normalized_referrer
FROM (
SELECT
unnest(ARRAY[:ids]::bigint[]) AS id,
unnest(ARRAY[:normalized]::text[]) AS normalized_referrer
) AS data
WHERE e.id = data.id
SQL
end
def recomputable_dates(ids)
params = { ids: ids, version: BrowserPageviewReferrerInspector::VERSION }
retention_clause = ""
if SiteSetting.clean_up_browser_pageview_events
retention_clause = "AND e.created_at >= :retention_cutoff"
params[:retention_cutoff] = BrowserPageviewEvent.retention_cutoff + 1.day
end
DB.query_single(<<~SQL, params)
WITH batch_ids AS (
SELECT unnest(ARRAY[:ids]::bigint[]) AS id
),
touched_dates AS (
SELECT DISTINCT created_at::date AS date
FROM browser_pageview_events
WHERE id IN (:ids)
)
SELECT touched_dates.date
FROM touched_dates
WHERE NOT EXISTS (
SELECT 1
FROM browser_pageview_events e
WHERE e.created_at >= touched_dates.date
AND e.created_at < touched_dates.date + 1
AND e.referrer IS NOT NULL
AND NOT EXISTS (
SELECT 1
FROM batch_ids
WHERE batch_ids.id = e.id
)
AND (
e.normalized_referrer_version IS NULL
OR e.normalized_referrer_version < :version
)
#{retention_clause}
)
SQL
end
def stamp_version(ids)
DB.exec(<<~SQL, version: BrowserPageviewReferrerInspector::VERSION, ids: ids)
UPDATE browser_pageview_events
SET normalized_referrer_version = :version
WHERE id IN (:ids)
SQL
end
def batch_size
SiteSetting.browser_pageview_referrer_backfill_batch_size
end
end
end