discourse/lib/crawler_scorer.rb
Krzysztof Kotlarek 52c15cdaf9
DEV: Persist per-heuristic crawler score breakdown (#40022)
Adds a `browser_pageview_event_scores` side table keyed by `event_id`,
  holding the per-heuristic contribution as six smallint columns:

  - `automation_ua_score` (0 or 100)
  - `known_asn_score` (0 or 15)
  - `velocity_score` (0, 10, 20, or 35)
  - `churn_score` (0, 10, or 20)
  - `rapid_nav_score` (0 or 15)
  - `referrer_score` (0, 5, or 10)

The side table only holds rows for events that scored above zero.
2026-05-18 12:13:08 +08:00

214 lines
6.4 KiB
Ruby
Vendored

# frozen_string_literal: true
class CrawlerScorer
AUTOMATION_UA_SCORE = 100
KNOWN_ASN_SCORE = 15
VELOCITY_LOW = 120
VELOCITY_MEDIUM = 300
VELOCITY_HIGH = 600
VELOCITY_LOW_SCORE = 10
VELOCITY_MEDIUM_SCORE = 20
VELOCITY_HIGH_SCORE = 35
CHURN_LOW_MIN_SESSIONS = 5
CHURN_HIGH_MIN_SESSIONS = 10
CHURN_MAX_AVG_EVENTS = 2
CHURN_LOW_SCORE = 10
CHURN_HIGH_SCORE = 20
RAPID_NAV_MIN_GAPS = 10
RAPID_NAV_MAX_MEDIAN_SECONDS = 5
RAPID_NAV_SCORE = 15
REFERRER_MIN_EVENTS = 5
REFERRER_LOW_RATIO = 0.5
REFERRER_HIGH_RATIO = 0.8
REFERRER_LOW_SCORE = 5
REFERRER_HIGH_SCORE = 10
def self.score!(window_start:, window_end:)
crawler_asns = SiteSetting.crawler_asns_map.map(&:to_i)
ActiveRecord::Base.transaction do
DB.exec(
SQL,
window_start: window_start,
window_end: window_end,
ua_regex: SiteSetting.crawler_automation_user_agents,
crawler_asns: crawler_asns,
hostname: Discourse.current_hostname,
automation_ua_score: AUTOMATION_UA_SCORE,
known_asn_score: KNOWN_ASN_SCORE,
velocity_low: VELOCITY_LOW,
velocity_medium: VELOCITY_MEDIUM,
velocity_high: VELOCITY_HIGH,
velocity_low_score: VELOCITY_LOW_SCORE,
velocity_medium_score: VELOCITY_MEDIUM_SCORE,
velocity_high_score: VELOCITY_HIGH_SCORE,
churn_low_min_sessions: CHURN_LOW_MIN_SESSIONS,
churn_high_min_sessions: CHURN_HIGH_MIN_SESSIONS,
churn_max_avg_events: CHURN_MAX_AVG_EVENTS,
churn_low_score: CHURN_LOW_SCORE,
churn_high_score: CHURN_HIGH_SCORE,
rapid_nav_min_gaps: RAPID_NAV_MIN_GAPS,
rapid_nav_max_median_seconds: RAPID_NAV_MAX_MEDIAN_SECONDS,
rapid_nav_score: RAPID_NAV_SCORE,
referrer_min_events: REFERRER_MIN_EVENTS,
referrer_low_ratio: REFERRER_LOW_RATIO,
referrer_high_ratio: REFERRER_HIGH_RATIO,
referrer_low_score: REFERRER_LOW_SCORE,
referrer_high_score: REFERRER_HIGH_SCORE,
)
end
end
SQL = <<~SQL
WITH events AS (
SELECT id, session_id, ip_address, user_agent, referrer, asn, created_at
FROM browser_pageview_events
WHERE created_at >= :window_start
AND created_at < :window_end
),
ipua_stats AS (
SELECT
ip_address,
user_agent,
COUNT(*) AS pageviews,
COUNT(DISTINCT session_id) AS distinct_sessions,
AVG(
CASE
WHEN referrer IS NULL THEN 1.0
WHEN substring(referrer from '^https?://([^/]+)') = :hostname THEN 0.0
ELSE 1.0
END
) AS bad_referrer_ratio
FROM events
GROUP BY ip_address, user_agent
),
gaps AS (
SELECT
ip_address,
user_agent,
EXTRACT(EPOCH FROM created_at - LAG(created_at) OVER (
PARTITION BY ip_address, user_agent ORDER BY created_at
)) AS gap_seconds
FROM events
),
median_gap AS (
SELECT
ip_address,
user_agent,
percentile_cont(0.5) WITHIN GROUP (ORDER BY gap_seconds)
AS median_gap_seconds,
COUNT(*) AS gap_count
FROM gaps
WHERE gap_seconds IS NOT NULL
GROUP BY ip_address, user_agent
),
breakdown AS (
SELECT
e.id,
CASE
WHEN :ua_regex <> '' AND e.user_agent ~* :ua_regex THEN :automation_ua_score
ELSE 0
END AS automation_ua_score,
CASE
WHEN e.asn = ANY(ARRAY[:crawler_asns]::int[]) THEN :known_asn_score
ELSE 0
END AS known_asn_score,
CASE
WHEN iu.pageviews >= :velocity_high THEN :velocity_high_score
WHEN iu.pageviews >= :velocity_medium THEN :velocity_medium_score
WHEN iu.pageviews >= :velocity_low THEN :velocity_low_score
ELSE 0
END AS velocity_score,
CASE
WHEN iu.distinct_sessions >= :churn_high_min_sessions
AND iu.pageviews::float / NULLIF(iu.distinct_sessions, 0) <= :churn_max_avg_events
THEN :churn_high_score
WHEN iu.distinct_sessions >= :churn_low_min_sessions
AND iu.pageviews::float / NULLIF(iu.distinct_sessions, 0) <= :churn_max_avg_events
THEN :churn_low_score
ELSE 0
END AS churn_score,
CASE
WHEN mg.gap_count >= :rapid_nav_min_gaps
AND mg.median_gap_seconds < :rapid_nav_max_median_seconds
THEN :rapid_nav_score
ELSE 0
END AS rapid_nav_score,
CASE
WHEN iu.pageviews >= :referrer_min_events
AND iu.bad_referrer_ratio >= :referrer_high_ratio THEN :referrer_high_score
WHEN iu.pageviews >= :referrer_min_events
AND iu.bad_referrer_ratio >= :referrer_low_ratio THEN :referrer_low_score
ELSE 0
END AS referrer_score
FROM events e
LEFT JOIN ipua_stats iu USING (ip_address, user_agent)
LEFT JOIN median_gap mg USING (ip_address, user_agent)
),
totals AS (
SELECT
id,
automation_ua_score,
known_asn_score,
velocity_score,
churn_score,
rapid_nav_score,
referrer_score,
automation_ua_score + known_asn_score + velocity_score
+ churn_score + rapid_nav_score + referrer_score AS score
FROM breakdown
),
updated AS (
UPDATE browser_pageview_events e
SET score = t.score
FROM totals t
WHERE e.id = t.id
AND t.score > 0
AND t.score > COALESCE(e.score, 0)
RETURNING e.id,
t.automation_ua_score,
t.known_asn_score,
t.velocity_score,
t.churn_score,
t.rapid_nav_score,
t.referrer_score
)
INSERT INTO browser_pageview_event_scores (
event_id,
automation_ua_score,
known_asn_score,
velocity_score,
churn_score,
rapid_nav_score,
referrer_score
)
SELECT
id,
automation_ua_score,
known_asn_score,
velocity_score,
churn_score,
rapid_nav_score,
referrer_score
FROM updated
ON CONFLICT (event_id) DO UPDATE
SET automation_ua_score = EXCLUDED.automation_ua_score,
known_asn_score = EXCLUDED.known_asn_score,
velocity_score = EXCLUDED.velocity_score,
churn_score = EXCLUDED.churn_score,
rapid_nav_score = EXCLUDED.rapid_nav_score,
referrer_score = EXCLUDED.referrer_score;
SQL
end