discourse/lib/emoji/regex_generator.rb
Régis Hanol 97c257e00c
DEV: Auto-generate emojiReplacementRegex from emoji database (#38491)
The `emojiReplacementRegex` in `pretty-text/emoji.js` was a manually
maintained regex string copied from an external source
(mathiasbynens/emoji-test-regex-pattern). This created a maintenance
gap: when the `discourse-emojis` gem was updated with new Unicode emoji
(e.g. Unicode 17.0), the replacements map would include them but the
regex would not match their raw Unicode characters. This meant pasting a
newer emoji like 🫪 (distorted face) would pass through un-replaced.

This commit eliminates the manual step by generating the regex
automatically from `Emoji.unicode_replacements` during `rake
javascript:update_constants` — the same task that already generates the
emoji names, aliases, and replacements map.

A new `Emoji::RegexGenerator` module builds a trie from all emoji
Unicode sequences (converted to UTF-16 code units for JS compatibility),
then emits an optimized regex pattern with character class ranges and
shared-prefix grouping. The generated regex is exported from
`pretty-text/emoji/data.js` alongside the other emoji constants, and
`emoji.js` now imports it instead of hardcoding it.

The generated regex matches all 3,418 emoji keys (including the 43
Unicode 17.0 emoji the old regex missed), is ~20% faster in benchmarks,
and can never drift from the emoji database again.

Closes #38416
2026-03-16 14:32:57 +01:00

201 lines
5.6 KiB
Ruby

# frozen_string_literal: true
# Generates an optimized JavaScript-compatible regex pattern that matches
# all Unicode emoji sequences from Emoji.unicode_replacements.
#
# The generated regex uses:
# - Character class ranges for contiguous codepoints (e.g., \u2600-\u2604)
# - Trie-based grouping for shared prefixes (e.g., \uD83D(?:...))
# - Optional markers for variation selectors and skin tones
#
# Usage:
# Emoji::RegexGenerator.generate # => "☻|♡|[#*0-9]\\uFE0F?..."
#
class Emoji
module RegexGenerator
# Characters added beyond the standard Unicode emoji set
EXTRA_PATTERNS = %w[☻ ♡].freeze
module_function
def generate
sequences = build_sequences
trie = build_trie(sequences)
pattern = trie_to_pattern(trie)
extras = EXTRA_PATTERNS.join("|")
"#{extras}|#{pattern}"
end
VARIATION_SELECTOR = 0xFE0F
# Convert emoji keys to sorted UTF-16 code unit sequences.
# Also adds FE0F (variation selector) variants so the regex matches
# emoji typed with or without the variation selector (e.g. ☠ and ☠️).
def build_sequences
sequences = []
Emoji.unicode_replacements.each_key do |key|
codepoints = key.codepoints
seq = to_utf16_code_units(codepoints)
sequences << seq
sequences << seq + [VARIATION_SELECTOR] unless codepoints.last == VARIATION_SELECTOR
end
sequences.sort!.uniq!
sequences
end
# Convert Unicode codepoints to JavaScript UTF-16 code units (surrogate pairs for > 0xFFFF)
def to_utf16_code_units(codepoints)
units = []
codepoints.each do |cp|
if cp > 0xFFFF
cp -= 0x10000
units << (0xD800 + (cp >> 10))
units << (0xDC00 + (cp & 0x3FF))
else
units << cp
end
end
units
end
# Build a trie (nested hash) from sequences of code units
# Each leaf is marked with :end => true
def build_trie(sequences)
trie = {}
sequences.each do |seq|
node = trie
seq.each do |unit|
node[unit] ||= {}
node = node[unit]
end
node[:end] = true
end
trie
end
# Convert a trie node to an optimized regex pattern string
def trie_to_pattern(node)
return nil if node.empty? || (node.keys == [:end])
children = node.reject { |k, _| k == :end }
return nil if children.empty?
is_optional = node[:end] # This node is also a valid endpoint
alternatives = build_alternatives(children)
result =
if alternatives.size == 1
alternatives.first
else
"(?:#{alternatives.join("|")})"
end
result = "(?:#{result})?" if is_optional && children.size > 0
result
end
# Group children by shared structure to produce compact alternatives
def build_alternatives(children)
# Separate children into "terminal" (leaf after this unit) and "continuing"
terminal_units = []
continuing = {}
children.each do |unit, child|
child_keys = child.reject { |k, _| k == :end }
if child_keys.empty? && child[:end]
terminal_units << unit
else
continuing[unit] = child
end
end
alternatives = []
# Terminal units can be combined into a character class
alternatives << char_class(terminal_units) if terminal_units.any?
# Group continuing children by their subtree pattern for factoring
# e.g., if multiple units lead to the same suffix pattern, group them
by_suffix = {}
continuing.each do |unit, child|
suffix = trie_to_pattern(child)
is_also_end = child[:end]
key = [suffix, is_also_end]
by_suffix[key] ||= []
by_suffix[key] << unit
end
by_suffix.each do |(suffix, _is_also_end), units|
prefix = char_class(units)
if suffix
if units.size > 1
alternatives << "#{prefix}#{suffix}"
else
# Single unit prefix — no need for extra grouping
alternatives << "#{escape_unit(units.first)}#{suffix}"
end
else
alternatives << prefix
end
end
alternatives
end
# Build a character class or single escape from a set of code units
def char_class(units)
return escape_unit(units.first) if units.size == 1
# Find contiguous ranges
sorted = units.sort
ranges = []
range_start = sorted.first
range_end = sorted.first
sorted
.drop(1)
.each do |u|
if u == range_end + 1
range_end = u
else
ranges << [range_start, range_end]
range_start = u
range_end = u
end
end
ranges << [range_start, range_end]
parts =
ranges.map do |s, e|
if s == e
escape_unit(s)
elsif e == s + 1
"#{escape_unit(s)}#{escape_unit(e)}"
else
"#{escape_unit(s)}-#{escape_unit(e)}"
end
end
"[#{parts.join}]"
end
# Escape a single UTF-16 code unit for use in a JS regex string
def escape_unit(unit)
if unit < 0x80 && unit.chr.match?(/[a-zA-Z0-9 ]/)
# Printable ASCII that's safe in regex
unit.chr
elsif unit < 0x80
# ASCII symbols — some need escaping in regex
case unit.chr
when "#", "*", ".", "+", "?", "(", ")", "[", "]", "{", "}", "\\", "^", "$", "|"
"\\#{unit.chr}"
else
unit.chr
end
else
format("\\u%04X", unit)
end
end
end
end