mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-05-02 21:27:59 +08:00
The `emojiReplacementRegex` in `pretty-text/emoji.js` was a manually maintained regex string copied from an external source (mathiasbynens/emoji-test-regex-pattern). This created a maintenance gap: when the `discourse-emojis` gem was updated with new Unicode emoji (e.g. Unicode 17.0), the replacements map would include them but the regex would not match their raw Unicode characters. This meant pasting a newer emoji, such as the Unicode 17.0 "distorted face", would pass through un-replaced. This commit eliminates the manual step by generating the regex automatically from `Emoji.unicode_replacements` during `rake javascript:update_constants` — the same task that already generates the emoji names, aliases, and replacements map. A new `Emoji::RegexGenerator` module builds a trie from all emoji Unicode sequences (converted to UTF-16 code units for JS compatibility), then emits an optimized regex pattern with character class ranges and shared-prefix grouping. The generated regex is exported from `pretty-text/emoji/data.js` alongside the other emoji constants, and `emoji.js` now imports it instead of hardcoding it. The generated regex matches all 3,418 emoji keys (including the 43 Unicode 17.0 emoji the old regex missed), is ~20% faster in benchmarks, and can never drift from the emoji database again. Closes #38416
201 lines
5.6 KiB
Ruby
201 lines
5.6 KiB
Ruby
# frozen_string_literal: true

# Generates an optimized JavaScript-compatible regex pattern that matches
# all Unicode emoji sequences from Emoji.unicode_replacements.
#
# The generated regex uses:
# - Character class ranges for contiguous codepoints (e.g., \u2600-\u2604)
# - Trie-based grouping for shared prefixes (e.g., \uD83D(?:...))
# - Optional markers for variation selectors and skin tones
#
# Usage:
#   Emoji::RegexGenerator.generate # => "☻|♡|[#*0-9]\\uFE0F?..."
#
class Emoji
  # Builds the source string of a JavaScript-compatible regex that matches
  # every key of Emoji.unicode_replacements. Sequences are expressed as UTF-16
  # code units, arranged in a trie so shared prefixes are emitted only once.
  module RegexGenerator
    # Characters added beyond the standard Unicode emoji set
    EXTRA_PATTERNS = %w[☻ ♡].freeze

    # Code unit appended to every sequence that does not already end with it,
    # so the regex matches emoji typed with or without the variation selector.
    VARIATION_SELECTOR = 0xFE0F

    # ASCII characters that escape_unit always prefixes with a backslash.
    ESCAPED_ASCII = "#*.+?()[]{}\\^$|".freeze

    module_function

    # Build the complete regex source.
    #
    # @return [String] EXTRA_PATTERNS followed by the trie pattern, joined
    #   with "|". When there are no emoji sequences only the extras are
    #   returned, so the result never ends in an empty alternative (which
    #   would match the empty string at every position).
    def generate
      pattern = trie_to_pattern(build_trie(build_sequences))
      [*EXTRA_PATTERNS, pattern].compact.join("|")
    end

    # Convert emoji keys to sorted, de-duplicated UTF-16 code unit sequences.
    # Also adds FE0F (variation selector) variants so the regex matches
    # emoji typed with or without the variation selector (e.g. ☠ and ☠️).
    #
    # @return [Array<Array<Integer>>]
    def build_sequences
      sequences =
        Emoji.unicode_replacements.each_key.flat_map do |key|
          codepoints = key.codepoints
          units = to_utf16_code_units(codepoints)
          if codepoints.last == VARIATION_SELECTOR
            [units]
          else
            [units, units + [VARIATION_SELECTOR]]
          end
        end

      sequences.uniq.sort
    end

    # Convert Unicode codepoints to JavaScript UTF-16 code units
    # (surrogate pairs for codepoints above 0xFFFF).
    #
    # @param codepoints [Array<Integer>]
    # @return [Array<Integer>]
    def to_utf16_code_units(codepoints)
      codepoints.flat_map do |cp|
        next cp if cp <= 0xFFFF

        offset = cp - 0x10000
        [0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF)]
      end
    end

    # Build a trie (nested hash keyed by code unit) from code unit sequences.
    # The node reached at the end of each sequence is marked with :end => true.
    #
    # @param sequences [Array<Array<Integer>>]
    # @return [Hash]
    def build_trie(sequences)
      sequences.each_with_object({}) do |seq, trie|
        last_node = seq.reduce(trie) { |node, unit| node[unit] ||= {} }
        last_node[:end] = true
      end
    end

    # Convert a trie node to a regex pattern string covering everything that
    # may follow the node.
    #
    # @param node [Hash] a trie node as produced by build_trie
    # @return [String, nil] nil when the node has no child units
    def trie_to_pattern(node)
      children = node.reject { |key, _| key == :end }
      return nil if children.empty?

      alternatives = build_alternatives(children)
      pattern =
        if alternatives.size == 1
          alternatives.first
        else
          "(?:#{alternatives.join("|")})"
        end

      # A node that is itself a valid endpoint makes its continuation optional.
      # The extra wrapper is redundant for multi-alternative groups but is kept
      # so the generated output stays byte-stable.
      node[:end] ? "(?:#{pattern})?" : pattern
    end

    # Group children by shared structure to produce compact alternatives.
    #
    # @param children [Hash{Integer => Hash}] child nodes keyed by code unit
    # @return [Array<String>] alternatives, each starting with a distinct unit
    def build_alternatives(children)
      # "Terminal" children end a sequence and have nothing after them;
      # everything else continues into a deeper subtree.
      terminal, continuing =
        children.partition { |_unit, child| child[:end] && child.size == 1 }

      alternatives = []

      # Terminal units can be combined into a single character class.
      alternatives << char_class(terminal.map(&:first)) unless terminal.empty?

      # Units leading to the same suffix pattern share one alternative. The
      # suffix string already encodes whether the child is also an endpoint
      # (it is then wrapped as "(?:...)?"), so it is a sufficient group key.
      continuing
        .group_by { |_unit, child| trie_to_pattern(child) }
        .each do |suffix, pairs|
          alternatives << "#{char_class(pairs.map(&:first))}#{suffix}"
        end

      alternatives
    end

    # Build a character class, or a single escape, from a set of code units.
    # Runs of three or more contiguous units collapse into "a-b" ranges.
    #
    # @param units [Array<Integer>] non-empty list of code units
    # @return [String]
    # @raise [ArgumentError] if units is empty
    def char_class(units)
      raise ArgumentError, "units must not be empty" if units.empty?
      return escape_unit(units.first) if units.size == 1

      parts =
        units.sort.slice_when { |prev, curr| curr != prev + 1 }.map do |run|
          first = escape_unit(run.first, in_class: true)
          last = escape_unit(run.last, in_class: true)

          case run.size
          when 1
            first
          when 2
            "#{first}#{last}"
          else
            "#{first}-#{last}"
          end
        end

      "[#{parts.join}]"
    end

    # Escape a single UTF-16 code unit for use in a JS regex string.
    #
    # @param unit [Integer] code unit
    # @param in_class [Boolean] true when the result is placed inside [...];
    #   a hyphen is then escaped so it cannot form an accidental range with
    #   its neighbours
    # @return [String]
    def escape_unit(unit, in_class: false)
      return format("\\u%04X", unit) if unit >= 0x80

      char = unit.chr
      needs_escape = ESCAPED_ASCII.include?(char) || (in_class && char == "-")
      needs_escape ? "\\#{char}" : char
    end
  end
end
|