mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-05-15 03:39:43 +08:00
Implemented an eval “comparison matrix” that lets you run the same evals across multiple personas or multiple LLMs and have a judge model declare a winner with per-candidate scores. The CLI adds --compare personas|llms, keeps persona selection (auto-prepending default for persona mode), and always ensures a judge is configured. A dedicated ComparisonRunner reuses Workbench results to build candidate outputs and sends them to Judge#compare, which crafts a rubric-aware comparison prompt and parses structured winner/ratings JSON. Outputs are streamed to the console and individual run logs still get written. README documents how to use the new flag and what each mode does.
77 lines
2 KiB
Ruby
77 lines
2 KiB
Ruby
# frozen_string_literal: true

module DiscourseAi
  module Evals
    # Catalogue of the feature keys exposed by a collection of modules.
    #
    # Every module is expected to respond to `name` and `features`, and every
    # feature to `name`. Keys are rendered as "<module name>:<feature name>".
    class Features
      # Keys accepted by #valid_feature_key? regardless of which modules are
      # registered. Frozen and built once instead of on every lookup.
      CUSTOM_FEATURE_KEYS = %w[
        custom:prompt
        custom:pdf_to_text
        custom:image_to_text
        custom:edit_artifact
      ].freeze

      # @param modules [Enumerable] module objects to catalogue; defaults to
      #   DiscourseAi::Configuration::Module.all
      # @param output [#puts] stream that #print writes to; defaults to $stdout
      def initialize(modules: DiscourseAi::Configuration::Module.all, output: $stdout)
        @modules = modules
        @output = output
      end

      # Writes each module name to the output stream, followed by one
      # " - <key>" line per feature, or a placeholder line when the module
      # has no features.
      def print
        # The block parameter is deliberately not called `entries`, so it does
        # not shadow the private #entries method.
        module_entries.each do |module_name, module_features|
          output.puts module_name

          if module_features.empty?
            output.puts " - no registered features"
            next
          end

          module_features.each { |entry| output.puts " - #{entry[:key]}" }
        end
      end

      # Groups evals by their `feature` value.
      #
      # @param evals [Array, nil] objects responding to `feature` and `id`
      # @return [Hash] feature => sorted array of the ids of its evals
      def feature_map(evals)
        Array(evals).group_by(&:feature).transform_values { |group| group.map(&:id).sort }
      end

      # @return [Array<String>] every "<module>:<feature>" key, in module order
      def feature_keys
        entries.map { |entry| entry[:key] }
      end

      # @param key [String]
      # @return [Boolean] true when the key is one of CUSTOM_FEATURE_KEYS or
      #   is listed in #feature_keys
      def valid_feature_key?(key)
        CUSTOM_FEATURE_KEYS.include?(key) || feature_keys.include?(key)
      end

      # Terminates the process with status 1 (after printing a hint to the
      # error stream) when the given key is present but unknown. Blank keys
      # and valid keys return nil without side effects.
      #
      # @param feature_key [String, nil]
      # @return [nil]
      def validate_feature!(feature_key)
        return if feature_key.blank?
        return if valid_feature_key?(feature_key)

        # $stderr (not the STDERR constant) mirrors the $stdout default used by
        # the constructor and honours any redirection of the error stream.
        $stderr.puts(
          "Unknown feature '#{feature_key}'. Run with --list-features to view valid keys.",
        )
        exit 1
      end

      private

      attr_reader :modules, :output

      # Memoized list of [module name, feature entries] pairs.
      def module_entries
        @module_entries ||= modules.map { |mod| [mod.name, entries_for_module(mod)] }
      end

      # Memoized flat list of the feature entries of all modules.
      def entries
        @entries ||= module_entries.flat_map { |(_, m_entries)| m_entries }
      end

      # Builds (and caches per module) one { key:, module_name: } hash for
      # each of the module's features. A nil `features` yields an empty list.
      def entries_for_module(mod)
        feature_entries_by_module[mod] ||= Array(mod.features).map do |feature|
          { key: feature_key(mod, feature), module_name: mod.name }
        end
      end

      # Per-module cache backing #entries_for_module.
      def feature_entries_by_module
        @feature_entries_by_module ||= {}
      end

      # Formats the key as "<module name>:<feature name>".
      def feature_key(mod, feature)
        "#{mod.name}:#{feature.name}"
      end
    end
  end
end