discourse/plugins/discourse-ai/evals/lib/cli.rb
Sam e3fae646d4
DEV: AI persona to agent migration (#38319)
Co-authored-by: Keegan George <kgeorge13@gmail.com>
2026-03-10 15:59:45 +11:00

188 lines
5 KiB
Ruby
Vendored

# frozen_string_literal: true
require_relative "features"
require_relative "agent_prompt_loader"
# Command-line options for the evals runner (`evals/run`).
#
# An instance holds the parsed flags as plain attributes and offers the
# validation helpers defined below. Every validation failure prints a message
# and terminates the process with exit status 1.
class DiscourseAi::Evals::Cli
  # Judge name advertised in the --judge help text and suggested in the
  # missing-judge error when no --judge was supplied.
  DEFAULT_JUDGE = "gpt-4o"

  # Set of trimmed, non-empty agent keys; populated through #add_agent_key.
  attr_reader :agent_keys

  attr_accessor :eval_name,
                :models,
                :list,
                :list_models,
                :list_features,
                :list_agents,
                :feature_key,
                :judge_name,
                :comparison_mode,
                :dataset_path

  # Parses ARGV into a new Cli instance and validates flag combinations.
  #
  # Prints the usage text and exits 0 when ARGV is empty. Exits 1 (after
  # writing to STDERR) on an unparsable option, an unknown feature key, an
  # unknown comparison mode, or --dataset without --feature.
  #
  # @param features_registry [#valid_feature_key?] used to validate --feature
  # @return [DiscourseAi::Evals::Cli] the populated instance
  def self.parse_options!(features_registry)
    cli = new
    parser = build_parser(cli)

    # Captured before parse!, which removes the options it recognises from ARGV.
    show_help = ARGV.empty?

    begin
      parser.parse!
    rescue OptionParser::ParseError => e
      # Report bad flags the same way as the other validation failures
      # instead of surfacing a backtrace.
      STDERR.puts(e.message)
      STDERR.puts(parser)
      exit 1
    end

    if show_help
      puts parser
      exit 0
    end

    if cli.feature_key && !features_registry.valid_feature_key?(cli.feature_key)
      STDERR.puts(
        "Unknown feature '#{cli.feature_key}'. Run with --list-features to view valid keys.",
      )
      exit 1
    end

    if cli.comparison_mode.present?
      normalized = cli.comparison_mode.to_s.downcase.strip
      cli.comparison_mode =
        case normalized
        when "agent", "agents"
          :agents
        when "llms", "models"
          :llms
        else
          STDERR.puts("Unknown comparison mode '#{cli.comparison_mode}'. Use Agents or LLMs.")
          exit 1
        end

      # Agent comparisons always include the default agent key alongside any
      # keys passed via --agent-keys (agent_keys is a Set, so no duplicates).
      if cli.comparison_mode == :agents
        cli.add_agent_key(DiscourseAi::Evals::AgentPromptLoader::DEFAULT_AGENT_KEY)
      end
    end

    if cli.dataset_path.present? && cli.feature_key.blank?
      STDERR.puts("--dataset requires a --feature flag identifying the eval feature to run.")
      exit 1
    end

    cli
  end

  # Builds the OptionParser whose handlers write the parsed values onto +cli+.
  def self.build_parser(cli)
    OptionParser.new do |opts|
      opts.banner = "Usage: evals/run [options]"

      opts.on("-e", "--eval NAME", "Name of the evaluation to run") do |eval_name|
        cli.eval_name = eval_name
      end

      opts.on("--list-models", "List models") { cli.list_models = true }

      opts.on("--list-features", "List features available for evals") { cli.list_features = true }

      opts.on("--list-agents", "List agent definitions available to evals") do
        cli.list_agents = true
      end

      opts.on(
        "-m",
        "--models NAME",
        "Models to evaluate (comma separated, defaults to all)",
      ) { |models| cli.models = models }

      opts.on("-l", "--list", "List evals") { cli.list = true }

      opts.on(
        "-f",
        "--feature KEY",
        "Feature key to evaluate (module_name:feature_name)",
      ) { |key| cli.feature_key = key }

      opts.on(
        "-j",
        "--judge NAME",
        "LLM config used to judge eval outputs (defaults to #{DEFAULT_JUDGE} when available)",
      ) { |judge| cli.judge_name = judge }

      opts.on(
        "--agent-keys KEYS",
        "Comma-separated list of agent keys to run sequentially",
      ) { |keys| keys.split(",").each { |key| cli.add_agent_key(key) } }

      opts.on("--compare MODE", "Comparison mode (agents or llms)") do |mode|
        cli.comparison_mode = mode
      end

      opts.on("--dataset PATH", "Path to a CSV dataset file (requires --feature)") do |path|
        cli.dataset_path = path
      end
    end
  end
  private_class_method :build_parser

  def initialize
    @agent_keys = Set.new
  end

  # @return [Boolean] true when a --judge name was supplied
  def judge_provided?
    judge_name.present?
  end

  # Records an agent key, ignoring nil/blank values and surrounding whitespace.
  def add_agent_key(key)
    trimmed = key.to_s.strip
    return if trimmed.empty?

    @agent_keys << trimmed
  end

  # Narrows +available_evals+ by the --feature and --eval filters.
  #
  # Prints an error and exits 1 when nothing is left; the message names the
  # filter that actually eliminated the last candidates.
  #
  # @param available_evals [Enumerable] objects responding to #feature and #id
  # @return [Enumerable] the matching evals (never empty)
  def select_evals(available_evals)
    evals = available_evals

    if feature_key.present?
      evals = evals.select { |eval_case| eval_case.feature == feature_key }
      if evals.empty?
        exit_with_error("Error: No evaluations registered for feature '#{feature_key}'")
      end
    end

    if eval_name.present?
      evals = evals.select { |eval_case| eval_case.id == eval_name }
      if evals.empty?
        # The feature (if any) had evals, so the name is what failed to match.
        scope = feature_key.present? ? " for feature '#{feature_key}'" : ""
        exit_with_error("Error: Unknown evaluation '#{eval_name}'#{scope}")
      end
    end

    # No filter was given and there was nothing to run in the first place.
    exit_with_error("Error: No evaluations available") if evals.empty?

    evals
  end

  # Checks that the number of LLMs and agent variants fits the comparison mode:
  # * :llms   - exactly one agent
  # * :agents - exactly one LLM and at least two agents
  # * none    - at most one agent
  #
  # Writes to STDERR and exits 1 on the first violated rule.
  def validate_comparison_requirements!(llms:, agent_variants:)
    case comparison_mode
    when :llms
      if agent_variants.length != 1
        STDERR.puts("LLM comparison runs against exactly one agent.")
        exit 1
      end
    when :agents
      if llms.length != 1
        STDERR.puts("Agent comparison requires exactly one LLM.")
        exit 1
      end

      if agent_variants.length < 2
        STDERR.puts("Agent comparison needs at least two agents.")
        exit 1
      end
    else
      if agent_variants.length > 1
        STDERR.puts(
          "Non-comparison runs accept only one agent. Remove extra --agent-keys or use --compare agents.",
        )
        exit 1
      end
    end
  end

  # Exits 1 with an explanatory message when a judge is required but missing.
  #
  # @param requires_judge [Boolean] whether the selected evals need a judge
  # @param judge_llm [Object, nil] the resolved judge, if any
  # @param default_judge_error [String, nil] extra detail appended to the
  #   message when present
  def validate_judge_presence!(requires_judge:, judge_llm:, default_judge_error:)
    return if !requires_judge || judge_llm

    message = "Error: Selected evaluations require a judge."

    if default_judge_error
      # Without --judge, judge_name is nil; name the default so the hint is
      # never "Configure ''".
      suggested_judge = judge_provided? ? judge_name : DEFAULT_JUDGE
      message += " Configure '#{suggested_judge}' or pass --judge with an LLM config name."
      message += "\n\n#{default_judge_error}"
    else
      message += " Pass --judge with an LLM config name."
    end

    puts message
    exit 1
  end

  private

  # Prints +message+ to stdout and terminates with exit status 1.
  def exit_with_error(message)
    puts message
    exit 1
  end
end