discourse/plugins/discourse-ai/evals/run
Roman Rizzi 3a647c8e50
FEATURE: Use evals to compare LLMs and Personas' prompts (#36027)
Implemented an eval “comparison matrix” that lets you run the same evals
across multiple personas or multiple LLMs and have a judge model declare
a winner with per-candidate scores. The CLI adds --compare
personas|llms, keeps persona selection (auto-prepending default for
persona mode), and always ensures a judge is configured. A dedicated
ComparisonRunner reuses Workbench results to build candidate outputs and
sends them to Judge#compare, which crafts a rubric-aware comparison
prompt and parses structured winner/ratings JSON. Outputs are streamed
to the console and individual run logs still get written. README
documents how to use the new flag and what each mode does.
2025-11-18 10:39:52 -03:00

130 lines
3 KiB
Ruby
Vendored

#!/usr/bin/env ruby
# frozen_string_literal: true
require_relative "lib/boot"
require_relative "lib/llm_repository"
require_relative "lib/eval"
require_relative "lib/prompts/prompt_evaluator"
require_relative "lib/prompts/single_test_runner"
require_relative "lib/features"
require_relative "lib/recorder"
require_relative "lib/workbench"
require_relative "lib/persona_prompt_loader"
require_relative "lib/comparison_runner"
require_relative "lib/cli"
# Bootstrap: the feature registry is built from every configuration module and
# handed to the option parser; the LLM repository is created right after.
all_modules = DiscourseAi::Configuration::Module.all
features_registry = DiscourseAi::Evals::Features.new(modules: all_modules)
cli = DiscourseAi::Evals::Cli.parse_options!(features_registry)
llm_repository = DiscourseAi::Evals::LlmRepository.new

# Informational flags: run the given listing block, then exit with status 0 so
# nothing below this point executes.
list_then_exit =
  lambda do |&listing|
    listing.call
    exit 0
  end

list_then_exit.call { llm_repository.print } if cli.list_models
list_then_exit.call { features_registry.print } if cli.list_features

# The persona loader is only instantiated once the two listings above have
# been ruled out.
persona_loader = DiscourseAi::Evals::PersonaPromptLoader.new
list_then_exit.call { persona_loader.print } if cli.list_personas

# Same for the eval cases: they are loaded after the persona listing check.
available_evals = DiscourseAi::Evals::Eval.available_cases
list_then_exit.call { available_evals.each { |eval_case| eval_case.print } } if cli.list
# Resolve the requested models; an empty selection is a fatal usage error.
llms = llm_repository.choose(cli.models)

if llms.empty?
  puts "Error: Unknown models '#{cli.models}'"
  exit 1
end

selected_evals = cli.select_evals(available_evals)

# Try to load the judge LLM named by the CLI. A failure is fatal only when the
# user explicitly passed a judge; otherwise the error message is kept so it can
# be reported later if a judge turns out to be required.
judge_llm, default_judge_error =
  begin
    [llm_repository.hydrate(cli.judge_name), nil]
  rescue StandardError => hydrate_error
    if cli.judge_provided
      puts "Error: #{hydrate_error.message}"
      exit 1
    end

    [nil, hydrate_error.message]
  end
# Persona variants to evaluate; the loader is told which comparison mode (if
# any) is active.
persona_variants =
  persona_loader.variants_for(cli.persona_keys, comparison_mode: cli.comparison_mode)

# A judge is needed for any comparison run, and for any selected eval that
# declares its own judge block.
requires_judge =
  cli.comparison_mode.present? || selected_evals.any? { |eval_case| eval_case.judge.present? }

if requires_judge && judge_llm.nil?
  judge_hint =
    if default_judge_error
      # The judge that failed to load is the one named by `cli.judge_name`
      # (that is what was passed to `hydrate`), so report that name here and
      # append the underlying error for context.
      " Configure '#{cli.judge_name}' or pass --judge with an LLM config name." \
        "\n\n#{default_judge_error}"
    else
      " Pass --judge with an LLM config name."
    end

  puts "Error: Selected evaluations require a judge.#{judge_hint}"
  exit 1
end
# Comparison mode: hand everything to the ComparisonRunner and stop. A
# ComparisonError is reported on stdout with exit status 1; a clean run exits
# with status 0 so the per-persona loop below is never reached.
if cli.comparison_mode
  begin
    comparison_runner =
      DiscourseAi::Evals::ComparisonRunner.new(
        mode: cli.comparison_mode,
        judge_llm: judge_llm,
        output: $stdout,
      )

    comparison_runner.run(
      eval_cases: selected_evals,
      persona_variants: persona_variants,
      llms: llms,
    )
  rescue DiscourseAi::Evals::ComparisonRunner::ComparisonError => comparison_error
    puts "Error: #{comparison_error.message}"
    exit 1
  end

  exit 0
end
# Standard (non-comparison) run: every selected eval is executed against every
# chosen LLM, once per persona variant.
persona_variants.each do |persona|
  persona_key = persona[:key]

  # Variants without a key get no banner.
  if persona_key
    # NOTE(review): `default_persona_key` is not assigned anywhere in the
    # visible part of this script; confirm it is provided elsewhere (e.g. by a
    # required file), otherwise this line raises NameError for keyed variants.
    heading = persona_key == default_persona_key ? "default (built-in)" : persona_key
    puts "\n=== Persona: #{heading} ==="
  end

  workbench =
    DiscourseAi::Evals::Workbench.new(
      output: $stdout,
      judge_llm: judge_llm,
      persona_prompt: persona[:prompt],
      persona_label: persona_key,
    )

  selected_evals.each do |eval_case|
    # Let the user know when an explicitly passed --judge has no effect.
    judge_unused = cli.judge_provided && eval_case.judge.blank?
    if judge_unused
      puts "Notice: Eval '#{eval_case.id}' has no judge block. --judge is ignored for this eval."
    end

    workbench.run(eval_case: eval_case, llms: llms)
  end
end