mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-05-24 02:41:20 +08:00
Implemented an eval “comparison matrix” that lets you run the same evals across multiple personas or multiple LLMs and have a judge model declare a winner with per-candidate scores. The CLI adds --compare personas|llms, keeps persona selection (auto-prepending default for persona mode), and always ensures a judge is configured. A dedicated ComparisonRunner reuses Workbench results to build candidate outputs and sends them to Judge#compare, which crafts a rubric-aware comparison prompt and parses structured winner/ratings JSON. Outputs are streamed to the console and individual run logs still get written. README documents how to use the new flag and what each mode does.
130 lines
3 KiB
Ruby
Vendored
130 lines
3 KiB
Ruby
Vendored
#!/usr/bin/env ruby
|
|
# frozen_string_literal: true
|
|
|
|
require_relative "lib/boot"
|
|
require_relative "lib/llm_repository"
|
|
require_relative "lib/eval"
|
|
require_relative "lib/prompts/prompt_evaluator"
|
|
require_relative "lib/prompts/single_test_runner"
|
|
require_relative "lib/features"
|
|
require_relative "lib/recorder"
|
|
require_relative "lib/workbench"
|
|
require_relative "lib/persona_prompt_loader"
|
|
require_relative "lib/comparison_runner"
|
|
require_relative "lib/cli"
|
|
|
|
# Build the feature registry from the configured modules, then parse the command-line
# options against that registry.
feature_modules = DiscourseAi::Configuration::Module.all
features_registry = DiscourseAi::Evals::Features.new(modules: feature_modules)
cli = DiscourseAi::Evals::Cli.parse_options!(features_registry)
|
|
|
|
# Listing flags short-circuit the run: print the requested items and exit with status 0.
# Each collaborator is still constructed only after the preceding listing flags were handled.
print_and_exit = ->(printables) do
  printables.each(&:print)
  exit 0
end

llm_repository = DiscourseAi::Evals::LlmRepository.new
print_and_exit.call([llm_repository]) if cli.list_models

print_and_exit.call([features_registry]) if cli.list_features

persona_loader = DiscourseAi::Evals::PersonaPromptLoader.new
print_and_exit.call([persona_loader]) if cli.list_personas

available_evals = DiscourseAi::Evals::Eval.available_cases
print_and_exit.call(available_evals) if cli.list
|
|
|
|
# Resolve the requested models; stop with status 1 when the repository returns none.
requested_models = cli.models
llms = llm_repository.choose(requested_models)

if llms.empty?
  puts "Error: Unknown models '#{requested_models}'"
  exit 1
end

# Narrow the available eval cases down to the ones selected on the command line.
selected_evals = cli.select_evals(available_evals)
|
|
|
|
# Try to hydrate the judge LLM up front. A failure is fatal only when the user explicitly
# passed a judge; otherwise the error message is kept so it can be reported later if a
# judge turns out to be required.
judge_llm = nil
default_judge_error = nil

begin
  judge_llm = llm_repository.hydrate(cli.judge_name)
rescue StandardError => hydrate_error
  if cli.judge_provided
    puts "Error: #{hydrate_error.message}"
    exit 1
  end

  # judge_llm is still nil here because the assignment above never completed.
  default_judge_error = hydrate_error.message
end
|
|
|
|
# Persona variants to run, as resolved by the loader for the current comparison mode.
persona_variants =
  persona_loader.variants_for(cli.persona_keys, comparison_mode: cli.comparison_mode)

# A judge is needed for any comparison run, or when at least one selected eval declares a judge.
comparing = cli.comparison_mode.present?
requires_judge = comparing || selected_evals.any? { |candidate| candidate.judge.present? }
|
|
|
|
# Abort with status 1 when a judge is required but no judge LLM could be hydrated.
if requires_judge && judge_llm.nil?
  hint =
    if default_judge_error
      # The failed hydration was attempted with cli.judge_name, so the message names that
      # same config and then appends the underlying error text.
      " Configure '#{cli.judge_name}' or pass --judge with an LLM config name." \
        "\n\n#{default_judge_error}"
    else
      " Pass --judge with an LLM config name."
    end

  puts "Error: Selected evaluations require a judge.#{hint}"
  exit 1
end
|
|
|
|
# Comparison mode: hand everything to the ComparisonRunner and exit. A ComparisonError
# is reported on stdout and ends the script with status 1; success ends it with status 0.
if cli.comparison_mode
  begin
    comparison_runner =
      DiscourseAi::Evals::ComparisonRunner.new(
        mode: cli.comparison_mode,
        judge_llm: judge_llm,
        output: $stdout,
      )

    comparison_runner.run(
      eval_cases: selected_evals,
      persona_variants: persona_variants,
      llms: llms,
    )
  rescue DiscourseAi::Evals::ComparisonRunner::ComparisonError => comparison_error
    puts "Error: #{comparison_error.message}"
    exit 1
  end

  exit 0
end
|
|
|
|
# Standard (non-comparison) run: build one Workbench per persona variant and run every
# selected eval through it against the chosen LLMs.
persona_variants.each do |variant|
  persona_key = variant[:key]

  if persona_key
    # NOTE(review): `default_persona_key` is not assigned anywhere in this script; unless one
    # of the required files defines it, this comparison raises NameError - confirm.
    label = persona_key == default_persona_key ? "default (built-in)" : persona_key
    puts "\n=== Persona: #{label} ==="
  end

  workbench =
    DiscourseAi::Evals::Workbench.new(
      output: $stdout,
      judge_llm: judge_llm,
      persona_prompt: variant[:prompt],
      persona_label: persona_key,
    )

  selected_evals.each do |current_eval|
    # An explicitly passed judge has no effect on evals that declare no judge block.
    judge_unused = cli.judge_provided && current_eval.judge.blank?

    if judge_unused
      puts "Notice: Eval '#{current_eval.id}' has no judge block. --judge is ignored for this eval."
    end

    workbench.run(eval_case: current_eval, llms: llms)
  end
end
|