discourse/plugins/discourse-ai/evals/run
Sam e3fae646d4
DEV: AI persona to agent migration (#38319)
Co-authored-by: Keegan George <kgeorge13@gmail.com>
2026-03-10 15:59:45 +11:00

97 lines
2.3 KiB
Ruby
Executable file
Vendored

#!/usr/bin/env ruby
# frozen_string_literal: true
require_relative "lib/boot"
require_relative "lib/llm_repository"
require_relative "lib/eval"
require_relative "lib/prompts/prompt_evaluator"
require_relative "lib/prompts/single_test_runner"
require_relative "lib/features"
require_relative "lib/recorder"
require_relative "lib/workbench"
require_relative "lib/agent_prompt_loader"
require_relative "lib/cli"
# Bootstrap: the feature registry is built first because option parsing
# receives it as an argument.
features_registry =
  DiscourseAi::Evals::Features.new(modules: DiscourseAi::Configuration::Module.all)
cli = DiscourseAi::Evals::Cli.parse_options!(features_registry)
llm_repository = DiscourseAi::Evals::LlmRepository.new

# Listing modes: print the requested catalogue and exit with status 0 without
# running any evals. The checks run in a fixed order (models, features,
# agents, evals) and each later collaborator is only built once the earlier
# listing flags have been ruled out.
if cli.list_models
  llm_repository.print
  exit 0
elsif cli.list_features
  features_registry.print
  exit 0
end

agent_loader = DiscourseAi::Evals::AgentPromptLoader.new
if cli.list_agents
  agent_loader.print
  exit 0
end

available_evals = DiscourseAi::Evals::Eval.available_cases
if cli.list
  available_evals.each { |eval_case| eval_case.print }
  exit 0
end
# Resolve the models requested on the command line. With no match there is
# nothing to evaluate, so fail fast: `abort` writes the message to stderr
# (keeping it out of the eval output written to stdout) and exits with
# status 1.
llms = llm_repository.choose(cli.models)
abort "Error: Unknown models '#{cli.models}'" if llms.empty?

# Pick the eval cases to run: a dataset CSV, when a path was supplied, takes
# precedence over selecting from the available eval cases.
selected_evals =
  if cli.dataset_path.present?
    DiscourseAi::Evals::Eval.from_dataset_csv(path: cli.dataset_path, feature: cli.feature_key)
  else
    cli.select_evals(available_evals)
  end
# Judge resolution.
#
# A judge LLM is looked up when one was named on the command line, or when at
# least one selected eval declares a judge (in which case the CLI's
# DEFAULT_JUDGE is used as a fallback name).
judge_llm = nil
default_judge_error = nil
requires_judge = selected_evals.any? { |eval_case| eval_case.judge.present? }
judge_name =
  cli.judge_name.presence || (DiscourseAi::Evals::Cli::DEFAULT_JUDGE if requires_judge)

if judge_name.present?
  begin
    judge_llm = llm_repository.hydrate(judge_name)
  rescue StandardError => e
    # When `cli.judge_provided?` is true (presumably: the judge was explicitly
    # requested by the user -- confirm against Cli), the failure is fatal:
    # report it on stderr and exit with status 1.
    abort "Error: #{e.message}" if cli.judge_provided?

    # Otherwise keep going without a judge (judge_llm is still nil because the
    # assignment above never completed) and remember why, so the message can
    # be handed to cli.validate_judge_presence! later in the script.
    default_judge_error = e.message
  end
end
# Resolve the agent variants for the requested agent keys and comparison mode.
agent_variants =
  agent_loader.variants_for(cli.agent_keys, comparison_mode: cli.comparison_mode)

# Let the CLI validate the combination of models, agent variants and judge
# before any eval is executed.
cli.validate_comparison_requirements!(llms: llms, agent_variants: agent_variants)
cli.validate_judge_presence!(
  requires_judge: requires_judge,
  judge_llm: judge_llm,
  default_judge_error: default_judge_error,
)

# Run the selected evals against every chosen model, writing to stdout.
workbench =
  DiscourseAi::Evals::Workbench.new(
    output: $stdout,
    judge_llm: judge_llm,
    agent_variants: agent_variants,
    comparison: cli.comparison_mode,
  )
workbench.run_evals(eval_cases: selected_evals, llms: llms, agent_variants: agent_variants)