mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-05-26 21:18:34 +08:00
97 lines
2.3 KiB
Ruby
Executable file
Vendored
97 lines
2.3 KiB
Ruby
Executable file
Vendored
#!/usr/bin/env ruby
# frozen_string_literal: true

# Entry point for the evals runner.
#
# Flow:
#   1. Parse CLI options against the features registry.
#   2. Serve the listing options (models, features, agents, evals); each one
#      prints and exits with status 0 without running anything.
#   3. Resolve the target LLMs, the eval cases, an optional judge LLM and the
#      agent variants.
#   4. Let the CLI validate the combination, then run everything through the
#      Workbench, writing to $stdout.

require_relative "lib/boot"
require_relative "lib/llm_repository"
require_relative "lib/eval"
require_relative "lib/prompts/prompt_evaluator"
require_relative "lib/prompts/single_test_runner"
require_relative "lib/features"
require_relative "lib/recorder"
require_relative "lib/workbench"
require_relative "lib/agent_prompt_loader"
require_relative "lib/cli"

feature_registry =
  DiscourseAi::Evals::Features.new(modules: DiscourseAi::Configuration::Module.all)
cli = DiscourseAi::Evals::Cli.parse_options!(feature_registry)

llm_repo = DiscourseAi::Evals::LlmRepository.new

# --- Listing options: print the requested catalogue and stop. ---------------

if cli.list_models
  llm_repo.print
  exit(0)
end

if cli.list_features
  feature_registry.print
  exit(0)
end

prompt_loader = DiscourseAi::Evals::AgentPromptLoader.new

if cli.list_agents
  prompt_loader.print
  exit(0)
end

eval_catalog = DiscourseAi::Evals::Eval.available_cases

if cli.list
  eval_catalog.each(&:print)
  exit(0)
end

# --- Target models ----------------------------------------------------------

target_llms = llm_repo.choose(cli.models)

if target_llms.empty?
  puts "Error: Unknown models '#{cli.models}'"
  exit(1)
end

# --- Eval cases: a dataset CSV, when given, replaces the CLI selection. ------

eval_cases =
  if cli.dataset_path.blank?
    cli.select_evals(eval_catalog)
  else
    DiscourseAi::Evals::Eval.from_dataset_csv(path: cli.dataset_path, feature: cli.feature_key)
  end

# --- Judge LLM --------------------------------------------------------------

judge = nil
fallback_judge_error = nil

judge_needed = eval_cases.any? { |candidate| candidate.judge.present? }

# `presence` yields nil for a blank name, so `||=` only fills in the default
# when no usable judge name came from the CLI and some eval declares a judge.
judge_id = cli.judge_name.presence
judge_id ||= DiscourseAi::Evals::Cli::DEFAULT_JUDGE if judge_needed

if judge_id.present?
  begin
    judge = llm_repo.hydrate(judge_id)
  rescue StandardError => hydrate_error
    # A judge the CLI reports as explicitly provided must resolve: fail fast.
    if cli.judge_provided?
      puts "Error: #{hydrate_error.message}"
      exit(1)
    end

    # Otherwise keep going without a judge (it is still nil here) and hand the
    # failure message to the judge-presence validation below.
    fallback_judge_error = hydrate_error.message
  end
end

# --- Agent variants and validation ------------------------------------------

variants = prompt_loader.variants_for(cli.agent_keys, comparison_mode: cli.comparison_mode)

cli.validate_comparison_requirements!(llms: target_llms, agent_variants: variants)
cli.validate_judge_presence!(
  requires_judge: judge_needed,
  judge_llm: judge,
  default_judge_error: fallback_judge_error,
)

# --- Run --------------------------------------------------------------------

workbench =
  DiscourseAi::Evals::Workbench.new(
    output: $stdout,
    judge_llm: judge,
    agent_variants: variants,
    comparison: cli.comparison_mode,
  )

workbench.run_evals(eval_cases: eval_cases, llms: target_llms, agent_variants: variants)