discourse/plugins/discourse-ai/spec/evals/workbench_compare_spec.rb

# frozen_string_literal: true

require "ostruct"

require_relative "../../evals/lib/workbench"
require_relative "../../evals/lib/judge"
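
# Exercises Workbench#compare in both comparison modes: judge-ranked (agent
# variants scored via Judge#compare) and expected-output (pass/fail results
# aggregated per LLM), asserting what gets announced through the Recorder.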
RSpec.describe DiscourseAi::Evals::Workbench do
  let(:output) { StringIO.new }

  let(:eval_case) do
    OpenStruct.new(
      id: "topic-summary",
      judge: {
        criteria: "accuracy",
        pass_rating: 8,
      },
      feature: "dummy",
      args: {
      },
    )
  end

  let(:llm_one) { Fabricate(:fake_model, display_name: "LLM One") }
  let(:llm_two) { Fabricate(:fake_model, display_name: "LLM Two") }
  let(:agent_variants) { [{ key: "default", prompt: nil }, { key: "custom", prompt: "prompt" }] }

  let(:formatter) do
    instance_double(DiscourseAi::Evals::ConsoleFormatter, announce_start: nil, finalize: nil)
  end
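
  # Judged mode: a judge LLM is configured, so #compare runs each agent
  # variant, hands both outputs to Judge#compare, and announces the judged
  # winner through the recorder.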
describe "#compare with judge in agent mode" do
let(:judge_llm) { Fabricate(:fake_model) }
let(:workbench) { described_class.new(output: output, judge_llm: judge_llm, comparison: true) }
let(:recorder) do
instance_double(
DiscourseAi::Evals::Recorder,
execution_context: nil,
record_llm_results: nil,
record_llm_skip: nil,
announce_comparison_judged: nil,
announce_comparison_expected: nil,
announce_comparison_aggregate: nil,
finish: nil,
)
end
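
    # execute_eval yields one stubbed result per agent variant in call order,
    # and Judge#compare is stubbed to rank "custom" above "default".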
    before do
      allow(DiscourseAi::Evals::ConsoleFormatter).to receive(:new).and_return(formatter)
      allow(DiscourseAi::Evals::Recorder).to receive(:with_cassette).and_return(recorder)

      allow(workbench).to receive(:execute_eval).and_return(
        { raw: "default out", raw_entries: ["default out"], classified: [{ result: :pass }] },
        { raw: "custom out", raw_entries: ["custom out"], classified: [{ result: :pass }] },
      )

      allow_any_instance_of(DiscourseAi::Evals::Judge).to receive(:compare).and_return(
        winner: "custom",
        winner_label: "Candidate 2",
        ratings: [
          { candidate: "default", rating: 6, explanation: "ok" },
          { candidate: "custom", rating: 9, explanation: "great" },
        ],
        winner_explanation: "better",
      )
    end
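
    # The judge's winner, ratings, and explanation should pass through to the
    # recorder verbatim, with each candidate labelled by its variant key.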
it "announces judged comparison with resolved winner labels" do
workbench.compare(
eval_cases: [eval_case],
llms: [llm_one],
agent_variants: agent_variants,
formatter: formatter,
)
expect(recorder).to have_received(:announce_comparison_judged).with(
eval_case_id: "topic-summary",
mode_label: "agents",
agent_key: "default",
result:
a_hash_including(
winner: "custom",
winner_label: "Candidate 2",
winner_explanation: "better",
ratings: [
{ candidate: "default", rating: 6, explanation: "ok" },
{ candidate: "custom", rating: 9, explanation: "great" },
],
),
candidates:
a_collection_containing_exactly(
a_hash_including(label: "default", display_label: "default"),
a_hash_including(label: "custom", display_label: "custom"),
),
)
end
end
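
  # Expected-output mode: no judge LLM is configured, so #compare scores
  # candidates purely from their classified pass/fail entries.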
describe "#compare expected-output aggregate" do
let(:workbench) { described_class.new(output: output, judge_llm: nil, comparison: :llms) }
let(:recorder) do
instance_double(
DiscourseAi::Evals::Recorder,
execution_context: nil,
record_llm_results: nil,
record_llm_skip: nil,
announce_comparison_judged: nil,
announce_comparison_expected: nil,
announce_comparison_aggregate: nil,
finish: nil,
)
end
let(:eval_case) { OpenStruct.new(id: "spam_eval", judge: nil, args: nil, feature: "dummy") }
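
    # The Judge must never be instantiated here; execute_eval returns an
    # all-pass result for LLM One and one failing entry for LLM Two.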
    before do
      allow(DiscourseAi::Evals::ConsoleFormatter).to receive(:new).and_return(formatter)
      allow(DiscourseAi::Evals::Recorder).to receive(:with_cassette).and_return(recorder)
      allow(DiscourseAi::Evals::Judge).to receive(:new).and_raise("judge should not be called")

      allow(workbench).to receive(:execute_eval).and_return(
        { raw: "out one", raw_entries: ["out one"], classified: [{ result: :pass }] },
        {
          raw: "out two",
          raw_entries: ["out two"],
          classified: [
            { result: :pass },
            { result: :fail, expected_output: "true", actual_output: "false" },
          ],
        },
      )
    end
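
    # LLM One passes every classified entry, so it wins the eval and the
    # aggregate credits it one eval and one pass; LLM Two gets zero passes.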
it "announces expected comparison and aggregates totals across evals" do
workbench.compare(
eval_cases: [eval_case],
llms: [llm_one, llm_two],
agent_variants: [{ key: "default", prompt: nil }],
formatter: formatter,
)
expect(recorder).to have_received(:announce_comparison_expected).with(
eval_case_id: "spam_eval",
mode_label: "LLMs",
agent_key: "default",
winner: "LLM One",
status_line: "LLM One 🟢 -- LLM Two 🔴",
failures: [{ label: "LLM Two", expected: "true", actual: "false" }],
candidates:
a_collection_containing_exactly(
a_hash_including(label: "LLM One", display_label: "LLM One"),
a_hash_including(label: "LLM Two", display_label: "LLM Two"),
),
)
expect(recorder).to have_received(:announce_comparison_aggregate).with(
mode_label: "LLMs",
agent_key: "default",
aggregate_scores: {
"LLM One" => {
evals: 1,
passes: 1,
},
"LLM Two" => {
evals: 1,
passes: 0,
},
},
)
end
end
end