discourse/plugins/discourse-ai/spec/evals/workbench_spec.rb
Natalie Tay fbcbdc46d8
FIX: Data explorer agent reliability for schema and plurality (#40152)
Reported through a few user tests, the agent was unreliable in four
ways:
- it used `tags.tag_name` instead of `tags.name`
- it did not use `current_user_id` for "my posts" prompts
- it treated plural nouns as singular
- it used unparseable date defaults like "today".

A few issues were behind this:
- The `DbSchema` tool was returning a dense, comma-separated, one-line-per-table
format that qwen was unable to deal with. It is now line-per-column, so schema
accuracy, which was originally flaky, is now 5/5 PASSING on qwen and Gemini.
- The prompt was teaching the wrong thing: the `-- null boolean
:opt_flag = #null` example made models use `#null` as a default value.
We now have a "Parameter rules" section, ISO date examples that match
the "no natural-language defaults" rule below them, explicit
`current_user_id` guidance for first-person prompts, and a plural-noun
rule that applies to each plural noun independently in the same prompt
(e.g. "categories and tags" → BOTH list params, not one of each).
- Eval runner now captures `name` and `description` separately, not just
`sql`. The description text is graded directly rather than grading the
SQL string.

Tested against qwen 3.5 122B (our hosted model) + Gemini 3.1 Flash Lite
(judge GPT-5.2): 20/20 each. New eval cases ship in this PR
https://github.com/discourse/discourse-ai-evals/pull/18
2026-05-19 16:23:17 +08:00

322 lines
10 KiB
Ruby
Vendored

# frozen_string_literal: true
require_relative "../../evals/lib/workbench"
require_relative "../../evals/lib/eval"
require_relative "../../evals/lib/llm_repository"
require_relative "../../evals/lib/recorder"
RSpec.describe DiscourseAi::Evals::Workbench do
subject(:workbench) { described_class.new(output: output) }

# In-memory IO handed to the workbench as its output stream.
let(:output) { StringIO.new }

# Capability toggles; nested contexts override these to drive the vision check.
let(:requires_vision) { false }
let(:llm_supports_vision) { true }

# Formatter double whose `announce_start` and `finalize` are stubbed to return nil.
let(:formatter) do
  instance_double(
    DiscourseAi::Evals::ConsoleFormatter,
    announce_start: nil,
    finalize: nil,
  )
end

# Recorder double; every stubbed message returns nil so examples can spy on calls.
let(:recorder) do
  instance_double(
    DiscourseAi::Evals::Recorder,
    execution_context: nil,
    record_llm_skip: nil,
    record_llm_results: nil,
    finish: nil,
  )
end

# Minimal eval double: a custom prompt feature with no expectations and no judge.
let(:eval_case) do
  instance_double(
    DiscourseAi::Evals::Eval,
    id: "example-eval",
    vision: requires_vision,
    feature: "custom:prompt",
    args: {},
    expected_output: nil,
    expected_output_regex: nil,
    expected_tool_call: nil,
    judge: nil,
  )
end

# Fake model whose vision support follows `llm_supports_vision`.
let(:llm) do
  Fabricate.build(
    :fake_model,
    display_name: "gpt-4",
    name: "gpt-4",
    vision_enabled: llm_supports_vision,
  )
end

before do
  # Frozen clock keeps the `Time.now.utc` arguments asserted in the examples stable.
  freeze_time

  # Route formatter construction and recorder lookup to the doubles above.
  allow(DiscourseAi::Evals::ConsoleFormatter).to receive(:new).and_return(formatter)
  allow(DiscourseAi::Evals::Recorder).to receive(:with_cassette).and_return(recorder)
end
# Covers `run_evals`: the happy path, the vision-capability skip, and the two
# error-recording paths. The recorder and formatter are the doubles defined in
# the shared setup, so every assertion is a spy on those doubles.
# NOTE(review): the group is labelled "#run" while the examples call
# `run_evals` — confirm which name is intended.
describe "#run" do
  it "records results for each llm" do
    # The subject itself is stubbed so no real eval executes; keep the cop
    # suppression limited to this one statement.
    # rubocop:disable RSpec/SubjectStub
    allow(workbench).to receive(:execute_eval).and_return(
      { raw: "output", raw_entries: ["output"], classified: [{ result: :pass }] },
    )
    # rubocop:enable RSpec/SubjectStub

    workbench.run_evals(eval_cases: [eval_case], llms: [llm])

    expect(DiscourseAi::Evals::Recorder).to have_received(:with_cassette).with(
      eval_case,
      output: output,
      total_targets: 1,
      agent_key: :default,
      formatter: formatter,
      announce_formatter: false,
      finalize_formatter: false,
    )
    # `Time.now.utc` is comparable here because the shared setup freezes time.
    expect(recorder).to have_received(:record_llm_results).with(
      "gpt-4",
      [{ result: :pass }],
      Time.now.utc,
      raw_entries: ["output"],
      display_label: "gpt-4",
      row_prefix: "example-eval",
    )
    expect(recorder).to have_received(:finish)
    expect(formatter).to have_received(:finalize)
  end

  context "when the eval requires vision but the llm does not support it" do
    let(:requires_vision) { true }
    let(:llm_supports_vision) { false }

    it "skips the llm and records the reason" do
      workbench.run_evals(eval_cases: [eval_case], llms: [llm])

      expect(recorder).to have_received(:record_llm_skip).with(
        "gpt-4",
        "LLM does not support vision",
        display_label: "gpt-4",
        row_prefix: "example-eval",
      )
      expect(recorder).to have_received(:finish)
    end
  end

  context "when eval execution raises an EvalError" do
    it "records the failure with the error context" do
      error = DiscourseAi::Evals::Eval::EvalError.new("boom", { foo: "bar" })
      allow(workbench).to receive(:execute_eval).and_raise(error) # rubocop:disable RSpec/SubjectStub

      workbench.run_evals(eval_cases: [eval_case], llms: [llm])

      # The error's message and context hash are both expected in the recorded failure.
      expect(recorder).to have_received(:record_llm_results).with(
        "gpt-4",
        [{ result: :fail, message: "boom", context: { foo: "bar" } }],
        Time.now.utc,
        display_label: "gpt-4",
        row_prefix: "example-eval",
      )
      expect(recorder).to have_received(:finish)
    end
  end

  context "when eval execution raises an unexpected error" do
    it "records the failure with the exception message" do
      allow(workbench).to receive(:execute_eval).and_raise(StandardError.new("kaboom")) # rubocop:disable RSpec/SubjectStub

      workbench.run_evals(eval_cases: [eval_case], llms: [llm])

      # A plain StandardError is recorded with its message only (no context key).
      expect(recorder).to have_received(:record_llm_results).with(
        "gpt-4",
        [{ message: "kaboom", result: :fail }],
        Time.now.utc,
        display_label: "gpt-4",
        row_prefix: "example-eval",
      )
    end
  end
end
# Runs `execute_eval` end to end for individual features, with LLM responses
# supplied through `with_prepared_responses`.
describe "feature execution flows" do
  fab!(:category)

  let(:llm) { Fabricate(:fake_model, display_name: "Fake Eval Model", vision_enabled: false) }
  let(:workbench) { described_class.new(output: output) }

  before do
    enable_current_plugin

    # Seed the system agents these features use, then flush the agent cache.
    [
      DiscourseAi::Agents::Summarizer,
      DiscourseAi::Agents::ShortSummarizer,
      DiscourseAi::Agents::SpamDetector,
    ].each { |agent_class| ensure_system_agent(agent_class) }

    AiAgent.agent_cache.flush!
  end

  it "generates topic summaries using the summarization eval feature" do
    summary_case =
      OpenStruct.new(
        id: "topic-summary",
        feature: "summarization:topic_summaries",
        args: {
          input: "First post\nSecond post",
        },
        expected_output: "Concise summary",
        expected_output_regex: nil,
        expected_tool_call: nil,
        judge: nil,
      )

    outcome =
      DiscourseAi::Completions::Llm.with_prepared_responses(["Concise summary"]) do
        workbench.execute_eval(summary_case, llm)
      end

    first_verdict = outcome[:classified].first
    expect(first_verdict[:result]).to eq(:pass)
  end

  it "always provides an execution_context to runners even when caller omits it" do
    context_case =
      OpenStruct.new(
        id: "ctx-check",
        feature: "summarization:topic_summaries",
        args: {
          input: "Test post",
        },
        expected_output: nil,
        expected_output_regex: nil,
        expected_tool_call: nil,
        judge: nil,
        vision: false,
      )

    seen_context = nil

    # Wrap Bot.as so each bot it returns has `reply` wrapped too; the inner
    # wrapper records the execution_context keyword and then calls through.
    allow(DiscourseAi::Agents::Bot).to receive(
      :as,
    ).and_wrap_original do |build_bot, *build_args, **build_kwargs|
      bot = build_bot.call(*build_args, **build_kwargs)

      allow(bot).to receive(
        :reply,
      ).and_wrap_original do |real_reply, *reply_args, **reply_kwargs, &reply_block|
        seen_context = reply_kwargs[:execution_context]
        real_reply.call(*reply_args, **reply_kwargs, &reply_block)
      end

      bot
    end

    DiscourseAi::Completions::Llm.with_prepared_responses(["Summary"]) do
      workbench.execute_eval(context_case, llm)
    end

    expect(seen_context).to be_a(DiscourseAi::Completions::ExecutionContext)
  end

  it "flags spam posts via the spam inspection eval feature" do
    spam_case =
      OpenStruct.new(
        id: "spam-eval",
        feature: "spam:inspect_posts",
        args: {
          input: "Buy now click now http://spam.test",
          topic_title: "Limited offer",
        },
        expected_output: "true",
        expected_output_regex: nil,
        expected_tool_call: nil,
        judge: nil,
      )

    outcome =
      DiscourseAi::Completions::Llm.with_prepared_responses([true, "obvious spam"]) do
        workbench.execute_eval(spam_case, llm)
      end

    first_verdict = outcome[:classified].first
    expect(first_verdict[:result]).to eq(:pass)
  end
end
# Finds or initializes the AiAgent record for a system agent class, copies the
# class's prompt settings onto it, and saves it with `validate: false`.
#
# `priority`, `name`, `description` and `tools` are only filled in when the
# record has no truthy value for them; every other attribute is overwritten.
#
# @param agent_class [Class] key into `DiscourseAi::Agents::Agent.system_agents`
# @return [AiAgent] the saved record
def ensure_system_agent(agent_class)
  system_id = DiscourseAi::Agents::Agent.system_agents[agent_class]
  defaults = agent_class.new
  record = AiAgent.find_or_initialize_by(id: system_id)

  record.system = true
  record.enabled = true

  # Keep any values already present on the record.
  record.priority ||= false
  record.name ||= agent_class.name
  record.description ||= agent_class.description

  record.system_prompt = defaults.system_prompt
  record.allowed_group_ids = [Group::AUTO_GROUPS[:everyone]]
  record.response_format = defaults.response_format
  record.examples = defaults.examples

  # Sampling settings are optional on the agent class; nil when it lacks them.
  record.temperature = (defaults.temperature if defaults.respond_to?(:temperature))
  record.top_p = (defaults.top_p if defaults.respond_to?(:top_p))

  record.show_thinking = true
  record.tools ||= []

  record.save!(validate: false)
  record
end
# Exercises the private `judge_result` method: the missing-judge error and the
# pass/fail outcome around the eval's `pass_rating` of 7.
describe "#judge_result" do
  let(:judge_llm) { Fabricate(:fake_model) }
  let(:workbench_with_judge) { described_class.new(output: output, judge_llm: judge_llm) }

  let(:judge_eval_case) do
    OpenStruct.new(
      id: "judge-eval",
      args: {
        input: "Source content",
      },
      judge: {
        criteria: "Score the output against the provided input, rewarding accuracy and clarity.",
        pass_rating: 7,
      },
    )
  end

  # Judges the fixed answer with a judge LLM primed to reply with the given
  # rating and explanation as JSON; returns whatever `judge_result` returns.
  def judged_answer(rating:, explanation:)
    judging_llm = judge_llm
    judging_workbench = workbench_with_judge
    verdict_json = { "rating" => rating, "explanation" => explanation }.to_json

    DiscourseAi::Completions::Llm.with_prepared_responses([verdict_json], llm: judging_llm) do
      judging_workbench.send(:judge_result, judge_eval_case, "answer")
    end
  end

  it "raises a helpful error when no judge llm is configured" do
    judging_without_llm = -> { workbench.send(:judge_result, judge_eval_case, "answer") }

    expect(&judging_without_llm).to raise_error(
      DiscourseAi::Evals::Eval::EvalError,
      /requires the --judge option/,
    )
  end

  it "returns a passing result when the rating meets the threshold" do
    verdict = judged_answer(rating: 8, explanation: "good")

    expect(verdict[:result]).to eq(:pass)
  end

  it "returns a failure when the rating is below the threshold" do
    verdict = judged_answer(rating: 5, explanation: "needs work")

    expect(verdict[:result]).to eq(:fail)
    expect(verdict[:message]).to include("LLM Rating below threshold")
  end
end
# Exercises the private `judge_input` method with and without metadata.
describe "#judge_input" do
  it "forwards `name` and `description` from metadata so the judge sees them" do
    metadata = { name: "My Query", description: "Counts users" }

    forwarded = workbench.send(:judge_input, "SELECT 1", metadata)

    expect(forwarded).to eq(result: "SELECT 1", name: "My Query", description: "Counts users")
  end

  it "returns the raw result when there is no metadata" do
    # Both nil and an empty hash count as "no metadata".
    [nil, {}].each do |blank_metadata|
      expect(workbench.send(:judge_input, "SELECT 1", blank_metadata)).to eq("SELECT 1")
    end
  end
end
end