discourse/plugins/discourse-ai/spec/evals/judge_spec.rb
Sam b8abe100c5
FEATURE: add agentic execution mode for AI personas (#38230)
Introduce an "agentic" execution mode as an alternative to the
default fixed-turn/tool-limit approach. In agentic mode, personas
use a configurable token budget (`max_turn_tokens`) to govern how
long a tool-use session can run, with automatic context compression
when the conversation exceeds a configurable threshold percentage
(`compression_threshold`) of the model's context window.

Key changes:

- Add `execution_mode`, `max_turn_tokens`, and `compression_threshold`
  columns to `ai_personas` via migration
- Refactor `Bot#reply` to support token-budget loop control with a
  thread-local token accumulator, budget exhaustion hints, and a
  safety valve at 100 completions
- Add `maybe_compress_context` which summarizes middle conversation
  messages when token usage crosses the compression threshold,
  preserving system prompt and recent tail messages
- Update `StreamReplyCustomToolsSession` to track accumulated tokens
  across rounds and handle budget exhaustion in the custom tools path
- Discount cached tokens (Anthropic) in the token accumulator to
  avoid over-counting reused KV cache prefixes
- Update persona editor UI with execution mode selector and
  conditional fields (agentic shows token budget/compression;
  default shows max context posts)
2026-03-05 15:06:54 +11:00

84 lines
2.8 KiB
Ruby
Vendored

# frozen_string_literal: true
require_relative "../../evals/lib/judge"
require_relative "../../evals/lib/eval"
# Specs for DiscourseAi::Evals::Judge. Every collaborator is a test double:
# the eval case and the judge LLM are stubbed, and the canned JSON returned
# from `generate` determines the outcome asserted by each example.
RSpec.describe DiscourseAi::Evals::Judge do
  subject(:judge) do
    described_class.new(eval_case: eval_case, judge_llm: judge_llm)
  end

  # Eval case double; the examples below use ratings on either side of
  # pass_rating (8 is expected to pass, 5 is expected to fail).
  let(:eval_case) do
    judge_config = {
      criteria: "Score the candidate output based on how well it explains the input.",
      pass_rating: 7,
    }

    instance_double(
      DiscourseAi::Evals::Eval,
      id: "example",
      args: {
        input: "Source text",
      },
      judge: judge_config,
    )
  end

  # Spy standing in for the LLM so the arguments passed to `generate` can be
  # inspected after the fact.
  let(:llm_proxy) { instance_spy(DiscourseAi::Completions::Llm) }

  # Model double whose `to_llm` hands back the spy above.
  let(:judge_llm) { instance_double(LlmModel, to_llm: llm_proxy) }

  # Default canned reply: a rating of 8.
  let(:judge_response) { { "rating" => 8, "explanation" => "Looks good" }.to_json }

  before do
    allow(llm_proxy).to receive(:generate).and_return(judge_response)
  end

  it "returns a passing result when the rating meets the threshold" do
    outcome = judge.evaluate("great output")

    expect(outcome[:result]).to eq(:pass)
  end

  it "returns a failing result when the rating is below the threshold" do
    # Override the default stub with a rating of 5.
    low_score = { "rating" => 5, "explanation" => "bad" }.to_json
    allow(llm_proxy).to receive(:generate).and_return(low_score)

    outcome = judge.evaluate("bad output")

    expect(outcome[:result]).to eq(:fail)
    expect(outcome[:message]).to include("below threshold")
  end

  it "substitutes placeholders from hash results" do
    judge.evaluate({ result: "hash-output" })

    # Matches a prompt in which at least one message contains the hash's value.
    prompt_with_output =
      satisfy do |prompt|
        prompt.messages.any? { |message| message[:content].include?("hash-output") }
      end

    expect(llm_proxy).to have_received(:generate).with(
      prompt_with_output,
      user: Discourse.system_user,
      temperature: 0,
      response_format: DiscourseAi::Evals::Judge::RESPONSE_FORMAT,
      execution_context: nil,
    )
  end

  describe "#compare" do
    it "requests a structured comparison and returns parsed ratings" do
      ratings = [
        { "candidate" => "default", "rating" => 9, "explanation" => "complete" },
        { "candidate" => "custom", "rating" => 6, "explanation" => "missed details" },
      ]
      payload = {
        "winner" => "Candidate 2",
        "winner_explanation" => "more accurate",
        "ratings" => ratings,
      }.to_json
      allow(llm_proxy).to receive(:generate).and_return(payload)

      candidates = [{ label: "default", output: "A" }, { label: "custom", output: "B" }]
      comparison = judge.compare(candidates)

      # Matches a prompt that presents the second candidate under its label.
      prompt_listing_second_candidate =
        satisfy do |prompt|
          prompt.messages.any? { |message| message[:content].include?("Candidate 2 (custom):") }
        end

      expect(llm_proxy).to have_received(:generate).with(
        prompt_listing_second_candidate,
        user: Discourse.system_user,
        temperature: 0,
        response_format: DiscourseAi::Evals::Judge::COMPARISON_RESPONSE_FORMAT,
        execution_context: nil,
      )

      # The stubbed winner "Candidate 2" is expected to come back as the
      # label of the second candidate.
      expect(comparison[:winner]).to eq("custom")

      rated_candidates = comparison[:ratings].map { |rating| rating[:candidate] }
      expect(rated_candidates).to contain_exactly("default", "custom")
    end
  end
end