mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-06-19 03:05:45 +08:00
Introduce an "agentic" execution mode as an alternative to the default fixed-turn/tool-limit approach. In agentic mode, personas use a configurable token budget (`max_turn_tokens`) to govern how long a tool-use session can run, with automatic context compression when the conversation exceeds a configurable threshold percentage (`compression_threshold`) of the model's context window. Key changes: - Add `execution_mode`, `max_turn_tokens`, and `compression_threshold` columns to `ai_personas` via migration - Refactor `Bot#reply` to support token-budget loop control with a thread-local token accumulator, budget exhaustion hints, and a safety valve at 100 completions - Add `maybe_compress_context` which summarizes middle conversation messages when token usage crosses the compression threshold, preserving system prompt and recent tail messages - Update `StreamReplyCustomToolsSession` to track accumulated tokens across rounds and handle budget exhaustion in the custom tools path - Discount cached tokens (Anthropic) in the token accumulator to avoid over-counting reused KV cache prefixes - Update persona editor UI with execution mode selector and conditional fields (agentic shows token budget/compression; default shows max context posts)
84 lines
2.8 KiB
Ruby
Vendored
84 lines
2.8 KiB
Ruby
Vendored
# frozen_string_literal: true

# Load the eval harness classes exercised by this spec.
require_relative "../../evals/lib/judge"
require_relative "../../evals/lib/eval"

# Specs for DiscourseAi::Evals::Judge: single-output evaluation (#evaluate)
# and multi-candidate comparison (#compare). The LLM is replaced by a spy so
# no real completion request is made.
RSpec.describe DiscourseAi::Evals::Judge do
  subject(:judge) { described_class.new(eval_case: eval_case, judge_llm: judge_llm) }

  # Stand-in eval case exposing only the attributes stubbed here; the judge
  # config uses a pass threshold of 7.
  let(:eval_case) do
    instance_double(
      DiscourseAi::Evals::Eval,
      id: "example",
      args: {
        input: "Source text",
      },
      judge: {
        criteria: "Score the candidate output based on how well it explains the input.",
        pass_rating: 7,
      },
    )
  end

  # A spy (rather than a plain double) so the arguments passed to `generate`
  # can be verified after the fact with `have_received`.
  let(:llm_proxy) { instance_spy(DiscourseAi::Completions::Llm) }
  let(:judge_llm) { instance_double(LlmModel, to_llm: llm_proxy) }
  # Default canned judge reply: rating 8, which is above the threshold of 7.
  let(:judge_response) { { "rating" => 8, "explanation" => "Looks good" }.to_json }

  before { allow(llm_proxy).to receive(:generate).and_return(judge_response) }

  it "returns a passing result when the rating meets the threshold" do
    expect(judge.evaluate("great output")[:result]).to eq(:pass)
  end

  it "returns a failing result when the rating is below the threshold" do
    # Override the default reply with a rating (5) under the threshold (7).
    allow(llm_proxy).to receive(:generate).and_return(
      { "rating" => 5, "explanation" => "bad" }.to_json,
    )

    result = judge.evaluate("bad output")

    expect(result[:result]).to eq(:fail)
    expect(result[:message]).to include("below threshold")
  end

  it "substitutes placeholders from hash results" do
    judge.evaluate({ result: "hash-output" })

    # The value under :result must appear somewhere in the prompt sent to the
    # LLM, and the call must carry the expected keyword arguments.
    expect(llm_proxy).to have_received(:generate).with(
      satisfy { |prompt| prompt.messages.any? { |msg| msg[:content].include?("hash-output") } },
      user: Discourse.system_user,
      temperature: 0,
      response_format: DiscourseAi::Evals::Judge::RESPONSE_FORMAT,
      execution_context: nil,
    )
  end

  describe "#compare" do
    it "requests a structured comparison and returns parsed ratings" do
      # The canned reply names the winner by position ("Candidate 2"); the
      # expectations below require it to be reported by its label ("custom").
      comparison_payload = {
        "winner" => "Candidate 2",
        "winner_explanation" => "more accurate",
        "ratings" => [
          { "candidate" => "default", "rating" => 9, "explanation" => "complete" },
          { "candidate" => "custom", "rating" => 6, "explanation" => "missed details" },
        ],
      }.to_json

      allow(llm_proxy).to receive(:generate).and_return(comparison_payload)

      result = judge.compare([{ label: "default", output: "A" }, { label: "custom", output: "B" }])

      # The prompt must present the second candidate as "Candidate 2 (custom):".
      expect(llm_proxy).to have_received(:generate).with(
        satisfy do |prompt|
          prompt.messages.any? { |msg| msg[:content].include?("Candidate 2 (custom):") }
        end,
        user: Discourse.system_user,
        temperature: 0,
        response_format: DiscourseAi::Evals::Judge::COMPARISON_RESPONSE_FORMAT,
        execution_context: nil,
      )
      expect(result[:winner]).to eq("custom")
      expect(result[:ratings].map { |entry| entry[:candidate] }).to match_array(%w[default custom])
    end
  end
end