mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-05-07 03:59:09 +08:00
Introduce an "agentic" execution mode as an alternative to the default fixed-turn/tool-limit approach. In agentic mode, personas use a configurable token budget (`max_turn_tokens`) to govern how long a tool-use session can run, with automatic context compression when the conversation exceeds a configurable threshold percentage (`compression_threshold`) of the model's context window.

Key changes:

- Add `execution_mode`, `max_turn_tokens`, and `compression_threshold` columns to `ai_personas` via migration
- Refactor `Bot#reply` to support token-budget loop control with a thread-local token accumulator, budget exhaustion hints, and a safety valve at 100 completions
- Add `maybe_compress_context` which summarizes middle conversation messages when token usage crosses the compression threshold, preserving system prompt and recent tail messages
- Update `StreamReplyCustomToolsSession` to track accumulated tokens across rounds and handle budget exhaustion in the custom tools path
- Discount cached tokens (Anthropic) in the token accumulator to avoid over-counting reused KV cache prefixes
- Update persona editor UI with execution mode selector and conditional fields (agentic shows token budget/compression; default shows max context posts)
61 lines
1.8 KiB
Ruby
61 lines
1.8 KiB
Ruby
# frozen_string_literal: true
|
|
|
|
RSpec.describe DiscourseAi::Completions::TokenUsageTracker do
  # Lightweight stand-in for an audit log record. It exposes only the four
  # token counters that the examples in this spec populate, in the order:
  # request, cache write, cache read, response.
  let(:audit_log_class) do
    Struct.new(:request_tokens, :cache_write_tokens, :cache_read_tokens, :response_tokens)
  end

  it "applies weighted request accounting from audit logs" do
    usage = described_class.new
    entry = audit_log_class.new(1000, 0, 800, 50)

    usage.add_from_audit_log(entry)

    # 1000 request tokens plus 800 cache-read tokens are expected to surface
    # as 1080 request tokens, i.e. cache reads are discounted rather than
    # counted in full.
    expect(usage.request).to eq(1080)
    expect(usage.response).to eq(50)
    expect(usage.total).to eq(1130)
  end

  it "supports starting from a previous total budget" do
    usage = described_class.new(base_total: 101)

    # An odd starting total is expected to split as 50 request / 51 response
    # while the reported total stays at 101.
    expect(usage.request).to eq(50)
    expect(usage.response).to eq(51)
    expect(usage.total).to eq(101)
  end

  it "supports exact request/response initialization" do
    usage = described_class.new(base_request: 12, base_response: 34)

    expect(usage.request).to eq(12)
    expect(usage.response).to eq(34)
    expect(usage.total).to eq(46)
  end

  it "accumulates across multiple audit logs" do
    usage = described_class.new
    first_entry = audit_log_class.new(100, 20, 50, 10)
    second_entry = audit_log_class.new(200, 0, 0, 5)

    usage.add_from_audit_log(first_entry)
    usage.add_from_audit_log(second_entry)

    # Counters from both entries are expected to add up on the same tracker.
    expect(usage.request).to eq(325)
    expect(usage.response).to eq(15)
    expect(usage.total).to eq(340)
  end

  it "raises when request/response initialization is partial" do
    # Only base_request is supplied; base_response is deliberately omitted.
    expect { described_class.new(base_request: 1) }.to raise_error(ArgumentError, /must both be provided/)
  end

  it "raises when total and request/response are mixed" do
    expect {
      described_class.new(base_total: 10, base_request: 1, base_response: 2)
    }.to raise_error(
      ArgumentError,
      /cannot be combined/,
    )
  end
end
|