discourse/plugins/discourse-ai/evals/lib/prompts/single_test_runner.rb
Sam b8abe100c5
FEATURE: add agentic execution mode for AI personas (#38230)
Introduce an "agentic" execution mode as an alternative to the
default fixed-turn/tool-limit approach. In agentic mode, personas
use a configurable token budget (`max_turn_tokens`) to govern how
long a tool-use session can run, with automatic context compression
when the conversation exceeds a configurable threshold percentage
(`compression_threshold`) of the model's context window.

Key changes:

- Add `execution_mode`, `max_turn_tokens`, and `compression_threshold`
  columns to `ai_personas` via migration
- Refactor `Bot#reply` to support token-budget loop control with a
  thread-local token accumulator, budget exhaustion hints, and a
  safety valve at 100 completions
- Add `maybe_compress_context` which summarizes middle conversation
  messages when token usage crosses the compression threshold,
  preserving system prompt and recent tail messages
- Update `StreamReplyCustomToolsSession` to track accumulated tokens
  across rounds and handle budget exhaustion in the custom tools path
- Discount cached tokens (Anthropic) in the token accumulator to
  avoid over-counting reused KV cache prefixes
- Update persona editor UI with execution mode selector and
  conditional fields (agentic shows token budget/compression;
  default shows max context posts)
2026-03-05 15:06:54 +11:00

138 lines
4.1 KiB
Ruby
Vendored

# frozen_string_literal: true
class DiscourseAi::Evals::PromptSingleTestRunner
# Builds a runner bound to a single LLM.
# @param llm [Object] the model wrapper used for every completion; this class
#   only calls `generate(prompt, user:, temperature:, output_thinking:,
#   execution_context:)` on it, optionally with a block that receives partials
def initialize(llm)
@llm = llm
end
# Runs one eval conversation: an initial user message, an optional chain of
# tool-call rounds, and then any followup messages.
#
# @param prompt [String] the prompt handed to DiscourseAi::Completions::Prompt
# @param message [String] content of the first user message
# @param followups [Array<Hash>, nil] followups executed in order after the chain
# @param output_thinking [Boolean] forwarded to the LLM on every generation
# @param stream [Boolean] when truthy, results are collected from streamed partials
# @param temperature [Float, nil] forwarded to the LLM on every generation
# @param tools [Array<Hash>, nil] tools assigned to the prompt when present
# @param tool_results [Hash, nil] canned tool outputs looked up by tool name
#   when a reply is fed back into the prompt (defaults to an empty Hash)
# @param chain_length [Integer] maximum number of generations in the chain; the
#   chain stops early when the last prompt message is not of type :tool
# @param max_tool_calls [Integer, nil] forwarded to populate_reply for each reply
# @param execution_context [Object, nil] forwarded to the LLM on every generation
# @return [Hash] `{ prompt:, message:, result: }` where result is the last result produced
def run_single_test(
  prompt:,
  message:,
  followups:,
  output_thinking:,
  stream:,
  temperature:,
  tools:,
  tool_results: nil,
  chain_length: 1,
  max_tool_calls: nil,
  execution_context: nil
)
  initial_messages = [{ type: :user, content: message }]
  @c_prompt = DiscourseAi::Completions::Prompt.new(prompt, messages: initial_messages)
  @c_prompt.tools = tools if tools
  @tool_results = tool_results || {}

  rounds_left = chain_length
  while rounds_left > 0
    generate_result(temperature, output_thinking, stream, execution_context: execution_context)
    rounds_left -= 1

    if rounds_left > 0
      # Feed the reply (plus canned tool results) back in; without a trailing
      # tool message there is nothing further for the model to react to.
      populate_reply(max_tool_calls: max_tool_calls)
      break unless @c_prompt.messages.last[:type] == :tool
    end

    # Tools are switched off for the last generation of the chain.
    @c_prompt.tool_choice = :none if rounds_left == 1
  end

  (followups || []).each do |followup|
    generate_followup(
      followup,
      output_thinking,
      stream,
      temperature,
      execution_context: execution_context,
    )
  end

  { prompt: prompt, message: message, result: @result }
end
private
# Appends the last result (@result) to the prompt (@c_prompt) as a model
# response so the chain can continue, pairing each tool call with a canned
# tool result taken from @tool_results.
#
# `{{param}}` placeholders in the canned result are replaced with the tool
# call's parameter values.
#
# @param max_tool_calls [Integer, nil] how many tool calls may be answered; the
#   first call over the limit, and every part after it, is dropped. nil means
#   no limit.
# @raise [RuntimeError] when a tool call has no entry in @tool_results
def populate_reply(max_tool_calls:)
  parts = @result.is_a?(Array) ? @result : [@result]
  calls_left = max_tool_calls
  reply = []

  parts.each do |part|
    unless part.is_a?(DiscourseAi::Completions::ToolCall)
      reply << part
      next
    end

    template = @tool_results[part.name]
    raise "No tool result provided for tool #{part.name}" unless template

    content =
      part.parameters.reduce(template) do |text, (key, value)|
        text.gsub("{{#{key}}}", value.to_s)
      end

    # The budget is checked only after the canned result has been resolved, so
    # a missing result still raises even for a call that would be dropped.
    if calls_left
      calls_left -= 1
      break if calls_left < 0
    end

    reply << part
    reply << DiscourseAi::Completions::ToolResult.new(content: content, tool_call: part)
  end

  @c_prompt.push_model_response(reply)
end
# Records the previous result in the prompt, appends the followup message and
# generates the next result.
#
# @param followup [Hash] followup definition; resolved via set_followup_tool
# @param output_thinking [Boolean] forwarded to generate_result
# @param stream [Boolean] forwarded to generate_result
# @param temperature [Float, nil] forwarded to generate_result
# @param execution_context [Object, nil] forwarded to generate_result
# @return [Object, nil] the generated result, or nil when generation raised
def generate_followup(followup, output_thinking, stream, temperature, execution_context: nil)
  # The previous model output has to be in the conversation before the next message.
  @c_prompt.push_model_response(@result)
  @c_prompt.push(**set_followup_tool(followup))

  begin
    generate_result(temperature, output_thinking, stream, execution_context: execution_context)
  rescue StandardError => error
    # Not expected to happen; printed (rather than raised) to help debugging.
    puts error
  end
end
# Applies a followup's optional tool override to the prompt and builds the
# message to push for it.
#
# The message's :id and :name may be given as a two-element Array
# `[type, key]`. In that case the value is taken from `key` of the most recent
# prompt message whose :type equals `type`, which lets a followup refer to the
# id or name of a tool call produced earlier in the same conversation. If no
# such message exists the Array is left in place.
#
# @param followup [Hash] followup definition with :message and optional :tools
# @return [Hash] a copy of `followup[:message]` with :id / :name resolved
def set_followup_tool(followup)
  @c_prompt.tools = followup[:tools] if followup[:tools]

  # Resolve into a shallow copy. Writing the resolved value back into the
  # caller's hash would destroy the [type, key] reference, so re-running the
  # same followup definition against another conversation would reuse a stale
  # id or name.
  followup_message = followup[:message].dup

  %i[id name].each do |key|
    reference = followup_message[key]
    next unless reference.is_a?(Array)

    type, inner_key = reference
    # this allows us to dynamically set the id or name of the tool call
    prev = @c_prompt.messages.reverse.find { |m| m[:type] == type.to_sym }
    followup_message[key] = prev[inner_key.to_sym] if prev
  end

  followup_message
end
# Asks the LLM for a completion of the current prompt and stores it in @result.
#
# @param temperature [Float, nil] forwarded to the LLM
# @param output_thinking [Boolean] forwarded to the LLM
# @param stream [Boolean] when truthy, a block is passed to the LLM and the
#   yielded partials are collected; the LLM's own return value is then ignored
# @param execution_context [Object, nil] forwarded to the LLM
# @return [Object] the Array of collected partials when streaming, otherwise
#   whatever the LLM's `generate` returned
def generate_result(temperature, output_thinking, stream, execution_context: nil)
  options = {
    user: Discourse.system_user,
    temperature: temperature,
    output_thinking: output_thinking,
    execution_context: execution_context,
  }

  return @result = @llm.generate(@c_prompt, **options) unless stream

  partials = []
  @llm.generate(@c_prompt, **options) { |partial| partials << partial }
  @result = partials
end
end