mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-06-19 04:25:50 +08:00
Introduce an "agentic" execution mode as an alternative to the default fixed-turn/tool-limit approach. In agentic mode, personas use a configurable token budget (`max_turn_tokens`) to govern how long a tool-use session can run, with automatic context compression when the conversation exceeds a configurable threshold percentage (`compression_threshold`) of the model's context window. Key changes: - Add `execution_mode`, `max_turn_tokens`, and `compression_threshold` columns to `ai_personas` via migration - Refactor `Bot#reply` to support token-budget loop control with a thread-local token accumulator, budget exhaustion hints, and a safety valve at 100 completions - Add `maybe_compress_context` which summarizes middle conversation messages when token usage crosses the compression threshold, preserving system prompt and recent tail messages - Update `StreamReplyCustomToolsSession` to track accumulated tokens across rounds and handle budget exhaustion in the custom tools path - Discount cached tokens (Anthropic) in the token accumulator to avoid over-counting reused KV cache prefixes - Update persona editor UI with execution mode selector and conditional fields (agentic shows token budget/compression; default shows max context posts)
164 lines
4.6 KiB
Ruby
Vendored
164 lines
4.6 KiB
Ruby
Vendored
# frozen_string_literal: true
|
|
|
|
module DiscourseAi
  module Completions
    module Endpoints
      # Endpoint stand-in that replays a preloaded list of responses instead
      # of contacting a model. Each successful call to #perform_completion!
      # consumes the next entry of the list.
      class CannedResponse
        CANNED_RESPONSE_ERROR = Class.new(StandardError)

        attr_reader :responses, :completions, :dialect, :model_params

        # @param responses [Array] entries to replay in order. An entry may be
        #   a single value, an Array of chunks, or a StandardError instance
        #   (which is raised when its turn comes).
        def initialize(responses)
          @responses = responses
          @completions = 0
          @dialect = nil
        end

        # Returns the params untouched: max_tokens, temperature and
        # stop_sequences are already supported.
        def normalize_model_params(model_params)
          model_params
        end

        # Messages of the prompt held by the dialect passed to the most recent
        # #perform_completion! call.
        def prompt_messages
          dialect.prompt.messages
        end

        # Replays the next canned response.
        #
        # Records +dialect+ and +model_params+ on the instance, then looks up
        # the entry at the current completion index. When a block is given,
        # every chunk is streamed to it as +(value, cancel_fn)+; calling
        # +cancel_fn+ suppresses all later yields. The remaining keyword
        # arguments are accepted but not used by this method.
        #
        # @return the aggregated structured output when
        #   +model_params[:response_format]+ is present; otherwise the single
        #   chunk, or the Array of chunks when there is more than one.
        # @raise [CANNED_RESPONSE_ERROR] when no canned response is left.
        # @raise [StandardError] the canned entry itself when it is an error;
        #   the completion counter is not advanced in that case.
        def perform_completion!(
          dialect,
          _user,
          model_params,
          feature_name: nil,
          feature_context: nil,
          partial_tool_calls: false,
          output_thinking: false,
          cancel_manager: nil,
          execution_context: nil
        )
          @dialect = dialect
          @model_params = model_params

          canned = responses[completions]
          if canned.nil?
            raise CANNED_RESPONSE_ERROR,
                  "The number of completions you requested exceed the number of canned responses"
          end
          raise canned if canned.is_a?(StandardError)

          @completions += 1
          chunks = canned.is_a?(Array) ? canned : [canned]

          if block_given?
            cancelled = false
            cancel_fn = -> { cancelled = true }

            chunks.each do |chunk|
              handle_response_chunk(chunk, cancel_fn) do |piece|
                yield(piece, cancel_fn) unless cancelled
              end
            end
          end

          return aggregate_structured_response(chunks) if model_params[:response_format].present?

          chunks.length == 1 ? chunks.first : chunks
        end

        # Tokenizer class associated with this endpoint.
        def tokenizer
          DiscourseAi::Tokenizer::OpenAiTokenizer
        end

        private

        # Streams one chunk to the given block: tool calls and thinking
        # objects are passed through whole, chunks are wrapped as structured
        # output when a response format is set, and anything else is emitted
        # one character at a time.
        def handle_response_chunk(chunk, cancel_fn)
          if is_tool?(chunk) || is_thinking?(chunk)
            yield chunk
          elsif model_params[:response_format].present?
            yield coerce_structured(chunk)
          else
            chunk.to_s.each_char { |letter| yield letter }
          end
        end

        # Folds every chunk into a single StructuredOutput. Falls back to the
        # first chunk when the response format declares no schema properties.
        def aggregate_structured_response(response_enum)
          properties = response_schema_properties
          return response_enum.first if properties.blank?

          DiscourseAi::Completions::StructuredOutput
            .new(properties)
            .tap do |combined|
              response_enum.each { |chunk| combined << coerce_structured(chunk).to_s }
              combined.finish
            end
        end

        def is_thinking?(response)
          response.is_a?(DiscourseAi::Completions::Thinking)
        end

        def is_tool?(response)
          response.is_a?(DiscourseAi::Completions::ToolCall)
        end

        # Wraps +response+ in a finished StructuredOutput restricted to the
        # schema's properties. A response that does not parse to a Hash is
        # stored under the first schema property. Returns +response+ as-is
        # when the schema has no properties.
        def as_structured_output(response)
          properties = response_schema_properties
          return response if properties.blank?

          parsed = parse_structured_response(response)

          payload =
            if parsed.is_a?(Hash)
              by_name = parsed.stringify_keys
              properties.keys.each_with_object({}) do |key, picked|
                name = key.to_s
                picked[key] = by_name[name] if by_name.key?(name)
              end
            else
              { properties.keys.first => response }
            end

          wrapped = DiscourseAi::Completions::StructuredOutput.new(properties)
          wrapped << payload.to_json
          wrapped.finish
          wrapped
        end

        # Hash responses pass through, Strings are parsed as JSON, everything
        # else (including unparseable JSON) yields nil.
        def parse_structured_response(response)
          case response
          when Hash then response
          when String then JSON.parse(response)
          end
        rescue JSON::ParserError
          nil
        end

        # Properties of the JSON schema declared in the response format.
        def response_schema_properties
          model_params[:response_format].dig(:json_schema, :schema, :properties)
        end

        # Returns the chunk itself when it already is a StructuredOutput,
        # otherwise converts it via #as_structured_output.
        def coerce_structured(chunk)
          return chunk if chunk.is_a?(DiscourseAi::Completions::StructuredOutput)

          as_structured_output(chunk)
        end
      end
    end
  end
end
|