mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-06-19 02:05:37 +08:00
Introduce an "agentic" execution mode as an alternative to the default fixed-turn/tool-limit approach. In agentic mode, personas use a configurable token budget (`max_turn_tokens`) to govern how long a tool-use session can run, with automatic context compression when the conversation exceeds a configurable threshold percentage (`compression_threshold`) of the model's context window. Key changes: - Add `execution_mode`, `max_turn_tokens`, and `compression_threshold` columns to `ai_personas` via migration - Refactor `Bot#reply` to support token-budget loop control with a thread-local token accumulator, budget exhaustion hints, and a safety valve at 100 completions - Add `maybe_compress_context` which summarizes middle conversation messages when token usage crosses the compression threshold, preserving system prompt and recent tail messages - Update `StreamReplyCustomToolsSession` to track accumulated tokens across rounds and handle budget exhaustion in the custom tools path - Discount cached tokens (Anthropic) in the token accumulator to avoid over-counting reused KV cache prefixes - Update persona editor UI with execution mode selector and conditional fields (agentic shows token budget/compression; default shows max context posts)
323 lines
9.4 KiB
Ruby
Vendored
323 lines
9.4 KiB
Ruby
Vendored
# frozen_string_literal: true
|
|
|
|
module DiscourseAi
|
|
module Evals
|
|
# Evaluates model outputs using the criteria embedded in the eval.
#
# It supports two flows: `evaluate` scores a single result against the
# rubric, and `compare` rates several candidate outputs and reports a winner
# (or a tie). Prompt construction and response parsing are encapsulated here
# so both flows share the same entry point.
|
|
class Judge
|
|
# JSON-schema response format sent with single-output grading requests (see
# `evaluate`): an object with exactly two required properties, an integer
# `rating` between 1 and 10 and a string `explanation`.
RESPONSE_FORMAT = {
  type: "json_schema",
  json_schema: {
    name: "judgeVerdict",
    schema: {
      type: "object",
      additionalProperties: false,
      required: ["rating", "explanation"],
      properties: {
        rating: { type: "integer", minimum: 1, maximum: 10 },
        explanation: { type: "string" },
      },
    },
  },
}.freeze
|
|
|
|
# JSON-schema response format sent with comparison requests (see `compare`):
# an object with a string `winner`, a string `winner_explanation`, and a
# `ratings` array of at least two items. Each item is an object holding a
# string `candidate`, an integer `rating` between 1 and 10, and a string
# `explanation`.
COMPARISON_RESPONSE_FORMAT = {
  type: "json_schema",
  json_schema: {
    name: "judgeComparisonVerdict",
    schema: {
      type: "object",
      additionalProperties: false,
      required: ["winner", "winner_explanation", "ratings"],
      properties: {
        winner: { type: "string" },
        winner_explanation: { type: "string" },
        ratings: {
          type: "array",
          minItems: 2,
          items: {
            type: "object",
            additionalProperties: false,
            required: ["candidate", "rating", "explanation"],
            properties: {
              candidate: { type: "string" },
              rating: { type: "integer", minimum: 1, maximum: 10 },
              explanation: { type: "string" },
            },
          },
        },
      },
    },
  },
}.freeze
|
|
|
|
# Sets up a judge for one eval case.
#
# The grading rubric is taken from the eval's judge config: `:criteria` when
# it is present, otherwise `:prompt` converted to a string (empty when the
# key is missing). The pass threshold is `:pass_rating`, defaulting to 10.
#
# @param eval_case [Object] responds to `judge` (config hash or nil); its
#   `args` are read later when prompts are built
# @param judge_llm [Object] responds to `to_llm`, which supplies the client
#   used for grading requests
def initialize(eval_case:, judge_llm:)
  @eval_case = eval_case
  @judge_llm = judge_llm

  settings = eval_case.judge || {}

  rubric = settings[:criteria].presence
  rubric ||= settings[:prompt].to_s
  @criteria = rubric

  @pass_rating = settings[:pass_rating] || 10
end
|
|
|
|
# Grades a single result against the rubric.
#
# Sends the grading prompt to the judge LLM at temperature 0 with the
# `RESPONSE_FORMAT` schema, then compares the parsed rating with the
# configured pass rating.
#
# @param result [Object] output to grade; handed to `build_prompt`
# @param execution_context [Object, nil] forwarded unchanged to `generate`
# @return [Hash] `{ result: :pass, context: }` when the rating reaches the
#   pass rating, otherwise `{ result: :fail, message:, context: }`. The
#   context is the judge's explanation, or the raw response when the
#   explanation is blank.
def evaluate(result, execution_context: nil)
  grading_prompt = build_prompt(result)
  llm = judge_llm.to_llm
  response =
    llm.generate(
      grading_prompt,
      user: Discourse.system_user,
      temperature: 0,
      response_format: RESPONSE_FORMAT,
      execution_context: execution_context,
    )

  verdict = parse_response(response)
  score = verdict[:rating]
  details = verdict[:explanation].presence || verdict[:raw]

  return { result: :pass, context: details } if score >= pass_rating

  {
    result: :fail,
    message: "LLM Rating below threshold, it was #{score}, expecting #{pass_rating}",
    context: details,
  }
end
|
|
|
|
# Asks the judge LLM to compare several candidate outputs.
#
# The request uses temperature 0 and the `COMPARISON_RESPONSE_FORMAT`
# schema. The winner reported by the model is mapped back onto a candidate
# label through `map_candidate_label`; `:winner` is overwritten only when
# that mapping yields a non-blank value other than "tie".
#
# @param candidates [Array<Hash>] entries read via `:label` and `:output`
# @param execution_context [Object, nil] forwarded unchanged to `generate`
# @return [Hash] the hash produced by `parse_comparison_response`, with
#   `:winner` possibly replaced by the mapped candidate label
def compare(candidates, execution_context: nil)
  comparison_prompt = build_comparison_prompt(candidates)
  llm = judge_llm.to_llm
  response =
    llm.generate(
      comparison_prompt,
      user: Discourse.system_user,
      temperature: 0,
      response_format: COMPARISON_RESPONSE_FORMAT,
      execution_context: execution_context,
    )

  verdict = parse_comparison_response(response)
  reported_winner = verdict[:winner] || verdict[:winner_label]
  resolved_winner = map_candidate_label(reported_winner, candidates)

  undecided = resolved_winner.blank? || resolved_winner.casecmp("tie").zero?
  verdict[:winner] = resolved_winner unless undecided

  verdict
end
|
|
|
|
private
|
|
|
|
attr_reader :eval_case, :judge_llm, :criteria, :pass_rating
|
|
|
|
# Builds the single-output grading prompt.
#
# The user message is made of sections joined by blank lines: the rubric
# (the configured criteria, or a generic accuracy/completeness instruction
# when none is present), the candidate output, any metadata sections from
# `normalize_result`, and the JSON answer instructions.
#
# @param result [Object] output to grade; see `normalize_result`
# @return [DiscourseAi::Completions::Prompt]
def build_prompt(result)
  output_text, metadata = normalize_result(result)

  fallback_rubric =
    "Score the output purely on accuracy, completeness, and adherence to the task instructions."
  rubric_text = criteria.present? ? criteria.strip : fallback_rubric

  user_message = [
    "Grading rubric:\n#{rubric_text}",
    "Candidate output:\n#{output_text}",
    *metadata,
    prompt_suffix,
  ].join("\n\n")

  DiscourseAi::Completions::Prompt.new(
    "You are an expert judge evaluating LLM outputs.",
    messages: [{ type: :user, content: user_message }],
  )
end
|
|
|
|
# Builds the multi-candidate comparison prompt.
#
# The user message is made of sections joined by blank lines: the rubric
# (the configured criteria, or a generic accuracy/completeness instruction
# when none is present), the eval's source arguments, one section per
# candidate, and the JSON answer instructions. Candidates are numbered from
# 1; a candidate whose label is empty after stripping is labelled
# "candidate N".
#
# @param candidates [Array<Hash>] entries read via `:label` and `:output`
# @return [DiscourseAi::Completions::Prompt]
def build_comparison_prompt(candidates)
  fallback_rubric =
    "Score the output purely on accuracy, completeness, and adherence to the task instructions."
  rubric_text = criteria.present? ? criteria.strip : fallback_rubric

  source_sections = formatted_args

  candidate_sections =
    candidates.each_with_index.map do |candidate, index|
      position = index + 1
      label = candidate[:label].to_s.strip
      label = "candidate #{position}" if label.empty?
      "Candidate #{position} (#{label}):\n#{format_placeholder_value(candidate[:output])}"
    end

  user_message = [
    "Grading rubric:\n#{rubric_text}",
    *source_sections,
    *candidate_sections,
    comparison_suffix,
  ].join("\n\n")

  DiscourseAi::Completions::Prompt.new(
    "You are an expert judge evaluating LLM outputs.",
    messages: [{ type: :user, content: user_message }],
  )
end
|
|
|
|
# Splits a result into the text to grade and the extra prompt sections.
#
# - String: used as is.
# - Hash: `result[:result]` (stringified) is the output; every other key
#   becomes an "extra <key>" section appended after the source arguments.
# - Anything else: stringified with `to_s`.
#
# @param result [String, Hash, Object]
# @return [Array(String, Array<String>)] output text and metadata sections
def normalize_result(result)
  case result
  when String
    [result, formatted_args]
  when Hash
    output = result[:result].to_s
    extras =
      result.except(:result).map do |name, payload|
        "extra #{name}:\n#{format_placeholder_value(payload)}"
      end
    [output, formatted_args + extras]
  else
    [result.to_s, formatted_args]
  end
end
|
|
|
|
# Renders the eval's arguments as prompt sections.
#
# @return [Array<String>] one "Source <key>" section per argument, or an
#   empty array when the eval's args are not a Hash
def formatted_args
  source_args = eval_case.args

  if source_args.is_a?(Hash)
    source_args.map { |name, payload| "Source #{name}:\n#{format_placeholder_value(payload)}" }
  else
    []
  end
end
|
|
|
|
# Converts a value into prompt text.
#
# @param value [Object]
# @return [String] array elements joined by a blank line; `to_s` for
#   everything else
def format_placeholder_value(value)
  return value.join("\n\n") if value.is_a?(Array)

  value.to_s
end
|
|
|
|
# Closing instructions for the single-output prompt: asks the judge to answer
# with a JSON object containing `rating` and `explanation`.
#
# NOTE(review): the example JSON lines are reproduced without inner
# indentation, exactly as they appear in the text this was taken from;
# confirm against the upstream file.
#
# @return [String]
def prompt_suffix
  <<~INSTRUCTIONS

    Evaluate the candidate output using the criteria above. Respond with JSON matching:

    {
    "rating": <integer between 1 and 10, where 10 is perfect>,
    "explanation": "brief sentence explaining the score"
    }
  INSTRUCTIONS
end
|
|
|
|
# Closing instructions for the comparison prompt: asks the judge to answer
# with a JSON object containing `winner`, `winner_explanation` and a
# `ratings` list, and to report "tie" when there is no clear winner.
#
# NOTE(review): the example JSON lines are reproduced without inner
# indentation, exactly as they appear in the text this was taken from;
# confirm against the upstream file.
#
# @return [String]
def comparison_suffix
  <<~INSTRUCTIONS

    Compare every candidate using the rubric above. Respond with JSON that matches:

    {
    "winner": "<candidate label or tie>",
    "winner_explanation": "<brief justification of the decision>",
    "ratings": [
    {
    "candidate": "<candidate label>",
    "rating": <integer between 1 and 10>,
    "explanation": "<short reason describing strengths or issues>"
    }
    ]
    }

    If there is no clear winner, set "winner" to "tie" and explain why.
  INSTRUCTIONS
end
|
|
|
|
# Extracts the rating and explanation from a judge response.
#
# Responses that answer `read_buffered_property` are read property by
# property. Any other response is stringified and parsed as JSON. Text that
# is not valid JSON, or JSON whose top level is not an object (for example
# `null` or an array), yields a rating of 0 and an empty explanation instead
# of raising. A rating that cannot be converted with `to_i` also counts as 0.
#
# @param response [#to_s] judge response
# @return [Hash] `{ rating: Integer, explanation: String, raw: String }`
def parse_response(response)
  if response.respond_to?(:read_buffered_property)
    rating = response.read_buffered_property(:rating)
    explanation = response.read_buffered_property(:explanation)
    raw = response.to_s
  else
    raw = response.to_s
    payload =
      begin
        JSON.parse(raw)
      rescue JSON::ParserError
        nil
      end
    # Only a JSON object can carry the verdict; scalars and arrays are
    # treated the same as unparseable text.
    payload = {} unless payload.is_a?(Hash)

    rating = payload["rating"]
    explanation = payload["explanation"]
  end

  {
    rating: rating.respond_to?(:to_i) ? rating.to_i : 0,
    explanation: explanation.to_s.strip,
    raw: raw,
  }
end
|
|
|
|
# Parses the judge's comparison verdict.
#
# The response is stringified and parsed as JSON. Text that is not valid
# JSON, or JSON whose top level is not an object, is treated as an empty
# verdict rather than raising. Rating entries that are not JSON objects are
# skipped, a single rating object is accepted in place of a list, and a
# rating that cannot be converted with `to_i` counts as 0.
#
# @param response [#to_s] judge response
# @return [Hash] with keys:
#   - `:winner` [String, nil] stripped winner label; nil when it is blank or
#     "tie" (case-insensitive)
#   - `:winner_label` [String] stripped winner label as reported
#   - `:winner_explanation` [String]
#   - `:ratings` [Array<Hash>] `{ candidate:, rating:, explanation: }` entries
#   - `:raw` [String] the stringified response
def parse_comparison_response(response)
  raw_text = response.to_s

  parsed =
    begin
      JSON.parse(raw_text)
    rescue JSON::ParserError
      nil
    end
  # Only a JSON object can carry the verdict; scalars and arrays are treated
  # the same as unparseable text.
  parsed = {} unless parsed.is_a?(Hash)

  winner_label = parsed["winner"].to_s.strip
  undecided = winner_label.blank? || winner_label.casecmp("tie").zero?

  rating_entries = parsed["ratings"]
  # Array(hash) would explode the hash into key/value pairs, so wrap it.
  rating_entries = [rating_entries] if rating_entries.is_a?(Hash)

  ratings =
    Array(rating_entries).grep(Hash).map do |entry|
      score = entry["rating"]
      {
        candidate: entry["candidate"].to_s,
        rating: score.respond_to?(:to_i) ? score.to_i : 0,
        explanation: entry["explanation"].to_s.strip,
      }
    end

  {
    winner: undecided ? nil : winner_label,
    winner_label: winner_label,
    winner_explanation: parsed["winner_explanation"].to_s.strip,
    ratings: ratings,
    raw: raw_text,
  }
end
|
|
|
|
# Maps the winner reported by the judge back onto a candidate's label.
#
# Blank values and "tie" (case-insensitive) are returned untouched. A
# "candidate N" reference is resolved by its 1-based position when N points
# at an existing candidate with a non-blank label. Otherwise the value is
# matched against the candidate labels, ignoring case and surrounding
# whitespace. When nothing matches, the reported value is returned unchanged.
#
# @param winner_label [String, nil] winner as reported by the judge
# @param candidates [Array<Hash>] entries read via `:label`
# @return [String, nil] the matching candidate label, or `winner_label`
def map_candidate_label(winner_label, candidates)
  return winner_label if winner_label.blank?
  return winner_label if winner_label.casecmp("tie").zero?

  label_downcased = winner_label.to_s.strip.downcase

  match = label_downcased.match(/candidate\s*(\d+)/i)
  if match
    position = match[1].to_i
    # Positions are 1-based. Without the range check "candidate 0" becomes
    # index -1, which Ruby resolves to the LAST candidate, and a huge number
    # raises RangeError on array indexing.
    if position.between?(1, candidates.length)
      candidate_label = candidates[position - 1]&.dig(:label)
      return candidate_label if candidate_label.present?
    end
  end

  candidate = candidates.find { |c| c[:label].to_s.strip.downcase == label_downcased }
  candidate&.dig(:label) || winner_label
end
|
|
end
|
|
end
|
|
end
|