mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-05-26 04:58:53 +08:00
Implemented an eval “comparison matrix” that lets you run the same evals across multiple personas or multiple LLMs and have a judge model declare a winner with per-candidate scores. The CLI adds --compare personas|llms, keeps persona selection (auto-prepending default for persona mode), and always ensures a judge is configured. A dedicated ComparisonRunner reuses Workbench results to build candidate outputs and sends them to Judge#compare, which crafts a rubric-aware comparison prompt and parses structured winner/ratings JSON. Outputs are streamed to the console and individual run logs still get written. README documents how to use the new flag and what each mode does.
303 lines
9.9 KiB
Ruby
Vendored
303 lines
9.9 KiB
Ruby
Vendored
# frozen_string_literal: true
|
|
|
|
require_relative "recorder"
|
|
require_relative "eval"
|
|
require_relative "llm_repository"
|
|
require_relative "runners/base"
|
|
require_relative "runners/ai_helper"
|
|
require_relative "runners/translation"
|
|
require_relative "runners/hyde"
|
|
require_relative "runners/discoveries"
|
|
require_relative "runners/inference"
|
|
require_relative "runners/spam"
|
|
require_relative "runners/summarization"
|
|
require_relative "judge"
|
|
|
|
module DiscourseAi
|
|
module Evals
|
|
# Coordinates the execution of eval cases against one or more LLMs.
|
|
#
|
|
# The Workbench drives the orchestration loop: it prepares the Structured
|
|
# Recorder, dispatches work to helpers/utilities based on the eval feature,
|
|
# and feeds the aggregated results back to the Recorder. It intentionally
|
|
# keeps higher-level scripts (`evals/run`) simple while centralizing
|
|
# instrumentation and error handling.
|
|
class Workbench
|
|
# Build a workbench bound to an output stream, an optional judge LLM and an
# optional persona.
#
# @param output [IO] stream handed to the Recorder when a run starts.
# @param judge_llm [Object, nil] LLM used for evals that require a judge.
# @param persona_prompt [String, nil] persona prompt forwarded to runner lookup.
# @param persona_label [#to_s] persona label; nil or blank values fall back
#   to "default".
def initialize(output: $stdout, judge_llm: nil, persona_prompt: nil, persona_label: "default")
  @output = output
  @judge_llm = judge_llm
  @persona_prompt = persona_prompt

  # Normalise the label so nil, "" and whitespace-only all become "default".
  label = persona_label.to_s.strip
  @persona_label = label.empty? ? "default" : label
end
|
|
|
|
# Iterate through the provided LLM adapters and execute the eval case for
# each one, recording structured logs along the way.
#
# LLMs without vision support are recorded as skipped when the eval case
# requires vision. An error raised while handling one LLM is recorded as a
# failed result for that LLM, and iteration continues with the next one.
# The recorder is always finished, even if setup or iteration raises.
#
# @param eval_case [DiscourseAi::Evals::Eval] the scenario to run.
# @param llms [Array<LlmModel>] LLMs selected by the CLI.
# @yield optional callback invoked after each successful execution with the
#   keywords eval_case:, llm:, llm_name:, persona_label:, raw_entries: and
#   classified_entries:.
def run(eval_case:, llms:, &after_run)
  recorder = Recorder.with_cassette(eval_case, persona_key: persona_label, output: output)

  llms.each do |llm|
    llm_name = llm.display_name || llm.name
    start_time = Time.now.utc

    if eval_case.vision && !llm.vision_enabled?
      recorder.record_llm_skip(llm_name, "LLM does not support vision")
      next
    end

    execution = execute_eval(eval_case, llm)
    recorder.record_llm_results(llm_name, execution[:classified], start_time)

    after_run&.call(
      eval_case: eval_case,
      llm: llm,
      llm_name: llm_name,
      persona_label: persona_label,
      raw_entries: execution[:raw_entries],
      classified_entries: execution[:classified],
    )
  rescue DiscourseAi::Evals::Eval::EvalError => e
    recorder.record_llm_results(
      llm_name,
      [{ result: :fail, message: e.message, context: e.context }],
      start_time,
    )
  rescue StandardError => e
    # Write the backtrace to the injected stream (the same one the recorder
    # receives) instead of the global $stdout.
    output.puts(e.backtrace) if !Rails.env.test?
    recorder.record_llm_results(llm_name, [{ result: :fail, message: e.message }], start_time)
  end
ensure
  recorder&.finish
end
|
|
|
|
# Execute a single eval case against one LLM and classify the output.
#
# A runner registered for the eval's feature takes precedence; otherwise the
# built-in "custom:*" features are dispatched to the helpers on this class.
#
# @param eval_case [DiscourseAi::Evals::Eval] the scenario to run.
# @param llm [LlmModel] the LLM under evaluation.
# @return [Hash] { raw:, raw_entries:, classified: } where raw is the
#   untouched output, raw_entries is raw wrapped in an Array when needed, and
#   classified holds one classification Hash per entry.
# @raise [ArgumentError] when no runner or custom handler matches the feature.
def execute_eval(eval_case, llm)
  feature = eval_case.feature
  runner = find_runner(feature)

  raw =
    if runner
      runner.run(eval_case, llm)
    else
      case feature
      when "custom:pdf_to_text"
        pdf_to_text(llm, **eval_case.args)
      when "custom:image_to_text"
        image_to_text(llm, **eval_case.args)
      when "custom:prompt"
        DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(eval_case.args)
      when "custom:edit_artifact"
        edit_artifact(llm, **eval_case.args)
      else
        raise ArgumentError, "Unsupported eval feature '#{feature}'"
      end
    end

  entries = normalize_entries(raw)

  { raw: raw, raw_entries: entries, classified: classify_results(eval_case, entries) }
end
|
|
|
|
private
|
|
|
|
attr_reader :output, :judge_llm, :persona_prompt, :persona_label
|
|
|
|
# Look up the runner for a feature, passing along this workbench's persona
# prompt.
#
# @param feature [String] feature identifier taken from the eval case.
# @return [Object, nil] whatever Runners::Base.find_runner returns; callers in
#   this class treat a falsy value as "no runner registered".
def find_runner(feature)
  DiscourseAi::Evals::Runners::Base.find_runner(feature, persona_prompt)
end
|
|
|
|
# Wrap a single result in an Array so callers can always iterate.
#
# Kernel#Array is deliberately not used: it would turn a Hash into an array
# of pairs and nil into an empty array, whereas here both must become a
# one-element array.
#
# @param raw [Object] runner output, either one entry or an Array of entries.
# @return [Array] raw itself when it is an Array, otherwise [raw].
def normalize_entries(raw)
  raw.is_a?(Array) ? raw : [raw]
end
|
|
|
|
# Classify every entry produced by an eval run.
#
# An entry may be a bare value or a Hash. For a Hash with a :raw key, the
# :raw value is what gets classified; any other entry is classified as-is.
# A Hash's :metadata, when present, is copied onto the classification.
#
# @param eval_case [DiscourseAi::Evals::Eval] scenario holding the expectations.
# @param entries [Array] normalized runner output.
# @return [Array<Hash>] one classification Hash per entry.
def classify_results(eval_case, entries)
  entries.map do |entry|
    hash_entry = entry.is_a?(Hash)
    raw_value = hash_entry && entry.key?(:raw) ? entry[:raw] : entry
    metadata = hash_entry ? entry[:metadata] : nil

    classification = classify_result(eval_case, raw_value)
    classification[:metadata] = metadata if metadata.present?
    classification
  end
end
|
|
|
|
# Classify one result against the eval case's expectation.
#
# The first configured expectation wins, checked in this order: exact
# expected_output, expected_output_regex (matched against result.to_s),
# expected_tool_call, then judge. With no expectation configured the result
# passes.
#
# @param eval_case [DiscourseAi::Evals::Eval] scenario holding the expectations.
# @param result [Object] raw value produced by the eval.
# @return [Hash] { result: :pass }, or a :fail Hash with expected/actual
#   output; for tool-call and judge evals, whatever those helpers return.
def classify_result(eval_case, result)
  if (expected = eval_case.expected_output)
    return { result: :pass } if result == expected
    return { result: :fail, expected_output: expected, actual_output: result }
  end

  if (pattern = eval_case.expected_output_regex)
    return { result: :pass } if result.to_s.match?(pattern)
    return { result: :fail, expected_output: pattern, actual_output: result }
  end

  if (tool_call = eval_case.expected_tool_call)
    return classify_tool_call(tool_call, result)
  end

  return judge_result(eval_case, result) if eval_case.judge

  { result: :pass }
end
|
|
|
|
# Check that the result contains the expected tool call.
#
# When result is an Array, only the first ToolCall found in it is compared.
#
# @param expected_tool_call [Hash] expectation read via its :name and :params keys.
# @param result [Object, Array] a ToolCall, or an Array that may contain one.
# @return [Hash] { result: :pass } when name and parameters both match,
#   otherwise a :fail Hash carrying the expectation and the original result.
def classify_tool_call(expected_tool_call, result)
  tool_call_class = DiscourseAi::Completions::ToolCall
  tool_call = result.is_a?(Array) ? result.find { |item| item.is_a?(tool_call_class) } : result

  if !tool_call.is_a?(tool_call_class) || tool_call.name != expected_tool_call[:name] ||
       tool_call.parameters != expected_tool_call[:params]
    { result: :fail, expected_output: expected_tool_call, actual_output: result }
  else
    { result: :pass }
  end
end
|
|
|
|
# Hand the result to the judge LLM for evaluation.
#
# @param eval_case [DiscourseAi::Evals::Eval] scenario whose judge config applies.
# @param result [Object] raw value produced by the eval.
# @return [Object] whatever Judge#evaluate returns for the result.
# @raise [DiscourseAi::Evals::Eval::EvalError] when no judge LLM was configured.
def judge_result(eval_case, result)
  if judge_llm.nil?
    raise DiscourseAi::Evals::Eval::EvalError.new(
            "Evaluation '#{eval_case.id}' requires the --judge option to specify an LLM.",
            { eval_id: eval_case.id },
          )
  end

  DiscourseAi::Evals::Judge.new(eval_case: eval_case, judge_llm: judge_llm).evaluate(result)
end
|
|
|
|
# Extract text from an image upload by delegating to the ImageToText helper.
#
# The image is uploaded as the system user for the duration of the call and
# the upload is destroyed afterwards, whether or not extraction succeeds.
#
# @param llm [LlmModel] LLM backing the OCR step.
# @param path [String] path to the source image used for OCR.
# @return [String] text extracted from the image; each chunk is followed by
#   a blank line.
def image_to_text(llm, path:)
  # Block form guarantees the fixture handle is closed even if upload
  # creation raises.
  upload =
    File.open(path) do |file|
      UploadCreator.new(file, File.basename(path)).create_for(Discourse.system_user.id)
    end

  text = +""
  DiscourseAi::Utils::ImageToText
    .new(upload: upload, llm_model: llm, user: Discourse.system_user)
    .extract_text do |chunk, _error|
      # Nil chunks are skipped; the second block argument is ignored here.
      text << chunk << "\n\n" if chunk
    end
  text
ensure
  upload.destroy if upload
end
|
|
|
|
# Extract text from a PDF by delegating to the PdfToText helper, which is
# given the LLM for OCR guidance.
#
# The PDF is uploaded as the system user for the duration of the call and the
# upload is destroyed afterwards, whether or not extraction succeeds.
#
# @param llm [LlmModel] LLM passed to PdfToText for OCR guidance.
# @param path [String] path to the PDF fixture.
# @return [String] text aggregated across the yielded chunks; each chunk is
#   followed by a blank line.
def pdf_to_text(llm, path:)
  # Block form guarantees the fixture handle is closed even if upload
  # creation raises.
  upload =
    File.open(path) do |file|
      UploadCreator.new(file, File.basename(path)).create_for(Discourse.system_user.id)
    end

  text = +""
  DiscourseAi::Utils::PdfToText
    .new(upload: upload, user: Discourse.system_user, llm_model: llm)
    .extract_text { |chunk| text << chunk << "\n\n" if chunk }

  text
ensure
  upload.destroy if upload
end
|
|
|
|
# Run the edit artifact flow, returning the final artifact contents.
#
# A temporary AiArtifact is created from the fixtures, the Diff update
# strategy is applied with the given instructions, and the last artifact
# version is returned. The temporary artifact is destroyed on every exit
# path, including when the edit fails, so failed evals do not leave records
# behind.
#
# @param llm [LlmModel] LLM used to produce diffs.
# @param css_path [String] path to the CSS fixture.
# @param js_path [String] path to the JS fixture.
# @param html_path [String] path to the HTML fixture.
# @param instructions_path [String] instructions fed to the LLM.
# @return [Hash] latest artifact snapshot ({ css:, js:, html: }).
# @raise [DiscourseAi::Evals::Eval::EvalError] when some searches could not be
#   applied, or the resulting JS fails the syntax check.
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
  css = File.read(css_path)
  js = File.read(js_path)
  html = File.read(html_path)
  instructions = File.read(instructions_path)

  artifact =
    AiArtifact.create!(
      css: css,
      js: js,
      html: html,
      user_id: Discourse.system_user.id,
      post_id: 1,
      name: "eval artifact",
    )

  # Unsaved placeholder post; the strategy is only given it as context here.
  post = Post.new(topic_id: 1, id: 1)
  diff =
    DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
      llm: llm.to_llm,
      post: post,
      user: Discourse.system_user,
      artifact: artifact,
      artifact_version: nil,
      instructions: instructions,
    )
  diff.apply

  if diff.failed_searches.present?
    raise DiscourseAi::Evals::Eval::EvalError.new(
            "Failed to apply all changes",
            diff.failed_searches,
          )
  end

  version = artifact.versions.last
  unless valid_javascript?(version.js)
    raise DiscourseAi::Evals::Eval::EvalError.new("Invalid JS", version.js)
  end

  # The snapshot is built before the ensure clause destroys the artifact.
  { css: version.css, js: version.js, html: version.html }
ensure
  artifact.destroy if artifact
end
|
|
|
|
# Check whether a string is syntactically valid JavaScript by writing it to
# a temporary file and running `node --check` on it.
#
# Any failure is reported as false: a failed command, and also any other
# StandardError raised while preparing or running the check.
#
# @param str [String] JavaScript source to validate.
# @return [Boolean] true when the command succeeds, false otherwise.
def valid_javascript?(str)
  # Lazy requires: open3 is kept from the original implementation (presumably
  # for the command helper — confirm); tempfile backs Tempfile.create below.
  require "open3"
  require "tempfile"

  Tempfile.create(%w[test .js]) do |file|
    file.write(str)
    # Flush so the external process sees the full contents on disk.
    file.flush

    begin
      Discourse::Utils.execute_command(
        "node",
        "--check",
        file.path,
        failure_message: "Invalid JavaScript syntax",
        timeout: 30,
      )
      true
    rescue Discourse::Utils::CommandError
      false
    end
  end
rescue StandardError
  false
end
|
|
end
|
|
end
|
|
end
|