discourse/plugins/discourse-ai/evals/lib/workbench.rb
Roman Rizzi 3a647c8e50
FEATURE: Use evals to compare LLMs and Personas' prompts (#36027)
Implemented an eval “comparison matrix” that lets you run the same evals
across multiple personas or multiple LLMs and have a judge model declare
a winner with per-candidate scores. The CLI adds --compare
personas|llms, keeps persona selection (auto-prepending default for
persona mode), and always ensures a judge is configured. A dedicated
ComparisonRunner reuses Workbench results to build candidate outputs and
sends them to Judge#compare, which crafts a rubric-aware comparison
prompt and parses structured winner/ratings JSON. Outputs are streamed
to the console and individual run logs still get written. README
documents how to use the new flag and what each mode does.
2025-11-18 10:39:52 -03:00

303 lines
9.9 KiB
Ruby
Vendored

# frozen_string_literal: true
require_relative "recorder"
require_relative "eval"
require_relative "llm_repository"
require_relative "runners/base"
require_relative "runners/ai_helper"
require_relative "runners/translation"
require_relative "runners/hyde"
require_relative "runners/discoveries"
require_relative "runners/inference"
require_relative "runners/spam"
require_relative "runners/summarization"
require_relative "judge"
module DiscourseAi
module Evals
# Coordinates the execution of eval cases against one or more LLMs.
#
# The Workbench drives the orchestration loop: it prepares the Structured
# Recorder, dispatches work to helpers/utilities based on the eval feature,
# and feeds the aggregated results back to the Recorder. It intentionally
# keeps higher-level scripts (`evals/run`) simple while centralizing
# instrumentation and error handling.
class Workbench
# Builds a workbench bound to an output stream, an optional judge and a persona.
#
# @param output [IO] stream handed to the Recorder (defaults to $stdout).
# @param judge_llm [Object, nil] LLM used by judge-based evals.
# @param persona_prompt [String, nil] prompt forwarded to the runner lookup.
# @param persona_label [#to_s] persona name; nil/blank values collapse to "default".
def initialize(output: $stdout, judge_llm: nil, persona_prompt: nil, persona_label: "default")
  @output, @judge_llm, @persona_prompt = output, judge_llm, persona_prompt

  # Normalize first, then fall back when nothing but whitespace was given.
  @persona_label = persona_label.to_s.strip
  @persona_label = "default" if @persona_label.empty?
end
# Iterate through the provided LLM adapters and execute the eval case for
# each one, recording structured logs along the way.
#
# @param eval_case [DiscourseAi::Evals::Eval] the scenario to run.
# @param llms [Array<LlmModel>] LLMs selected by the CLI.
# @yield [eval_case:, llm:, llm_name:, persona_label:, raw_entries:, classified_entries:]
#   optional callback invoked after a successful run has been recorded.
def run(eval_case:, llms:, &after_run)
  recorder = Recorder.with_cassette(eval_case, persona_key: persona_label, output: output)

  llms.each do |llm|
    llm_name = llm.display_name || llm.name
    start_time = Time.now.utc

    # Vision evals are recorded as skipped (not failed) for LLMs without vision.
    if eval_case.vision && !llm.vision_enabled?
      recorder.record_llm_skip(llm_name, "LLM does not support vision")
      next
    end

    execution = execute_eval(eval_case, llm)
    recorder.record_llm_results(llm_name, execution[:classified], start_time)

    after_run&.call(
      eval_case: eval_case,
      llm: llm,
      llm_name: llm_name,
      persona_label: persona_label,
      raw_entries: execution[:raw_entries],
      classified_entries: execution[:classified],
    )
  rescue DiscourseAi::Evals::Eval::EvalError => eval_error
    # Eval errors carry extra context that is worth keeping in the log.
    failure = { result: :fail, message: eval_error.message, context: eval_error.context }
    recorder.record_llm_results(llm_name, [failure], start_time)
  rescue StandardError => error
    # NOTE(review): the backtrace is written with Kernel#puts (i.e. $stdout),
    # not to the configured `output` stream - confirm that is intentional.
    puts error.backtrace unless Rails.env.test?
    failure = { result: :fail, message: error.message }
    recorder.record_llm_results(llm_name, [failure], start_time)
  end
ensure
  # The recorder may be nil when with_cassette itself raised.
  recorder&.finish
end
# Runs one eval case against one LLM and classifies whatever comes back.
#
# A registered runner takes precedence; the `custom:*` features are only
# consulted when no runner is found for the feature.
#
# @param eval_case [DiscourseAi::Evals::Eval] scenario providing `feature` and `args`.
# @param llm [LlmModel] model under test.
# @return [Hash] `{ raw:, raw_entries:, classified: }`.
# @raise [ArgumentError] when neither a runner nor a custom handler matches.
def execute_eval(eval_case, llm)
  feature = eval_case.feature
  runner = find_runner(feature)

  raw =
    if runner
      runner.run(eval_case, llm)
    else
      case feature
      when "custom:pdf_to_text"
        pdf_to_text(llm, **eval_case.args)
      when "custom:image_to_text"
        image_to_text(llm, **eval_case.args)
      when "custom:prompt"
        DiscourseAi::Evals::PromptEvaluator.new(llm).prompt_call(eval_case.args)
      when "custom:edit_artifact"
        edit_artifact(llm, **eval_case.args)
      else
        raise ArgumentError, "Unsupported eval feature '#{feature}'"
      end
    end

  raw_entries = normalize_entries(raw)
  { raw: raw, raw_entries: raw_entries, classified: classify_results(eval_case, raw_entries) }
end
# Methods defined below this marker are private to the workbench.
private
# Readers for the values captured in #initialize.
attr_reader :output, :judge_llm, :persona_prompt, :persona_label
# Asks the runner registry for a runner matching the feature, passing the
# persona prompt along.
#
# @param feature [String] feature name taken from the eval case.
# @return [Object, nil] whatever Runners::Base.find_runner returns.
def find_runner(feature)
  registry = DiscourseAi::Evals::Runners::Base
  registry.find_runner(feature, persona_prompt)
end
# Ensures runner output is always an array of entries.
#
# Arrays pass through untouched; anything else (including nil and hashes)
# is wrapped in a one-element array.
#
# @param raw [Object] value returned by a runner or custom handler.
# @return [Array]
def normalize_entries(raw)
  return raw if raw.is_a?(Array)

  [raw]
end
# Classifies every entry, carrying over any metadata attached to it.
#
# Hash entries may wrap their payload under :raw and attach :metadata; hashes
# without a :raw key are themselves the payload. Non-hash entries are the
# payload and never carry metadata.
#
# @param eval_case [DiscourseAi::Evals::Eval] scenario holding the expectations.
# @param entries [Array] normalized runner output.
# @return [Array<Hash>] one classification per entry.
def classify_results(eval_case, entries)
  entries.map do |entry|
    wrapped = entry.is_a?(Hash)
    payload = wrapped && entry.key?(:raw) ? entry[:raw] : entry
    extra = wrapped ? entry[:metadata] : nil

    classify_result(eval_case, payload).tap do |verdict|
      # Only attach metadata when there is something to report.
      verdict[:metadata] = extra if extra.present?
    end
  end
end
# Decides whether a single result passes, using the first expectation the
# eval case defines, in this order: exact output, regex, tool call, judge.
# A case with no expectation at all always passes.
#
# @param eval_case [DiscourseAi::Evals::Eval] scenario holding the expectations.
# @param result [Object] payload produced by the LLM run.
# @return [Hash] `{ result: :pass }` or a `:fail` hash with expected/actual output
#   (tool-call and judge outcomes are delegated).
def classify_result(eval_case, result)
  if (expected = eval_case.expected_output)
    return { result: :pass } if result == expected

    return { result: :fail, expected_output: expected, actual_output: result }
  end

  if (pattern = eval_case.expected_output_regex)
    # The result is stringified so non-string payloads can still be matched.
    return { result: :pass } if result.to_s.match?(pattern)

    return { result: :fail, expected_output: pattern, actual_output: result }
  end

  if (tool_call = eval_case.expected_tool_call)
    return classify_tool_call(tool_call, result)
  end

  return judge_result(eval_case, result) if eval_case.judge

  { result: :pass }
end
# Checks that the LLM produced the expected tool call.
#
# When the result is an array, only the first ToolCall in it is compared.
#
# @param expected_tool_call [Hash] expectation with :name and :params keys.
# @param result [Object, Array] LLM output, possibly mixing text and tool calls.
# @return [Hash] `{ result: :pass }` or a `:fail` hash with expected/actual output.
def classify_tool_call(expected_tool_call, result)
  tool_call_class = DiscourseAi::Completions::ToolCall
  candidate = result.is_a?(Array) ? result.find { |item| item.is_a?(tool_call_class) } : result

  matches =
    candidate.is_a?(tool_call_class) && candidate.name == expected_tool_call[:name] &&
      candidate.parameters == expected_tool_call[:params]
  return { result: :pass } if matches

  # On failure report the full original result, not just the candidate.
  { result: :fail, expected_output: expected_tool_call, actual_output: result }
end
# Delegates scoring of a result to the configured judge LLM.
#
# @param eval_case [DiscourseAi::Evals::Eval] scenario being judged.
# @param result [Object] payload handed to Judge#evaluate.
# @return [Object] whatever Judge#evaluate returns.
# @raise [DiscourseAi::Evals::Eval::EvalError] when no judge LLM was configured.
def judge_result(eval_case, result)
  judge_model = judge_llm

  unless judge_model.nil?
    judge = DiscourseAi::Evals::Judge.new(eval_case: eval_case, judge_llm: judge_model)
    return judge.evaluate(result)
  end

  raise DiscourseAi::Evals::Eval::EvalError.new(
          "Evaluation '#{eval_case.id}' requires the --judge option to specify an LLM.",
          { eval_id: eval_case.id },
        )
end
# Extract text from an image upload by delegating to the ImageToText helper.
#
# @param llm [LlmModel] LLM backing the OCR step.
# @param path [String] path to the source image used for OCR.
# @return [String] text extracted from the image.
def image_to_text(llm, path:)
  creator = UploadCreator.new(File.open(path), File.basename(path))
  upload = creator.create_for(Discourse.system_user.id)

  extracted = +""
  extractor =
    DiscourseAi::Utils::ImageToText.new(upload: upload, llm_model: llm, user: Discourse.system_user)
  extractor.extract_text do |chunk, _error|
    # Every non-nil chunk is followed by a blank line; nil chunks are ignored.
    extracted << chunk << "\n\n" if chunk
  end

  extracted
ensure
  # The upload only exists for the duration of this call.
  upload.destroy if upload
end
# Extract text from a PDF, optionally falling back to LLM-guided OCR for pages.
#
# @param llm [LlmModel] LLM passed to PdfToText for OCR guidance.
# @param path [String] path to the PDF fixture.
# @return [String] text aggregated across the PDF pages.
def pdf_to_text(llm, path:)
  creator = UploadCreator.new(File.open(path), File.basename(path))
  upload = creator.create_for(Discourse.system_user.id)

  extracted = +""
  extractor =
    DiscourseAi::Utils::PdfToText.new(upload: upload, user: Discourse.system_user, llm_model: llm)
  extractor.extract_text do |chunk|
    # Every non-nil chunk is followed by a blank line; nil chunks are ignored.
    extracted << chunk << "\n\n" if chunk
  end

  extracted
ensure
  # The upload only exists for the duration of this call.
  upload.destroy if upload
end
# Run the edit artifact flow, returning the final artifact contents.
#
# @param llm [LlmModel] LLM used to produce diffs.
# @param css_path [String] path to the CSS fixture.
# @param js_path [String] path to the JS fixture.
# @param html_path [String] path to the HTML fixture.
# @param instructions_path [String] instructions fed to the LLM.
# @return [Hash] latest artifact snapshot ({ css:, js:, html: }).
# @raise [DiscourseAi::Evals::Eval::EvalError] when `diff.failed_searches` is
#   present after applying, or when the resulting JS fails the syntax check.
def edit_artifact(llm, css_path:, js_path:, html_path:, instructions_path:)
  css = File.read(css_path)
  js = File.read(js_path)
  html = File.read(html_path)
  instructions = File.read(instructions_path)

  # Scratch artifact the diff strategy operates on; removed again in `ensure`.
  artifact =
    AiArtifact.create!(
      css: css,
      js: js,
      html: html,
      user_id: Discourse.system_user.id,
      post_id: 1,
      name: "eval artifact",
    )

  # Unsaved placeholder post passed to the strategy.
  post = Post.new(topic_id: 1, id: 1)

  diff =
    DiscourseAi::AiBot::ArtifactUpdateStrategies::Diff.new(
      llm: llm.to_llm,
      post: post,
      user: Discourse.system_user,
      artifact: artifact,
      artifact_version: nil,
      instructions: instructions,
    )
  diff.apply

  if diff.failed_searches.present?
    raise DiscourseAi::Evals::Eval::EvalError.new(
            "Failed to apply all changes",
            diff.failed_searches,
          )
  end

  version = artifact.versions.last
  unless valid_javascript?(version.js)
    raise DiscourseAi::Evals::Eval::EvalError.new("Invalid JS", version.js)
  end

  # Snapshot is built before the artifact is destroyed by the ensure clause.
  { css: version.css, js: version.js, html: version.html }
ensure
  # Clean up on every exit path (success, EvalError, or an unexpected error),
  # mirroring how the upload helpers destroy their uploads.
  artifact.destroy if artifact
end
# Checks JavaScript syntax by writing the source to a temp file and running
# `node --check` against it.
#
# @param str [String] JavaScript source to validate.
# @return [Boolean] true when the command succeeds; false when it raises
#   Discourse::Utils::CommandError or when any other StandardError occurs.
def valid_javascript?(str)
  require "open3"

  Tempfile.create(%w[test .js]) do |script|
    script.write(str)
    # node is given the file path, so the content has to be flushed first.
    script.flush

    command = ["node", "--check", script.path]
    options = { failure_message: "Invalid JavaScript syntax", timeout: 30 }

    begin
      Discourse::Utils.execute_command(*command, **options)
      true
    rescue Discourse::Utils::CommandError
      false
    end
  end
rescue StandardError
  # Any other failure is also reported as "not valid".
  false
end
end
end
end