# discourse/plugins/discourse-ai/spec/evals/workbench_spec.rb

# frozen_string_literal: true

require_relative "../../evals/lib/workbench"
require_relative "../../evals/lib/eval"
require_relative "../../evals/lib/llm_repository"
require_relative "../../evals/lib/recorder"

RSpec.describe DiscourseAi::Evals::Workbench do
  # Workbench under test, writing to the in-memory `output` IO below.
  subject(:workbench) { described_class.new(output: output) }

  let(:output) { StringIO.new }
  # Verified double standing in for the console formatter; both stubbed
  # methods return nil.
  let(:formatter) do
    instance_double(DiscourseAi::Evals::ConsoleFormatter, announce_start: nil, finalize: nil)
  end
  # Verified double standing in for the recorder; examples assert on the
  # messages it receives.
  let(:recorder) do
    instance_double(
      DiscourseAi::Evals::Recorder,
      execution_context: nil,
      record_llm_skip: nil,
      record_llm_results: nil,
      finish: nil,
    )
  end
  # Minimal eval double. `vision` is driven by `requires_vision` so nested
  # contexts can flip it by overriding that `let`.
  let(:eval_case) do
    instance_double(
      DiscourseAi::Evals::Eval,
      id: "example-eval",
      vision: requires_vision,
      feature: "custom:prompt",
      args: {
      },
      expected_output: nil,
      expected_output_regex: nil,
      expected_tool_call: nil,
      judge: nil,
    )
  end
  let(:requires_vision) { false }
  # Model created with `Fabricate.build`; `vision_enabled` is driven by
  # `llm_supports_vision` so nested contexts can flip it.
  let(:llm) do
    Fabricate.build(
      :fake_model,
      display_name: "gpt-4",
      name: "gpt-4",
      vision_enabled: llm_supports_vision,
    )
  end
  let(:llm_supports_vision) { true }

  before do
    # Hand the doubles above to any code that asks for a formatter or a recorder.
    allow(DiscourseAi::Evals::ConsoleFormatter).to receive(:new).and_return(formatter)
    allow(DiscourseAi::Evals::Recorder).to receive(:with_cassette).and_return(recorder)
    # Examples compare recorded timestamps against `Time.now.utc`, so the clock
    # is frozen for every example.
    freeze_time
  end

  # Covers `run_evals`, the method every example in this group calls. The
  # recorder and formatter are the doubles installed by the file-level `before`.
  describe "#run_evals" do
    it "records results for each llm" do
      # `execute_eval` is stubbed on the subject so this example only checks what
      # is reported to the recorder/formatter. The directive is re-enabled right
      # after the stub so it does not stay disabled for the rest of the file.
      # rubocop:disable RSpec/SubjectStub
      allow(workbench).to receive(:execute_eval).and_return(
        { raw: "output", raw_entries: ["output"], classified: [{ result: :pass }] },
      )
      # rubocop:enable RSpec/SubjectStub

      workbench.run_evals(eval_cases: [eval_case], llms: [llm])

      expect(DiscourseAi::Evals::Recorder).to have_received(:with_cassette).with(
        eval_case,
        output: output,
        total_targets: 1,
        agent_key: :default,
        formatter: formatter,
        announce_formatter: false,
        finalize_formatter: false,
      )
      # Time is frozen by the file-level `before`, so `Time.now.utc` here is the
      # same instant the workbench observed.
      expect(recorder).to have_received(:record_llm_results).with(
        "gpt-4",
        [{ result: :pass }],
        Time.now.utc,
        raw_entries: ["output"],
        display_label: "gpt-4",
        row_prefix: "example-eval",
      )
      expect(recorder).to have_received(:finish)
      expect(formatter).to have_received(:finalize)
    end

    context "when the eval requires vision but the llm does not support it" do
      let(:requires_vision) { true }
      let(:llm_supports_vision) { false }

      # No stub for `execute_eval` is installed here; the example only asserts
      # that the skip and its reason reach the recorder.
      it "skips the llm and records the reason" do
        workbench.run_evals(eval_cases: [eval_case], llms: [llm])

        expect(recorder).to have_received(:record_llm_skip).with(
          "gpt-4",
          "LLM does not support vision",
          display_label: "gpt-4",
          row_prefix: "example-eval",
        )
        expect(recorder).to have_received(:finish)
      end
    end

    context "when eval execution raises an EvalError" do
      it "records the failure with the error context" do
        # The second constructor argument is expected back under `:context`.
        error = DiscourseAi::Evals::Eval::EvalError.new("boom", { foo: "bar" })
        allow(workbench).to receive(:execute_eval).and_raise(error) # rubocop:disable RSpec/SubjectStub

        workbench.run_evals(eval_cases: [eval_case], llms: [llm])

        expect(recorder).to have_received(:record_llm_results).with(
          "gpt-4",
          [{ result: :fail, message: "boom", context: { foo: "bar" } }],
          Time.now.utc,
          display_label: "gpt-4",
          row_prefix: "example-eval",
        )
        expect(recorder).to have_received(:finish)
      end
    end

    context "when eval execution raises an unexpected error" do
      it "records the failure with the exception message" do
        allow(workbench).to receive(:execute_eval).and_raise(StandardError.new("kaboom")) # rubocop:disable RSpec/SubjectStub

        workbench.run_evals(eval_cases: [eval_case], llms: [llm])

        # Unlike the EvalError case, no `:context` key is expected here.
        expect(recorder).to have_received(:record_llm_results).with(
          "gpt-4",
          [{ message: "kaboom", result: :fail }],
          Time.now.utc,
          display_label: "gpt-4",
          row_prefix: "example-eval",
        )
      end
    end
  end

  # Runs `execute_eval` against real feature runners with prepared LLM
  # responses. `workbench` is the named subject declared at the top of the file;
  # only `llm` is overridden here (created with `Fabricate` instead of
  # `Fabricate.build`).
  describe "feature execution flows" do
    fab!(:category)
    let(:llm) { Fabricate(:fake_model, display_name: "Fake Eval Model", vision_enabled: false) }

    before do
      enable_current_plugin
      # Make sure the AiAgent rows for the agents used below exist and are enabled.
      ensure_system_agent(DiscourseAi::Agents::Summarizer)
      ensure_system_agent(DiscourseAi::Agents::ShortSummarizer)
      ensure_system_agent(DiscourseAi::Agents::SpamDetector)
      # NOTE(review): presumably needed so the rows saved above are picked up
      # instead of stale cached agents - confirm against AiAgent.agent_cache.
      AiAgent.agent_cache.flush!
    end

    it "generates topic summaries using the summarization eval feature" do
      # OpenStruct stand-in for an eval definition.
      eval_case =
        OpenStruct.new(
          id: "topic-summary",
          feature: "summarization:topic_summaries",
          args: {
            input: "First post\nSecond post",
          },
          expected_output: "Concise summary",
          expected_output_regex: nil,
          expected_tool_call: nil,
          judge: nil,
        )

      # The prepared response equals `expected_output`, so a pass is expected.
      results =
        DiscourseAi::Completions::Llm.with_prepared_responses(["Concise summary"]) do
          workbench.execute_eval(eval_case, llm)
        end

      expect(results[:classified].first[:result]).to eq(:pass)
    end

    it "always provides an execution_context to runners even when caller omits it" do
      eval_case =
        OpenStruct.new(
          id: "ctx-check",
          feature: "summarization:topic_summaries",
          args: {
            input: "Test post",
          },
          expected_output: nil,
          expected_output_regex: nil,
          expected_tool_call: nil,
          judge: nil,
          vision: false,
        )

      # Wrap `Bot.as` so every bot it returns has `reply` wrapped as well. The
      # wrapper records the `execution_context:` keyword and then delegates to
      # the real method with the original arguments and block.
      captured_context = nil
      allow(DiscourseAi::Agents::Bot).to receive(
        :as,
      ).and_wrap_original do |original, *args, **kwargs|
        original
          .call(*args, **kwargs)
          .tap do |bot|
            allow(bot).to receive(
              :reply,
            ).and_wrap_original do |reply_method, *rargs, **rkwargs, &blk|
              captured_context = rkwargs[:execution_context]
              reply_method.call(*rargs, **rkwargs, &blk)
            end
          end
      end

      # `execute_eval` is called without an execution context argument.
      DiscourseAi::Completions::Llm.with_prepared_responses(["Summary"]) do
        workbench.execute_eval(eval_case, llm)
      end

      expect(captured_context).to be_a(DiscourseAi::Completions::ExecutionContext)
    end

    it "flags spam posts via the spam inspection eval feature" do
      eval_case =
        OpenStruct.new(
          id: "spam-eval",
          feature: "spam:inspect_posts",
          args: {
            input: "Buy now click now http://spam.test",
            topic_title: "Limited offer",
          },
          expected_output: "true",
          expected_output_regex: nil,
          expected_tool_call: nil,
          judge: nil,
        )

      # Two prepared responses are queued for this run.
      results =
        DiscourseAi::Completions::Llm.with_prepared_responses([true, "obvious spam"]) do
          workbench.execute_eval(eval_case, llm)
        end

      expect(results[:classified].first[:result]).to eq(:pass)
    end
  end

  # Creates or refreshes the AiAgent record backing a system agent class.
  #
  # The record id is looked up in `DiscourseAi::Agents::Agent.system_agents`, and
  # prompt/format/example settings are copied from a fresh instance of
  # `agent_class`. `priority`, `name`, `description` and `tools` keep any truthy
  # value already on the record. The record is saved with validations skipped.
  #
  # @param agent_class [Class] key into `DiscourseAi::Agents::Agent.system_agents`
  # @return [AiAgent] the saved record
  def ensure_system_agent(agent_class)
    record_id = DiscourseAi::Agents::Agent.system_agents[agent_class]
    template = agent_class.new
    # Reads an attribute from the template only when it exposes it, else nil.
    optional = ->(attr) { template.public_send(attr) if template.respond_to?(attr) }

    record = AiAgent.find_or_initialize_by(id: record_id)

    record.system = true
    record.enabled = true

    # Only filled in when the record has no truthy value yet.
    record.priority ||= false
    record.name ||= agent_class.name
    record.description ||= agent_class.description

    # Always overwritten from the template instance.
    record.system_prompt = template.system_prompt
    record.allowed_group_ids = [Group::AUTO_GROUPS[:everyone]]
    record.response_format = template.response_format
    record.examples = template.examples
    record.temperature = optional.call(:temperature)
    record.top_p = optional.call(:top_p)

    record.show_thinking = true
    record.tools ||= []

    record.save!(validate: false)
    record
  end

  # Covers the private `judge_result` method, invoked through `send`.
  describe "#judge_result" do
    # Eval stand-in whose judge config requires a rating of at least 7.
    let(:judge_eval_case) do
      OpenStruct.new(
        id: "judge-eval",
        args: {
          input: "Source content",
        },
        judge: {
          criteria: "Score the output against the provided input, rewarding accuracy and clarity.",
          pass_rating: 7,
        },
      )
    end

    # Builds a workbench with a freshly fabricated judge LLM, queues a single
    # JSON response carrying the given rating/explanation for that LLM, and
    # returns what `judge_result` produces for the answer "answer".
    def judge_with(rating:, explanation:)
      judge_llm = Fabricate(:fake_model)
      judged_workbench = described_class.new(output: output, judge_llm: judge_llm)
      canned_reply = { "rating" => rating, "explanation" => explanation }.to_json

      DiscourseAi::Completions::Llm.with_prepared_responses([canned_reply], llm: judge_llm) do
        judged_workbench.send(:judge_result, judge_eval_case, "answer")
      end
    end

    it "raises a helpful error when no judge llm is configured" do
      # The default subject is built without a `judge_llm`.
      expect do
        workbench.send(:judge_result, judge_eval_case, "answer")
      end.to raise_error(DiscourseAi::Evals::Eval::EvalError, /requires the --judge option/)
    end

    it "returns a passing result when the rating meets the threshold" do
      verdict = judge_with(rating: 8, explanation: "good")

      expect(verdict[:result]).to eq(:pass)
    end

    it "returns a failure when the rating is below the threshold" do
      verdict = judge_with(rating: 5, explanation: "needs work")

      expect(verdict[:result]).to eq(:fail)
      expect(verdict[:message]).to include("LLM Rating below threshold")
    end
  end

  # Covers the private `judge_input` method, invoked through `send`.
  describe "#judge_input" do
    let(:raw_result) { "SELECT 1" }

    it "forwards `name` and `description` from metadata so the judge sees them" do
      metadata = { name: "My Query", description: "Counts users" }

      judged = workbench.send(:judge_input, raw_result, metadata)

      expect(judged).to eq(result: "SELECT 1", name: "My Query", description: "Counts users")
    end

    it "returns the raw result when there is no metadata" do
      # Both a missing (nil) and an empty metadata hash must pass the result through.
      [nil, {}].each do |blank_metadata|
        expect(workbench.send(:judge_input, raw_result, blank_metadata)).to eq("SELECT 1")
      end
    end
  end
end