mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-06-19 06:43:54 +08:00
160 lines
4.7 KiB
Ruby
Vendored
160 lines
4.7 KiB
Ruby
Vendored
# frozen_string_literal: true
|
|
|
|
require_relative "../../evals/lib/recorder"
|
|
require_relative "../../evals/lib/structured_logger"
|
|
require_relative "../../evals/lib/eval"
|
|
|
|
# Specs for DiscourseAi::Evals::Recorder. The eval case, both loggers and the
# console formatter are instance_doubles, and anything written to the
# recorder's `output` is captured in a StringIO so it can be asserted on.
RSpec.describe DiscourseAi::Evals::Recorder do
  subject(:recorder) do
    described_class.new(
      eval_case,
      logger,
      log_path,
      structured_logger,
      total_targets: 1,
      agent_key: agent_key,
      output: output,
    )
  end

  # Plain values shared by the subject and the doubles below.
  let(:log_path) { "/tmp/example.json" }
  let(:agent_key) { "default" }
  let(:output) { StringIO.new }
  let(:child_step) { {} }
  # Nested contexts flip this to false to simulate a structured log whose
  # root step was never started.
  let(:root_started) { true }

  let(:logger) { instance_double(Logger, info: nil, error: nil) }
  let(:eval_case) do
    instance_double("DiscourseAi::Evals::Eval", id: "example-eval", to_json: { foo: "bar" })
  end
  let(:structured_logger) do
    instance_double(
      DiscourseAi::Evals::StructuredLogger,
      start_root: nil,
      root_started?: root_started,
      add_child_step: child_step,
      append_entry: nil,
      finish_root: nil,
      to_trace_event_json: "{}",
      path: log_path,
    )
  end
  let(:formatter) do
    instance_double(
      DiscourseAi::Evals::ConsoleFormatter,
      announce_start: nil,
      record_result: nil,
      record_skip: nil,
      pause_progress_line: nil,
      record_comparison_judged: nil,
      record_comparison_expected: nil,
      finalize: nil,
    )
  end

  # Every ConsoleFormatter.new call made during an example returns the double above.
  before do
    allow(DiscourseAi::Evals::ConsoleFormatter).to receive(:new).and_return(formatter)
  end

  describe "#execution_context" do
    it "exposes recorder loggers through an explicit completion context" do
      completion_context = recorder.execution_context

      expect(completion_context.audit_logger).to eq(logger)
      expect(completion_context.structured_audit_logger).to eq(structured_logger)
    end
  end

  describe "#running" do
    it "starts a root structured log step for the eval" do
      # The eval case's to_json hash plus the agent key.
      root_args = { foo: "bar", agent_key: "default" }

      recorder.running

      expect(structured_logger).to have_received(:start_root).with(
        name: "Evaluating example-eval (agent: default)",
        args: root_args,
      )
      expect(logger).to have_received(:info).with(
        "Starting evaluation 'example-eval' (agent: default)",
      )
    end
  end

  describe "#record_llm_skip" do
    context "when structured logging has not started" do
      let(:root_started) { false }

      it "raises an informative error" do
        expect { recorder.record_llm_skip("gpt-4", "vision-only feature") }.to raise_error(
          ArgumentError,
        ).with_message("You didn't instantiated this object with #with_cassette")
      end
    end

    it "logs the skip reason when the structured log is active" do
      recorder.record_llm_skip("gpt-4", "vision-only feature")

      expect(logger).to have_received(:info).with(
        "Skipping LLM: gpt-4 - Reason: vision-only feature",
      )
    end
  end

  describe "#record_llm_results" do
    # One passing and one failing result; the failing one carries the
    # expected/actual payload that the output assertions look for.
    let(:results) do
      passing = { result: :pass }
      failing = {
        result: :fail,
        message: "Mismatch",
        expected_output: "ideal",
        actual_output: "oops",
        context: "details",
      }

      [passing, failing]
    end
    let(:start_time) { Time.utc(2024, 1, 1, 12, 0, 0) }
    let(:now) { Time.utc(2024, 1, 1, 12, 1, 0) }

    # Pin the clock so the `ended_at` value is deterministic.
    before do
      allow(Time).to receive(:now).and_return(now)
    end

    context "when structured logging has not started" do
      let(:root_started) { false }

      it "raises an informative error" do
        expect { recorder.record_llm_results("gpt-4", results, start_time) }.to raise_error(
          ArgumentError,
        ).with_message("You didn't instantiated this object with #with_cassette")
      end
    end

    it "records structured log entries and prints human friendly output" do
      recorder.record_llm_results("gpt-4", results, start_time)

      expect(structured_logger).to have_received(:add_child_step).with(
        name: "Evaluating with LLM: gpt-4",
      )

      # One entry per result: :good for the pass, :bad for the failure.
      %i[good bad].each do |entry_name|
        expect(structured_logger).to have_received(:append_entry).with(
          step: child_step,
          name: entry_name,
          started_at: start_time,
          ended_at: now.utc,
        )
      end

      expect(logger).to have_received(:info).with("Evaluating with LLM: gpt-4")
      expect(logger).to have_received(:error).with("Evaluation failed with LLM: gpt-4")

      printed = output.string
      expect(printed).to include(
        "gpt-4: ",
        "Passed 🟢",
        "Failed 🔴",
        "---- Expected ----\nideal",
        "---- Actual ----\noops",
      )
    end
  end
end
|