discourse/plugins/discourse-ai/evals/lib/console_formatter.rb

# frozen_string_literal: true

module DiscourseAi
  module Evals
    # Renders richer CLI output for eval runs: a compact progress bar followed by
    # a table summarizing each case across models/agents.
    class ConsoleFormatter
      # Number of characters used to draw the progress bar.
      PROGRESS_BAR_WIDTH = 28
      # Maximum length of truncated labels/details; also the wrap width used
      # when measuring cells and the per-column cap applied by clamp_widths.
      SAMPLE_WIDTH = 46
      # Columns are never shrunk below this width when fitting the table.
      MIN_CELL_WIDTH = 8
      # Table width limit used when ENV["COLUMNS"] is not a usable integer.
      DEFAULT_MAX_WIDTH = 120
      # Default column header for the judge verdict in judged comparisons.
      JUDGE_COLUMN_LABEL = "judge"

      # Sets up an empty table and progress state.
      #
      # label         - run name shown in the start banner
      # output        - IO-like object receiving puts/print/flush calls
      # total_targets - multiplier applied to the row count to size the progress bar
      # agent_key     - optional agent name; shown in the banner unless "default"
      def initialize(label:, output:, total_targets:, agent_key: nil)
        # Collaborators and run identity.
        @output = output
        @run_label = label
        @agent_key = agent_key
        @total_targets = total_targets

        # Table contents.
        @columns = []
        @rows = []
        @row_offsets = {}

        # Progress bookkeeping; nil means "not known yet".
        @case_count = nil
        @total_units = nil
        @completed_units = 0

        @max_table_width = detect_max_width
      end

      # Prints the "Starting evaluation" banner, appending the agent key when
      # one is set and it is not "default".
      def announce_start
        show_agent = agent_key && agent_key != "default"
        banner = "Starting evaluation #{run_label}"
        banner << " (agent: #{agent_key})" if show_agent
        output.puts(banner)
      end

      # Stores one cell per result in the display_label column, creating rows
      # as needed starting at the offset reserved for row_prefix, then advances
      # the progress bar by the number of results.
      def record_result(display_label:, llm_label:, results:, raw_entries:, row_prefix: nil)
        register_column(display_label)
        @case_count = [@case_count, results.length].compact.max
        first_row = resolve_row_start(row_prefix)

        results.each_with_index do |result, offset|
          position = first_row + offset
          rows[position] ||= { label: label_for(result[:metadata], offset, row_prefix), cells: {} }
          # raw_entries may be nil; the matching entry backs the cell detail.
          raw_entry = raw_entries&.[](offset)
          rows[position][:cells][display_label] = build_cell(result, raw_entry)
        end

        refresh_total_units
        bump_progress(results.length, llm_label)
      end

      # Marks the first row of row_prefix as skipped for the display_label
      # column (with reason as detail) and advances progress by the current
      # case count.
      def record_skip(display_label:, llm_label:, reason:, row_prefix: nil)
        register_column(display_label)
        @case_count = [@case_count, 1].compact.max
        position = resolve_row_start(row_prefix)

        rows[position] ||= { label: label_for({}, 0, row_prefix), cells: {} }
        rows[position][:cells][display_label] = { status: :skipped, detail: reason }

        refresh_total_units
        bump_progress(@case_count, llm_label)
      end

      # Records a judged comparison: for every row belonging to row_prefix,
      # writes the judge verdict into the judge_label column and one cell per
      # candidate into that candidate's display_label column.
      def record_comparison_judged(
        row_prefix:,
        candidates:,
        result:,
        judge_label: JUDGE_COLUMN_LABEL
      )
        target_rows = row_indices_for(row_prefix)
        @case_count = [@case_count, target_rows.length].compact.max

        register_comparison_columns(candidates)
        register_column(judge_label)

        # The suffix depends only on the verdict, so it is computed once.
        suffix = label_suffix_for_winner(result[:winner], result[:winner_label])

        target_rows.each_with_index do |position, relative|
          ensure_row(position, row_prefix, relative, label_suffix: suffix)
          cells = rows[position][:cells]
          cells[judge_label] = judged_summary_cell(result)

          candidates.each_with_index do |candidate, candidate_index|
            cells[candidate[:display_label]] = comparison_cell_for(
              candidate: candidate,
              result: result,
              candidate_index: candidate_index,
            )
          end
        end

        refresh_total_units
      end

      # Records a comparison checked against an expected winner: for every row
      # belonging to row_prefix, writes one cell per candidate, attaching the
      # failure whose label matches the candidate's label (if any).
      def record_comparison_expected(row_prefix:, candidates:, winner:, failures:, status_line:)
        target_rows = row_indices_for(row_prefix)
        @case_count = [@case_count, target_rows.length].compact.max

        register_comparison_columns(candidates)
        failures_by_label = failures.index_by { |failure| normalize_label(failure[:label]) }
        is_tie = winner.to_s == "tie"
        winner_name = normalize_label(winner)
        suffix = label_suffix_for_winner(winner, nil)

        target_rows.each_with_index do |position, relative|
          ensure_row(position, row_prefix, relative, label_suffix: suffix)
          cells = rows[position][:cells]

          candidates.each do |candidate|
            name = normalize_label(candidate[:label])
            cells[candidate[:display_label]] = expected_comparison_cell(
              candidate_label: name,
              winner: winner_name,
              tie: is_tie,
              failure: failures_by_label[name],
              status_line: status_line,
              output: candidate[:output],
            )
          end
        end

        refresh_total_units
      end

      # Clears any in-flight progress line, then prints a blank line, the
      # results table, the per-column summary and the legend. Prints nothing
      # when no rows were recorded.
      def finalize
        return if rows.empty?

        pause_progress_line
        output.puts
        output.puts(build_table)
        output.puts(summary_line)
        output.puts(legend_line)
      end

      # Erases the progress line if one may currently be displayed.
      def pause_progress_line
        return unless progress_active?

        clear_progress_line
      end

      private

      # Private readers for the state assigned in #initialize.
      attr_reader :run_label, :output, :rows, :columns, :agent_key, :row_offsets

      # Adds label (as a String) to the column list unless it is already there.
      def register_column(label)
        name = label.to_s
        return if columns.include?(name)

        columns << name
      end

      # Returns the first row index reserved for row_prefix (:default when no
      # prefix is given), reserving the next free row on first use.
      def resolve_row_start(row_prefix)
        offset_key = row_prefix || :default
        row_offsets.fetch(offset_key) { row_offsets[offset_key] = rows.length }
      end

      # Returns the indices of the rows belonging to row_prefix: from its start
      # offset up to the row before the next registered offset, or the last
      # existing row. The result always contains at least the start index.
      def row_indices_for(row_prefix)
        first = row_offsets[row_prefix] || resolve_row_start(row_prefix)
        following = row_offsets.values.select { |offset| offset > first }.min
        last = following&.positive? ? following - 1 : rows.length - 1

        # Never let the range end before it starts (e.g. no rows exist yet).
        last = [last, first].max
        (first..last).to_a
      end

      # Returns the row at row_index, creating it with a generated label and an
      # empty cell map when it does not exist yet.
      def ensure_row(row_index, row_prefix, relative_index, label_suffix: nil)
        existing = rows[row_index]
        return existing if existing

        label = label_for({}, relative_index, row_prefix, label_suffix: label_suffix)
        rows[row_index] = { label: label, cells: {} }
      end

      # Builds the { status:, detail: } hash for a single eval result.
      def build_cell(result, raw_entry)
        { status: normalize_status(result[:result]), detail: detail_for(result, raw_entry) }
      end

      # Passes through :pass, :fail and :skipped; anything else becomes :unknown.
      def normalize_status(value)
        recognized = %i[pass fail skipped]
        recognized.include?(value) ? value : :unknown
      end

      # Returns the detail text for a result cell, truncated to SAMPLE_WIDTH:
      # the skip message for skips, nothing for passes, "Expected | Actual" for
      # failures, and otherwise a sample of the actual/raw output (nil if blank).
      def detail_for(result, raw_entry)
        case result[:result]
        when :skipped
          return truncate(detail_for_skip(result), SAMPLE_WIDTH)
        when :pass
          return nil
        end

        actual = stringify(result[:actual_output] || extract_raw(raw_entry))

        if result[:result] == :fail
          expected = stringify(result[:expected_output] || result[:expected_output_regex])
          segments = []
          segments << "Expected: #{expected}" if expected.present?
          segments << "Actual: #{actual}" if actual.present?
          return truncate(segments.join(" | "), SAMPLE_WIDTH) unless segments.empty?
        end

        truncate(actual, SAMPLE_WIDTH) if actual.present?
      end

      # Builds a candidate's cell for a judged comparison. The rating is looked
      # up by candidate label first and by position as a fallback. A tie counts
      # as :pass for every candidate; with no winner the status is :unknown.
      def comparison_cell_for(candidate:, result:, candidate_index:)
        winner = normalize_label(result[:winner])
        candidate_name = normalize_label(candidate[:label])

        ratings = Array(result[:ratings])
        ratings_by_candidate = ratings.index_by { |entry| normalize_label(entry[:candidate]) }
        rating = ratings_by_candidate[candidate_name] || ratings[candidate_index]

        tie = tie_result?(winner, result[:winner_label])
        status =
          if tie
            :pass
          elsif winner.blank?
            :unknown
          elsif candidate_name == winner
            :pass
          else
            :fail
          end

        {
          status: status,
          detail:
            comparison_detail(
              rating: rating,
              status: status,
              tie: tie,
              winner_explanation: result[:winner_explanation],
            ),
        }
      end

      # Builds a candidate's cell for an expected-winner comparison. A tie
      # counts as :pass; with no winner the status is :unknown.
      def expected_comparison_cell(candidate_label:, winner:, tie:, failure:, status_line:, output:)
        status =
          if tie
            :pass
          elsif winner.blank?
            :unknown
          elsif candidate_label == winner
            :pass
          else
            :fail
          end

        {
          status: status,
          detail:
            expected_comparison_detail(
              status: status,
              tie: tie,
              failure: failure,
              status_line: status_line,
              candidate_label: candidate_label,
              winner: winner,
              output: output,
            ),
        }
      end

      # Detail text for a judged comparison cell: the rating summary (or "Tie")
      # on ties, nothing for unknown status, the rating alone for a loser that
      # has one, otherwise "Winner", rating and reason joined with " • ".
      def comparison_detail(rating:, status:, tie:, winner_explanation:)
        return rating_summary(rating) || "Tie" if tie
        return if status == :unknown

        summary = rating_summary(rating)
        return summary if status == :fail && summary.present?

        won = status == :pass
        segments = []
        segments << "Winner" if won
        segments << summary if summary.present?
        segments << "Reason: #{winner_explanation}" if won && winner_explanation.present?

        segments.empty? ? nil : segments.join(" • ")
      end

      # Builds the judge-column cell: :tie with the explanation for ties,
      # otherwise :pass with the winner and reason (or "Result: no winner").
      # NOTE(review): a verdict without a winner is still reported as :pass
      # here, while candidate cells use :unknown — confirm this is intended.
      def judged_summary_cell(result)
        explanation = result[:winner_explanation]

        if tie_result?(result[:winner], result[:winner_label])
          detail = explanation.present? ? "Tie — #{explanation}" : "Result: tie"
          return { status: :tie, detail: detail }
        end

        winner_name = normalize_label(result[:winner]).presence || result[:winner_label]
        detail =
          if winner_name.present?
            segments = ["Winner: #{winner_name}"]
            segments << "Reason: #{explanation}" if explanation.present?
            segments.join(" | ")
          else
            "Result: no winner"
          end

        { status: :pass, detail: detail }
      end

      # Detail text for an expected-winner cell. Precedence: nothing for
      # unknown status, the formatted failure, then (for passes) the candidate
      # output, the status line or "Tie"; finally "<label> (Winner|Lost)" when
      # a winner is known.
      def expected_comparison_detail(
        status:,
        tie:,
        failure:,
        status_line:,
        candidate_label:,
        winner:,
        output:
      )
        return if status == :unknown
        return format_failure(failure) if failure.present?

        passed = status == :pass
        if passed
          return truncate(output, SAMPLE_WIDTH) if output.present?
          return truncate(status_line.to_s, SAMPLE_WIDTH) if status_line.present?
          return "Tie" if tie
        end

        return if winner.blank?

        outcome = passed ? "Winner" : "Lost"
        "#{candidate_label} (#{outcome})"
      end

      # Formats a rating as "<rating>/10", followed by its explanation when one
      # is present. Returns nil for a blank rating.
      def rating_summary(rating)
        return if rating.blank?

        score = "#{rating[:rating]}/10"
        note = rating[:explanation].presence
        note ? "#{score} — #{note}" : score
      end

      # Renders the present expected/actual values of a failure as
      # "Expected: … | Actual: …", truncated to SAMPLE_WIDTH.
      def format_failure(failure)
        fields = { "Expected" => failure[:expected], "Actual" => failure[:actual] }
        segments =
          fields.filter_map do |name, value|
            "#{name}: #{stringify(value)}" if value.present?
          end

        truncate(segments.join(" | "), SAMPLE_WIDTH)
      end

      # Returns the result's :message, or "Skipped" when none is set.
      def detail_for_skip(result)
        message = result[:message]
        message || "Skipped"
      end

      # Builds a row label. The base is the first present metadata value among
      # input/message/query/content/prompt/text (whitespace collapsed and
      # truncated), falling back to row_prefix or "Case N". The prefix is
      # prepended in brackets unless it already is the base, and label_suffix
      # is appended in parentheses when present.
      def label_for(metadata, index, row_prefix, label_suffix: nil)
        fallback = row_prefix.presence || "Case #{index + 1}"

        sample =
          if metadata.present?
            keys = %i[input message query content prompt text]
            keys.map { |key| metadata[key] }.find(&:present?)
          end

        base = sample.present? ? truncate(sample.to_s.gsub(/\s+/, " "), SAMPLE_WIDTH) : fallback
        prefixed = row_prefix.present? && base != row_prefix ? "[#{row_prefix}] #{base}" : base
        label_suffix.present? ? "#{prefixed} (#{label_suffix})" : prefixed
      end

      # Adds units to the completed count and redraws the progress line.
      # Does nothing when there are no targets.
      def bump_progress(units, llm_label)
        return unless @total_targets.positive?

        @completed_units += units
        render_progress(llm_label)
      end

      # Redraws the single-line progress bar in place (carriage return, no
      # newline) showing percentage, completed/total units and the LLM label.
      #
      # The label is padded to a fixed width: the line is overwritten rather
      # than cleared, so a shorter label would otherwise leave the tail of a
      # previous, longer label visible.
      def render_progress(llm_label)
        return unless progress_active?

        label_width = 18
        # With zero total units there is nothing left to do: show 100%.
        percent = @total_units.zero? ? 1.0 : @completed_units.to_f / @total_units
        percent = percent.clamp(0.0, 1.0)
        filled = (percent * PROGRESS_BAR_WIDTH).round
        bar = "#{"█" * filled}#{"░" * (PROGRESS_BAR_WIDTH - filled)}"
        message =
          format(
            "\rEvaluating [%s] %3d%% | %d/%d | %-#{label_width}s",
            bar,
            (percent * 100).round,
            @completed_units,
            @total_units,
            truncate(llm_label, label_width),
          )
        output.print(message)
        output.flush
      end

      # Returns the cursor to column 0 and erases to the end of the line.
      def clear_progress_line
        erase_line = "\r\033[K"
        output.print(erase_line)
      end

      # True once @total_units has been assigned (it starts out as nil).
      def progress_active?
        !@total_units.blank?
      end

      # Renders the whole table as one newline-joined string: top border,
      # header, header separator, every row (with a separator between
      # consecutive rows) and the bottom border.
      def build_table
        widths = compute_column_widths
        divider = middle_separator(widths)

        body =
          rows.each_with_index.flat_map do |row, position|
            rendered = row_lines(row, widths)
            position.zero? ? rendered : [divider, *rendered]
          end

        [
          top_border(widths),
          header_row(widths),
          header_separator(widths),
          *body,
          bottom_border(widths),
        ].join("\n")
      end

      # Returns the width of the label column followed by one width per
      # registered column, each sized to its widest header/content line and
      # then fitted by clamp_widths.
      def compute_column_widths
        label_width = [case_header.length, *rows.map { |row| row[:label].to_s.length }].max

        value_widths =
          columns.map do |column|
            line_lengths = rows.flat_map { |row| cell_lines(row[:cells][column]) }.map(&:length)
            [column.length, *line_lengths].max
          end

        clamp_widths([label_width, *value_widths])
      end

      # Header text of the first (row label) column.
      def case_header
        "input"
      end

      # Renders the header line: the label column title followed by every
      # registered column name, each padded to its column width.
      def header_row(widths)
        titles = [case_header, *columns]
        cells = titles.each_with_index.map { |title, position| padded(title, widths[position]) }
        "│ #{cells.join(" │ ")} │"
      end

      # Top edge of the table; each column spans its width plus two padding cells.
      def top_border(widths)
        segments = widths.map { |width| "─" * (width + 2) }
        "┌" + segments.join("┬") + "┐"
      end

      # Horizontal rule drawn between the header row and the first data row.
      def header_separator(widths)
        segments = widths.map { |width| "─" * (width + 2) }
        "├" + segments.join("┼") + "┤"
      end

      # Horizontal rule drawn between two consecutive data rows.
      def middle_separator(widths)
        segments = widths.map { |width| "─" * (width + 2) }
        "├" + segments.join("┼") + "┤"
      end

      # Bottom edge of the table.
      def bottom_border(widths)
        segments = widths.map { |width| "─" * (width + 2) }
        "└" + segments.join("┴") + "┘"
      end

      # Renders one table row as an array of output lines. Every cell is
      # wrapped to its column width and shorter cells are padded with blank
      # lines so all cells in the row have the same height.
      def row_lines(row, widths)
        label_lines = wrap_cell(row[:label].to_s, widths.first)
        value_lines =
          columns.each_with_index.map do |column, position|
            wrap_cell_lines(row[:cells][column], widths[position + 1])
          end

        groups = [label_lines, *value_lines]
        height = groups.map(&:length).max
        equalized = groups.map { |lines| pad_lines(lines, height) }

        # transpose turns per-column line lists into per-line cell lists.
        equalized.transpose.map do |line_cells|
          rendered = line_cells.each_with_index.map { |text, position| padded(text, widths[position]) }
          "│ #{rendered.join(" │ ")} │"
        end
      end

      # Wraps plain cell content (used for the row label) to the given width.
      def wrap_cell(content, width)
        wrap_text(content, width)
      end

      # Returns the display lines for a result cell: a status badge followed by
      # the wrapped detail (if any). A missing cell renders as a single "—".
      def wrap_cell_lines(cell, width)
        return wrap_text("—", width) if cell.nil?

        badges = { pass: "[PASS]", fail: "[FAIL]", skipped: "[SKIP]", tie: "[TIE]" }
        badge = badges.fetch(cell[:status], "[N/A]")

        detail = cell[:detail]
        detail.present? ? [badge, *wrap_text(detail, width)] : [badge]
      end

      # Returns a copy of lines extended with empty strings up to target_size;
      # lists that are already long enough are returned unchanged in content.
      def pad_lines(lines, target_size)
        missing = [target_size - lines.length, 0].max
        lines + ([""] * missing)
      end

      # Greedily word-wraps text into lines of at most width characters.
      #
      # Whitespace runs are collapsed to single spaces first. A word longer
      # than width is hard-split into width-sized chunks so that no returned
      # line can overflow its table column (columns may be narrower than the
      # longest word once clamp_widths has shrunk them).
      #
      # Returns an Array of Strings; blank input yields [""].
      def wrap_text(text, width)
        sanitized = text.to_s.gsub(/\s+/, " ").strip
        return [""] if sanitized.empty?

        # Guard against zero/negative widths so slicing always makes progress.
        limit = [width, 1].max
        segments = []
        current = +""

        sanitized.split(" ").each do |word|
          chunks = word.length > limit ? word.chars.each_slice(limit).map(&:join) : [word]

          chunks.each do |chunk|
            if current.empty?
              current << chunk
            elsif current.length + 1 + chunk.length <= limit
              current << " " << chunk
            else
              segments << current
              current = chunk.dup
            end
          end
        end

        segments << current unless current.empty?
        segments
      end

      # Left-justifies content (converted to a String) to at least width chars.
      def padded(content, width)
        text = content.to_s
        text.ljust(width)
      end

      # Lines of a cell wrapped at SAMPLE_WIDTH; used to measure the natural
      # width of a column before clamping.
      def cell_lines(cell)
        wrap_cell_lines(cell, SAMPLE_WIDTH)
      end

      # Stringifies value and, when it is longer than max_length, cuts it so
      # that the result including the trailing "…" is max_length characters.
      def truncate(value, max_length)
        text = stringify(value)
        text.length > max_length ? "#{text[0...(max_length - 1)]}…" : text
      end

      # Converts a label (String, Symbol, nil, …) to a String for comparisons.
      def normalize_label(label)
        label.to_s
      end

      # Converts value to display text; Regexps are shown in /pattern/ form.
      def stringify(value)
        return value.inspect if value.is_a?(Regexp)

        value.to_s
      end

      # Pulls the raw output out of a raw entry: for a Hash the first truthy
      # value of :raw, :output or :result; any other value (including nil) is
      # returned as is.
      def extract_raw(raw_entry)
        return raw_entry unless raw_entry.is_a?(Hash)

        raw_entry[:raw] || raw_entry[:output] || raw_entry[:result]
      end

      # Recomputes the progress total as (row count × total targets).
      def refresh_total_units
        @total_units = @total_targets * rows.length
      end

      # Registers one column per candidate, named after its :display_label.
      def register_comparison_columns(candidates)
        candidates.each do |candidate|
          register_column(candidate[:display_label])
        end
      end

      # Returns "Summary: <column>: <pass>/<total> pass | …" with one entry per
      # column, or an empty string when there are no columns or rows.
      def summary_line
        return "" if columns.empty? || rows.empty?

        entries =
          columns.map do |column|
            passed, total = column_stats(column).values_at(:pass, :total)
            "#{column}: #{passed}/#{total} pass"
          end

        "Summary: #{entries.join(" | ")}"
      end

      # Explains the status badges used in the table.
      def legend_line
        "Legend: [PASS]=pass, [FAIL]=fail, [SKIP]=skipped, [TIE]=tie"
      end

      # Counts cell statuses in a column. Ties are counted as passes, rows
      # without a cell (or without a status) count as unknown, and :total is
      # always the number of rows.
      def column_stats(column)
        statuses = rows.map { |row| row[:cells][column]&.dig(:status) || :unknown }
        counts = statuses.tally
        counts.default = 0

        {
          pass: counts[:pass] + counts[:tie],
          fail: counts[:fail],
          skipped: counts[:skipped],
          total: rows.length,
        }
      end

      # Fits column widths into @max_table_width: caps every column at
      # SAMPLE_WIDTH, then repeatedly shaves one character off the widest
      # column until the table fits or no column is wider than MIN_CELL_WIDTH.
      # Widths are returned untouched when no maximum width is set.
      def clamp_widths(widths)
        return widths if @max_table_width.nil?

        fitted = widths.map { |width| [width, SAMPLE_WIDTH].min }

        loop do
          break if table_width(fitted) <= @max_table_width
          break if fitted.none? { |width| width > MIN_CELL_WIDTH }

          widest = fitted.each_with_index.max_by { |width, _position| width }.last
          fitted[widest] = [fitted[widest] - 1, MIN_CELL_WIDTH].max
        end

        fitted
      end

      # Total rendered width of a table with the given column widths
      # (0 for an empty column list).
      def table_width(widths)
        return 0 if widths.empty?

        separators = 3 * (widths.length - 1) # " │ " between adjacent columns
        outer_edges = 4 # "│ " on the left plus " │" on the right
        widths.sum + separators + outer_edges
      end

      # Determines the maximum table width from ENV["COLUMNS"], falling back to
      # DEFAULT_MAX_WIDTH when the variable is unset or not an integer.
      #
      # Returns nil (no width limit is applied) when the resulting width is
      # below 40 columns.
      def detect_max_width
        # exception: false makes Integer return nil for nil/invalid input
        # instead of raising, so no rescue is needed.
        width = Integer(ENV["COLUMNS"], exception: false) || DEFAULT_MAX_WIDTH
        width >= 40 ? width : nil
      end

      # True when winner is exactly "tie" or winner_label equals "tie"
      # ignoring case.
      def tie_result?(winner, winner_label)
        return true if winner.to_s == "tie"

        winner_label.to_s.casecmp("tie").zero?
      end

      # Suffix appended to row labels for comparisons: "tie" for ties,
      # otherwise the winner (or the winner label when winner is blank), or
      # nil when neither is present.
      def label_suffix_for_winner(winner, winner_label)
        return "tie" if tie_result?(winner, winner_label)

        winner.presence || winner_label.presence
      end
    end
  end
end