discourse/plugins/discourse-ai/lib/completions/dialects/converse.rb
Takao Yokoyama cea4780cb3
FIX: AI: AWS Bedrock Converse image uploads were doubly base64-encoded (#39880)
## Summary

Image uploads delivered through the `aws_bedrock_converse` LLM provider
were rejected by Bedrock with `Could not process image` whenever an
agent / LLM had `vision_enabled` set to true.

Two related bugs are fixed:

### 1. `Dialects::Converse#upload_node` — base64 string passed where raw bytes expected

In `plugins/discourse-ai/lib/completions/dialects/converse.rb`, image
content was emitted as:

```ruby
source: { bytes: details[:base64] }
```

`details[:base64]` is the upload's base64-encoded string (as produced by
`UploadEncoder`), but `Aws::BedrockRuntime::Client#converse` expects
**raw bytes** on the `:bytes` key — the SDK then base64-encodes them on
the wire. Passing the already-base64-encoded string causes Bedrock to
receive **doubly-encoded** data, which it cannot decode into a valid
image. Decoding back to raw bytes via
`Base64.decode64(details[:base64])` resolves the round-trip.

### 2. `AwsBedrockConverse#perform_completion!` — JSON-logging fails on binary payloads

With raw bytes now flowing through `sdk_params`, the subsequent
`sdk_params.to_json` call (used to record the request in `start_log`)
raises `EncodingError` because PNG/JPEG bytes are not valid UTF-8. The
call is wrapped in `begin / rescue EncodingError` so the request can
still proceed; a placeholder string is recorded in the audit log instead
of the binary payload.

## Test plan

- A new spec case in
`plugins/discourse-ai/spec/lib/completions/dialects/converse_spec.rb`
asserts that `details[:base64]` is decoded back to raw bytes before
being emitted as `source: { bytes: ... }`. This guards against
regression.
- Verified end-to-end against `us.anthropic.claude-sonnet-4-6` via
Bedrock Converse on `ap-northeast-1` → `us-east-1` cross-region
inference profile: with this patch the model correctly describes
uploaded PNG attachments (a Loupe Browser version warning dialog)
instead of returning `Could not process image`.

## Reproduction (before the fix)

1. Configure an `aws_bedrock_converse` LLM in Discourse and assign it to
an `AiAgent` with `vision_enabled: true`.
2. Wire up `llm_triage` (or any path that goes through
`Dialects::Converse#upload_node`) to reply to a topic that contains an
image upload.
3. Observe:
`DiscourseAi::Completions::Endpoints::Base::CompletionFailed: The model
returned the following errors: Could not process image`

## Discovered while

Standing up a Discourse instance with Bedrock-backed AI as part of an
internal forum spike. Happy to iterate on the patch (e.g. tighten the
log fallback or extract a helper) if reviewers prefer a different shape.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Rafael Silva <xfalcox@gmail.com>
2026-05-26 15:16:49 -03:00

203 lines
5.8 KiB
Ruby
Vendored

# frozen_string_literal: true
module DiscourseAi
module Completions
module Dialects
class Converse < Dialect
class << self
# Reports whether this dialect applies to the given model.
#
# @param llm_model [#provider] model record whose provider is inspected
# @return [Boolean] true only when the provider is "aws_bedrock_converse"
def can_translate?(llm_model)
  llm_model.provider == "aws_bedrock_converse"
end
end
# Plain container for the three parts of a translated prompt: the system
# value, the message list and the (optional) tool configuration.
class ConversePrompt
  attr_reader :system, :messages, :tool_config

  # @param system [Object, nil] system value, stored as given
  # @param messages [Array] message list, stored as given
  # @param tool_config [Object, nil] tool configuration; nil when omitted
  def initialize(system, messages, tool_config = nil)
    @system, @messages, @tool_config = system, messages, tool_config
  end

  # @return [String] `to_s` of the stored system value ("" when it is nil)
  def system_prompt
    system.to_s
  end

  # @return [Boolean] whether a non-blank tool configuration was supplied
  def has_tools?
    tool_config.present?
  end
end
# Builds the ConversePrompt for the current prompt.
#
# Starts from the message list returned by the parent implementation, removes
# a leading "system" message (its content becomes the system block), wraps the
# content of every remaining message with build_content, and pads the
# conversation so that "user" and "assistant" turns alternate.
#
# @return [ConversePrompt]
def translate
  translated = super
  system_text = translated.shift[:content] if translated.first&.dig(:role) == "system"

  converse_messages =
    translated.map { |msg| { role: msg[:role], content: build_content(msg) } }

  # Converse API requires alternating user/assistant roles, so whenever two
  # neighbouring messages share one of those roles an "OK" filler turn from
  # the opposite role is placed between them.
  interleaved =
    converse_messages.each_with_object([]) do |message, turns|
      # turns.last is always the previously appended real message, because a
      # filler is only ever pushed immediately before the next message.
      previous_role = turns.last&.dig(:role)
      if previous_role == message[:role]
        case previous_role
        when "user" then turns << { role: "assistant", content: [{ text: "OK" }] }
        when "assistant" then turns << { role: "user", content: [{ text: "OK" }] }
        end
      end
      turns << message
    end

  tool_config = tools_dialect.translated_tools
  system_blocks = system_text.presence && [{ text: system_text }]
  ConversePrompt.new(system_blocks, interleaved, tool_config)
end
# Prompt token budget, taken directly from the model record.
#
# @return whatever `llm_model.max_prompt_tokens` returns
def max_prompt_tokens
  llm_model.max_prompt_tokens
end
# This dialect always reports native tool support.
#
# @return [Boolean] always true
def native_tool_support?
  true
end
# Memoized ConverseTools helper built from the tools of the current prompt.
#
# @return [DiscourseAi::Completions::Dialects::ConverseTools]
def tools_dialect
  return @tools_dialect if @tools_dialect

  @tools_dialect = DiscourseAi::Completions::Dialects::ConverseTools.new(prompt.tools)
end
private
# Normalizes a message's :content into a fresh array of content blocks and
# appends every entry found under :images.
#
# An Array is copied element by element, a Hash becomes a single block, a
# String is wrapped as `{ text: ... }`, and any other value contributes
# nothing.
#
# @param msg [Hash] message with :content and, optionally, :images
# @return [Array] new array; the incoming content is not mutated
def build_content(msg)
  blocks =
    case (raw = msg[:content])
    when Array then [*raw]
    when Hash then [raw]
    when String then [{ text: raw }]
    else []
    end

  msg[:images]&.each { |img| blocks << img }
  blocks
end
# Maps an image MIME type to the short format name used in image blocks.
#
# @param mime_type [String, nil]
# @return [String] "png", "gif" or "webp" for those MIME types; "jpeg" for
#   "image/jpeg" and for every other value
def detect_format(mime_type)
  case mime_type
  when "image/png" then "png"
  when "image/gif" then "gif"
  when "image/webp" then "webp"
  else "jpeg" # covers "image/jpeg" as well as anything unrecognized
  end
end
# Translates a system message, passing its content through untouched.
#
# @param msg [Hash] message with :content
# @return [Hash] `{ role: "system", content: ... }`
def system_msg(msg)
  { role: "system", content: msg[:content] }
end
# Translates a user message into a "user" turn.
#
# The content is flattened into a list and handed to
# to_encoded_content_array: text parts become `{ text: ... }` blocks, uploads
# are rendered by upload_node, images are permitted only when vision_support?
# says so, documents are permitted, and converse_upload_allowed? is supplied as
# the upload filter.
#
# @param msg [Hash] message with :content
# @return [Hash] `{ role: "user", content: ... }`
def user_msg(msg)
  parts = [msg[:content]].flatten

  encode_upload = ->(upload) { upload_node(upload) }
  encode_text = ->(str) { { text: str } }
  keep_upload = ->(candidate) { converse_upload_allowed?(candidate) }

  encoded_parts =
    to_encoded_content_array(
      content: parts,
      upload_encoder: encode_upload,
      text_encoder: encode_text,
      allow_images: vision_support?,
      allow_documents: true,
      allowed_attachment_types: llm_model.allowed_attachment_types,
      upload_filter: keep_upload,
    )

  { role: "user", content: encoded_parts }
end
# Upload filter: images always pass; anything else must be accepted by
# document_allowed? and carry non-blank :text.
#
# @param encoded [Hash] encoded upload with :kind and, for documents, :text
# @return [Boolean]
def converse_upload_allowed?(encoded)
  encoded[:kind] == :image || (document_allowed?(encoded) && encoded[:text].present?)
end
# Renders one encoded upload as a content block.
#
# Uploads carrying non-blank :text become a text block. Otherwise only
# uploads of kind :image are rendered; every other kind yields nil.
#
# @param details [Hash] encoded upload (:text, :kind, :format, :mime_type,
#   :base64)
# @return [Hash, nil]
def upload_node(details)
  text = details[:text]
  return { text: text } if text.present?
  return unless details[:kind] == :image

  image_format = details[:format] || detect_format(details[:mime_type])

  # AWS SDK for Ruby expects raw bytes here and will base64-encode them
  # on the wire. Passing the already-base64-encoded string causes Bedrock
  # to receive doubly-encoded data and respond with
  # "Could not process image".
  raw_bytes = Base64.decode64(details[:base64])

  { image: { format: image_format, source: { bytes: raw_bytes } } }
end
# Translates an assistant message into an "assistant" turn.
#
# Reasoning blocks come first: a reasoning_text block when the message has
# :thinking and the provider info has a :signature, then a redacted_content
# block when the provider info has :redacted_content. The message content
# follows: a String is wrapped as a text block, an Array is appended as-is,
# and any other value is ignored.
#
# NOTE(review): :signature and :redacted_content are looked up with symbol
# keys only, whereas converse_reasoning accepts a string "bedrock_converse"
# key — confirm the nested hash is always symbol-keyed (or indifferent).
#
# @param msg [Hash] message with :content, :thinking, :thinking_provider_info
# @return [Hash] `{ role: "assistant", content: [...] }`
def model_msg(msg)
  blocks = []
  reasoning = converse_reasoning(msg)

  if reasoning.present?
    thinking = msg[:thinking]
    signature = thinking && reasoning[:signature]
    if signature
      blocks << { reasoning_content: { reasoning_text: { text: thinking, signature: signature } } }
    end

    redacted = reasoning[:redacted_content]
    blocks << { reasoning_content: { redacted_content: redacted } } if redacted
  end

  case (body = msg[:content])
  when String then blocks << { text: body }
  when Array then blocks.concat(body)
  end

  { role: "assistant", content: blocks }
end
# Extracts the Bedrock Converse entry from a message's
# :thinking_provider_info, accepting either a symbol or a string key.
#
# @param message [Hash]
# @return [Object, nil] nil when the provider info is blank
def converse_reasoning(message)
  provider_info = message[:thinking_provider_info]
  return nil if provider_info.blank?

  by_symbol = provider_info[:bedrock_converse]
  by_symbol || provider_info["bedrock_converse"]
end
# Translates a tool result message; the content is produced by
# tools_dialect.from_raw_tool and sent under the "user" role.
#
# @param msg [Hash] raw tool message
# @return [Hash] `{ role: "user", content: ... }`
def tool_msg(msg)
  { role: "user", content: tools_dialect.from_raw_tool(msg) }
end
# Translates a tool call message; the content is produced by
# tools_dialect.from_raw_tool_call and sent under the "assistant" role.
#
# @param msg [Hash] raw tool call message
# @return [Hash] `{ role: "assistant", content: ... }`
def tool_call_msg(msg)
  { role: "assistant", content: tools_dialect.from_raw_tool_call(msg) }
end
end
end
end
end