discourse/plugins/discourse-ai/lib/utils/image_to_text.rb
Sam b8abe100c5
FEATURE: add agentic execution mode for AI personas (#38230)
Introduce an "agentic" execution mode as an alternative to the
default fixed-turn/tool-limit approach. In agentic mode, personas
use a configurable token budget (`max_turn_tokens`) to govern how
long a tool-use session can run, with automatic context compression
when the conversation exceeds a configurable threshold percentage
(`compression_threshold`) of the model's context window.

Key changes:

- Add `execution_mode`, `max_turn_tokens`, and `compression_threshold`
  columns to `ai_personas` via migration
- Refactor `Bot#reply` to support token-budget loop control with a
  thread-local token accumulator, budget exhaustion hints, and a
  safety valve at 100 completions
- Add `maybe_compress_context` which summarizes middle conversation
  messages when token usage crosses the compression threshold,
  preserving system prompt and recent tail messages
- Update `StreamReplyCustomToolsSession` to track accumulated tokens
  across rounds and handle budget exhaustion in the custom tools path
- Discount cached tokens (Anthropic) in the token accumulator to
  avoid over-counting reused KV cache prefixes
- Update persona editor UI with execution mode selector and
  conditional fields (agentic shows token budget/compression;
  default shows max context posts)
2026-03-05 15:06:54 +11:00

239 lines
6.7 KiB
Ruby
Vendored

# frozen_string_literal: true
class DiscourseAi::Utils::ImageToText
# Seconds to wait after each failed extraction attempt, consumed in order.
# Frozen so the shared schedule cannot be mutated by accident; take a copy
# with `dup` before shifting values off it.
BACKOFF_SECONDS = [5, 30, 60].freeze
# Size limit handed to Discourse.store.download when an upload is not stored locally.
MAX_IMAGE_SIZE = 10.megabytes
# Minimal IO-like object: exposes `read(length)` over the text extracted
# from a list of uploads, extracting lazily one upload at a time.
class Reader
  def initialize(uploads:, llm_model:, user:, execution_context: nil)
    @uploads = uploads
    @llm_model = llm_model
    @user = user
    @execution_context = execution_context
    @buffer = +""
    # work on a copy so the caller's collection is not consumed
    @to_process = uploads.dup
  end

  # Returns up to `length` characters of extracted text, or nil once every
  # upload has been processed and the buffer is drained.
  def read(length)
    loop do
      # serve buffered text first; uploads are only processed when it runs dry
      return @buffer.slice!(0, length) unless @buffer.empty?
      return nil if @to_process.empty?

      fill_buffer_from(@to_process.shift)
    end
  end

  private

  # Runs extraction for a single upload and appends its chunks to the buffer.
  # Failures are reported through Discourse.warn_exception and add nothing.
  def fill_buffer_from(upload)
    extractor =
      DiscourseAi::Utils::ImageToText.new(
        upload: upload,
        llm_model: @llm_model,
        user: @user,
        execution_context: @execution_context,
      )

    extractor.extract_text do |chunk, error|
      if error
        Discourse.warn_exception(
          error,
          message: "Discourse AI: Failed to extract text from image",
        )
        next
      end

      # each chunk is prefixed with a marker so discourse rag ingestion
      # needs no overlap between chunks
      @buffer << "\n[[metadata ]]\n" << chunk
    end
  end
end
# The text-extraction pipeline expects a file, so hand back a small object
# that simulates `read(size)` and streams the extracted content.
def self.as_fake_file(uploads:, llm_model:, user:, execution_context: nil)
  reader_options = {
    uploads: uploads,
    llm_model: llm_model,
    user: user,
    execution_context: execution_context,
  }
  Reader.new(**reader_options)
end
# Whether a `tesseract` binary can be located with `which`.
# The answer is memoized, including a negative one, so the command is
# executed at most once.
def self.tesseract_installed?
  # defined? (rather than ||=) so a cached `false` is honoured too
  return @tesseract_installed if defined?(@tesseract_installed)

  found = true
  begin
    Discourse::Utils.execute_command("which", "tesseract")
  rescue Discourse::Utils::CommandError
    found = false
  end
  @tesseract_installed = found
end
# Public readers for the values captured in initialize. @guidance_text is
# also assigned there but has no reader.
attr_reader :upload, :llm_model, :user, :execution_context
# @param upload the upload to extract text from
# @param llm_model model record; `to_llm` is called on it during extraction
# @param user stored and exposed through the `user` reader
# @param guidance_text optional text used in place of a Tesseract pass
# @param execution_context optional context forwarded to the LLM call
def initialize(upload:, llm_model:, user:, guidance_text: nil, execution_context: nil)
  @upload, @llm_model, @user = upload, llm_model, user
  @guidance_text, @execution_context = guidance_text, execution_context
end
# Extracts text chunks from the upload, retrying failed attempts with a backoff.
#
# Yields each extracted chunk as `(chunk)`. When every attempt fails, yields
# `(nil, error)` once, where `error` is the last exception raised.
#
# @param retries [Integer] maximum number of extraction attempts
# @return [Array] the extracted chunks, or an empty array when extraction failed
# @raise [RuntimeError] when called without a block
def extract_text(retries: 3)
  raise "must specify a block" if !block_given?

  extracted = nil
  error = nil
  backoff = BACKOFF_SECONDS.dup
  # Kept outside the loop so the last delay is reused once the schedule is
  # exhausted (retries larger than the number of backoff entries).
  seconds = nil

  retries.times do |attempt|
    begin
      extracted = extract_text_from_page(upload)
      break
    rescue => e
      error = e
      # Nothing left to retry: report the failure right away instead of
      # sleeping one more time.
      break if attempt == retries - 1

      seconds = backoff.shift || seconds
      sleep(seconds) if seconds
    end
  end

  if extracted
    extracted.each { |chunk| yield(chunk) }
  else
    yield(nil, error)
  end

  extracted || []
end
private
# System prompt instructing the model to OCR a page into Markdown and wrap
# each semantic section in <chunk> tags (parsed later by extract_chunks).
def system_message
  instructions = [
    "OCR the following page into Markdown. Tables should be formatted as Github flavored markdown.",
    "Do not surround your output with triple backticks.",
    "Chunk the document into sections of roughly 250 - 1000 words. Our goal is to identify parts of the page with same semantic theme. These chunks will be embedded and used in a RAG pipeline.",
    "Always prefer returning text in Markdown vs HTML.",
    "Describe all the images and graphs you encounter.",
    'Only return text that will assist in the querying of data. Omit text such as "I had trouble recognizing images" and so on.',
    "Surround the chunks with <chunk> </chunk> html tags.",
  ]

  # one instruction per line, each newline-terminated
  instructions.map { |line| "#{line}\n" }.join.freeze
end
# Asks the LLM to OCR `page` and returns the parsed chunks.
#
# When guidance text was supplied, or Tesseract is installed and produced
# text, that text is sent along with the image so the model can correct and
# structure it. Otherwise the model is asked to OCR the image on its own.
# Note the LLM call is made as Discourse.system_user, not as `user`.
def extract_text_from_page(page)
  ocr_hint = @guidance_text
  ocr_hint ||= extract_text_with_tesseract(page) if self.class.tesseract_installed?

  llm = llm_model.to_llm

  instruction =
    if ocr_hint.present?
      "The following text was extracted from an image using OCR. Please enhance, correct, and structure this content while maintaining the original text:\n\n#{ocr_hint}"
    else
      "Please OCR the content in the image."
    end

  # the image itself always accompanies the instruction
  messages = [{ type: :user, content: [instruction, { upload_id: page.id }] }]

  prompt = DiscourseAi::Completions::Prompt.new(system_message, messages: messages)
  response = llm.generate(prompt, user: Discourse.system_user, execution_context:)
  extract_chunks(response)
end
# Runs the `tesseract` CLI over the upload and returns the recognised text.
#
# Best effort: any error is logged and reported as an empty string.
#
# @param page the upload to OCR; must respond to `local?`
# @return [String, nil] stripped OCR text, "" when the image is unavailable or
#   OCR fails, nil when the tesseract binary is not installed
def extract_text_with_tesseract(page)
  # return nil if we can not find the tesseract binary
  return nil if !self.class.tesseract_installed?

  upload_path =
    if page.local?
      Discourse.store.path_for(page)
    else
      Discourse.store.download(page, max_file_size_kb: MAX_IMAGE_SIZE)
    end

  return "" if !upload_path || !File.exist?(upload_path)

  # The Tempfile only reserves a unique name: tesseract creates the output
  # file itself. close! releases the handle and removes the placeholder.
  tmp_output_file = Tempfile.new(%w[tesseract_output .txt])
  tmp_output = tmp_output_file.path
  tmp_output_file.close!

  success =
    Discourse::Utils.execute_command(
      "tesseract",
      upload_path,
      tmp_output.delete_suffix(".txt"), # Tesseract adds .txt automatically
      timeout: 20.seconds,
      failure_message: "Failed to OCR image with Tesseract",
    )

  if success && File.exist?(tmp_output)
    File.read(tmp_output).strip
  else
    Rails.logger.error("Tesseract OCR failed for #{upload_path}")
    ""
  end
rescue => e
  Rails.logger.error("Error during OCR processing: #{e.message}")
  ""
ensure
  # Remove the output file on every path, including when the command raised
  # after writing it. tmp_output is nil when we returned before reserving a name.
  begin
    File.delete(tmp_output) if tmp_output && File.exist?(tmp_output)
  rescue StandardError
    nil
  end
end
# Splits an LLM response into the sections wrapped in <chunk> ... </chunk>.
#
# Text found outside the tags is kept as its own chunk, and a chunk that is
# never closed (e.g. a truncated response) runs to the end of the text.
# Every chunk is stripped and blank ones are dropped.
#
# @param text [String, nil] raw LLM response
# @return [Array<String>] the chunks; `[text]` when the response has no
#   opening tag, `[]` for nil or empty input
def extract_chunks(text)
  return [] if text.nil? || text.empty?
  # Without an opening tag there is nothing to parse.
  return [text] if !text.include?("<chunk>")

  chunks = []
  remaining = text

  until remaining.empty?
    # Everything up to the next opening tag is free text between chunks.
    before, open_tag, after = remaining.partition("<chunk>")
    chunks << before.strip
    break if open_tag.empty?

    # When the closing tag is missing, partition hands back the whole tail
    # as the body and an empty remainder, which ends the loop.
    body, _close_tag, remaining = after.partition("</chunk>")
    chunks << body.strip
  end

  chunks.reject(&:empty?)
end
end