discourse/plugins/discourse-ai/lib/completions/rtf_to_text.rb
Sam fa54f62348
FEATURE: extract text from document uploads for LLM prompts (#39634)
Document attachments (doc, docx, xls, xlsx, rtf, csv, md, txt) are now
converted to text before being included in LLM prompts, instead of
being forwarded as raw base64 payloads. PDFs remain the only format
sent as a raw upload, capped at 10MB.

New converters under lib/completions:

- DocToText shells out to antiword
- DocxToText parses OOXML directly with size and depth limits
- XlsToText shells out to xls2csv
- XlsxToText parses OOXML and shared strings into CSV-style text
- RtfToText is a custom RTF tokenizer with destination/group handling

Plain text formats (csv, md, txt) are read with a 1MB byte cap and
UTF-8 normalization. Extracted text is truncated to 100k characters,
with a preamble noting the original filename and size.

Dialect trimming now uses token-aware truncation against a per-message
budget so large extracted documents collapse cleanly under the prompt
limit, rather than the previous step-based slicing of raw content.

Other changes:

- LlmModel.normalize_attachment_types is shared with UploadEncoder and
  collapses "markdown" to "md" so the canonical extension is consistent
  across model config, UI defaults, and encoder output
- ai-llm-attachment-types adds csv, xls, xlsx to the default choices
- Locale strings clarify that vision controls images and
  allowed_attachment_types controls documents

---------

Co-authored-by: Rafael Silva <xfalcox@gmail.com>
2026-05-05 08:16:23 +10:00

404 lines
9.4 KiB
Ruby
Vendored
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# frozen_string_literal: true
module DiscourseAi
module Completions
class RtfToText
MAX_INPUT_BYTES = 2 * 1024 * 1024
MAX_EXTRACTED_TEXT_CHARS = 100_001
MAX_GROUP_DEPTH = 100
MAX_CONTROL_WORD_CHARS = 64
GroupState = Struct.new(:skip, :uc, keyword_init: true)
DESTINATION_CONTROL_WORDS = %w[
annotation
atnauthor
atntime
author
buptim
category
colorschememapping
colortbl
comment
company
creatim
datafield
datastore
doccomm
docvar
factoidname
falt
filetbl
fldinst
fonttbl
footer
footerf
footerl
footerr
footnote
formfield
generator
header
headerf
headerl
headerr
info
keywords
latentstyles
listoverridetable
listtable
manager
nextfile
nonshppict
object
objdata
operator
pict
pn
pnseclvl
private
protusertbl
revtbl
rsidtbl
shp
shpgrp
shpinst
stylesheet
subject
template
title
txe
userprops
xmlnstbl
].freeze
CONTROL_TEXT = {
"bullet" => "",
"cell" => "\t",
"column" => "\t",
"emdash" => "",
"emspace" => " ",
"endash" => "",
"enspace" => " ",
"ldblquote" => "",
"line" => "\n",
"lquote" => "",
"page" => "\n",
"par" => "\n",
"qmspace" => " ",
"rdblquote" => "",
"row" => "\n",
"rquote" => "",
"sect" => "\n",
"tab" => "\t",
}.freeze
ENCODING_BY_CODEPAGE = {
437 => "IBM437",
850 => "IBM850",
852 => "IBM852",
855 => "IBM855",
857 => "IBM857",
860 => "IBM860",
861 => "IBM861",
862 => "IBM862",
863 => "IBM863",
864 => "IBM864",
865 => "IBM865",
866 => "IBM866",
869 => "IBM869",
874 => "Windows-874",
932 => "Windows-31J",
936 => "GBK",
949 => "Windows-949",
950 => "Big5",
1250 => "Windows-1250",
1251 => "Windows-1251",
1252 => "Windows-1252",
1253 => "Windows-1253",
1254 => "Windows-1254",
1255 => "Windows-1255",
1256 => "Windows-1256",
1257 => "Windows-1257",
1258 => "Windows-1258",
65_001 => "UTF-8",
20_127 => "US-ASCII",
20_850 => "IBM850",
28_591 => "ISO-8859-1",
}.freeze
class ParseLimitError < StandardError
end
def self.convert(path)
new(path).convert
end
def initialize(path)
@path = path
end
def convert
@input = read_input
@index = 0
@output = +""
@group_stack = [GroupState.new(skip: false, uc: 1)]
@source_encoding = find_encoding("Windows-1252")
@fallback_chars_to_skip = 0
parse
normalize_document_text(@output)
end
private
attr_reader :path
def read_input
input = File.binread(path, MAX_INPUT_BYTES + 1).to_s
input = input.byteslice(0, MAX_INPUT_BYTES) if input.bytesize > MAX_INPUT_BYTES
input.force_encoding(Encoding::BINARY)
end
def parse
while @index < @input.bytesize && @output.length <= MAX_EXTRACTED_TEXT_CHARS
if @fallback_chars_to_skip.positive?
skip_fallback_char
@fallback_chars_to_skip -= 1
next
end
byte = current_byte
case byte
when 123 # {
push_group
when 125 # }
pop_group
when 92 # \
parse_control
when 10, 13
@index += 1
else
append_encoded_byte(byte) if !current_group.skip
@index += 1
end
end
end
def push_group
if @group_stack.length >= MAX_GROUP_DEPTH
raise ParseLimitError, "RTF group nesting is too deep"
end
@group_stack << GroupState.new(skip: current_group.skip, uc: current_group.uc)
@index += 1
end
def pop_group
@group_stack.pop if @group_stack.length > 1
@index += 1
end
def current_group
@group_stack.last
end
def current_byte
@input.getbyte(@index)
end
def parse_control
@index += 1
return if @index >= @input.bytesize
byte = current_byte
if letter?(byte)
parse_control_word
elsif byte == 39 # '
parse_hex_escape
else
parse_control_symbol(byte)
end
end
def parse_control_word
word = read_control_word
param = read_control_parameter
skip_control_space
handle_control_word(word, param)
end
def read_control_word
stored_length = 0
word = +""
while @index < @input.bytesize && letter?(current_byte)
word << current_byte.chr if stored_length < MAX_CONTROL_WORD_CHARS
stored_length += 1
@index += 1
end
word.downcase
end
def read_control_parameter
sign = 1
if current_byte == 45 # -
sign = -1
@index += 1
end
start = @index
@index += 1 while @index < @input.bytesize && digit?(current_byte)
return if start == @index
@input.byteslice(start, @index - start).to_i * sign
end
def skip_control_space
@index += 1 if current_byte == 32
end
def handle_control_word(word, param)
if DESTINATION_CONTROL_WORDS.include?(word)
current_group.skip = true
return
end
case word
when "ansi"
@source_encoding = find_encoding("Windows-1252")
when "mac"
@source_encoding = find_encoding("MacRoman")
when "pc"
@source_encoding = find_encoding("IBM437")
when "pca"
@source_encoding = find_encoding("IBM850")
when "ansicpg"
@source_encoding = encoding_for_codepage(param) if param
when "uc"
current_group.uc = param.clamp(0, 10) if param
when "bin"
@index += [param.to_i, @input.bytesize - @index].min if param.to_i.positive?
end
return if current_group.skip
if word == "u"
append_unicode(param)
@fallback_chars_to_skip = current_group.uc.to_i
elsif (text = CONTROL_TEXT[word])
append_text(text)
end
end
def parse_hex_escape
@index += 1
hex = @input.byteslice(@index, 2)
if hex&.match?(/\A[0-9a-fA-F]{2}\z/)
append_encoded_byte(hex.to_i(16)) if !current_group.skip
@index += 2
end
end
def parse_control_symbol(byte)
@index += 1
case byte
when 42 # *
current_group.skip = true
when 45,
95 # -, _
append_text("-") if !current_group.skip
when 92,
123,
125 # \, {, }
append_text(byte.chr) if !current_group.skip
when 126 # ~
append_text(" ") if !current_group.skip
end
end
def skip_fallback_char
return if @index >= @input.bytesize
if current_byte == 92 # \
@index += 1
return if @index >= @input.bytesize
if current_byte == 39 # '
@index += 1
@index += 2 if @input.byteslice(@index, 2)&.match?(/\A[0-9a-fA-F]{2}\z/)
elsif letter?(current_byte)
read_control_word
read_control_parameter
skip_control_space
else
@index += 1
end
else
@index += 1
end
end
def append_unicode(param)
return if param.nil?
codepoint = param.negative? ? param + 65_536 : param
append_text([codepoint].pack("U")) if codepoint.between?(0, 0x10FFFF)
rescue RangeError
nil
end
def append_encoded_byte(byte)
return if byte.nil? || byte == 0
if byte < 128
append_text(byte.chr)
else
text = byte.chr.b.force_encoding(@source_encoding || Encoding::Windows_1252)
append_text(text.encode("UTF-8", invalid: :replace, undef: :replace, replace: ""))
end
end
def append_text(text)
return if text.nil? || text.empty? || @output.length > MAX_EXTRACTED_TEXT_CHARS
@output << text
end
def encoding_for_codepage(codepage)
find_encoding(ENCODING_BY_CODEPAGE[codepage.to_i]) || @source_encoding
end
def find_encoding(name)
Encoding.find(name)
rescue ArgumentError
nil
end
def letter?(byte)
(byte >= 65 && byte <= 90) || (byte >= 97 && byte <= 122)
end
def digit?(byte)
byte >= 48 && byte <= 57
end
def normalize_document_text(text)
text
.to_s
.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
.gsub("\u00A0", " ")
.gsub(/\r\n?/, "\n")
.gsub(/[ \t]+\n/, "\n")
.gsub(/\n[ \t]+/, "\n")
.gsub(/\n{3,}/, "\n\n")
.strip
end
end
end
end