mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-06-19 03:23:50 +08:00
Document attachments (doc, docx, xls, xlsx, rtf, csv, md, txt) are now converted to text before being included in LLM prompts, instead of being forwarded as raw base64 payloads. PDFs remain the only format sent as a raw upload, capped at 10MB.

New converters under lib/completions:
- DocToText shells out to antiword
- DocxToText parses OOXML directly with size and depth limits
- XlsToText shells out to xls2csv
- XlsxToText parses OOXML and shared strings into CSV-style text
- RtfToText is a custom RTF tokenizer with destination/group handling

Plain text formats (csv, md, txt) are read with a 1MB byte cap and UTF-8 normalization. Extracted text is truncated to 100k characters, with a preamble noting the original filename and size.

Dialect trimming now uses token-aware truncation against a per-message budget so large extracted documents collapse cleanly under the prompt limit, rather than the previous step-based slicing of raw content.

Other changes:
- LlmModel.normalize_attachment_types is shared with UploadEncoder and collapses "markdown" to "md" so the canonical extension is consistent across model config, UI defaults, and encoder output
- ai-llm-attachment-types adds csv, xls, xlsx to the default choices
- Locale strings clarify that vision controls images and allowed_attachment_types controls documents

---------

Co-authored-by: Rafael Silva <xfalcox@gmail.com>
404 lines
9.4 KiB
Ruby
Vendored
404 lines
9.4 KiB
Ruby
Vendored
# frozen_string_literal: true
|
||
|
||
module DiscourseAi
  module Completions
    # Extracts plain text from an RTF file using a small byte-level tokenizer.
    #
    # The parser walks the input once, tracking a stack of group states
    # (`{` ... `}`), skipping the contents of known destinations (font tables,
    # pictures, headers, ...) and of any `\*` group, translating a handful of
    # control words into text (see CONTROL_TEXT), and decoding `\'hh` escapes
    # and raw high bytes with the document code page (`\ansi`, `\ansicpgN`, ...).
    #
    # At most MAX_INPUT_BYTES of the file are read, and parsing stops once more
    # than MAX_EXTRACTED_TEXT_CHARS characters have been produced.
    #
    # @example
    #   DiscourseAi::Completions::RtfToText.convert("/tmp/upload.rtf") # => "Hello\nWorld"
    class RtfToText
      # Only this many leading bytes of the file are parsed.
      MAX_INPUT_BYTES = 2 * 1024 * 1024
      # Parsing stops as soon as the output holds more characters than this.
      MAX_EXTRACTED_TEXT_CHARS = 100_001
      # Opening more nested groups than this raises ParseLimitError.
      MAX_GROUP_DEPTH = 100
      # Control words longer than this are consumed but stored truncated.
      MAX_CONTROL_WORD_CHARS = 64

      # Per-group parser state: whether text is suppressed, and the `\ucN`
      # count of fallback characters that follow each `\uN`.
      GroupState = Struct.new(:skip, :uc, keyword_init: true)

      # Control words that turn the enclosing group into a skipped destination.
      DESTINATION_CONTROL_WORDS = %w[
        annotation
        atnauthor
        atntime
        author
        buptim
        category
        colorschememapping
        colortbl
        comment
        company
        creatim
        datafield
        datastore
        doccomm
        docvar
        factoidname
        falt
        filetbl
        fldinst
        fonttbl
        footer
        footerf
        footerl
        footerr
        footnote
        formfield
        generator
        header
        headerf
        headerl
        headerr
        info
        keywords
        latentstyles
        listoverridetable
        listtable
        manager
        nextfile
        nonshppict
        object
        objdata
        operator
        pict
        pn
        pnseclvl
        private
        protusertbl
        revtbl
        rsidtbl
        shp
        shpgrp
        shpinst
        stylesheet
        subject
        template
        title
        txe
        userprops
        xmlnstbl
      ].freeze

      # Control words that are replaced by literal text.
      CONTROL_TEXT = {
        "bullet" => "•",
        "cell" => "\t",
        "column" => "\t",
        "emdash" => "—",
        "emspace" => " ",
        "endash" => "–",
        "enspace" => " ",
        "ldblquote" => "“",
        "line" => "\n",
        "lquote" => "‘",
        "page" => "\n",
        "par" => "\n",
        "qmspace" => " ",
        "rdblquote" => "”",
        "row" => "\n",
        "rquote" => "’",
        "sect" => "\n",
        "tab" => "\t",
      }.freeze

      # `\ansicpgN` code page number => Ruby encoding name.
      ENCODING_BY_CODEPAGE = {
        437 => "IBM437",
        850 => "IBM850",
        852 => "IBM852",
        855 => "IBM855",
        857 => "IBM857",
        860 => "IBM860",
        861 => "IBM861",
        862 => "IBM862",
        863 => "IBM863",
        864 => "IBM864",
        865 => "IBM865",
        866 => "IBM866",
        869 => "IBM869",
        874 => "Windows-874",
        932 => "Windows-31J",
        936 => "GBK",
        949 => "Windows-949",
        950 => "Big5",
        1250 => "Windows-1250",
        1251 => "Windows-1251",
        1252 => "Windows-1252",
        1253 => "Windows-1253",
        1254 => "Windows-1254",
        1255 => "Windows-1255",
        1256 => "Windows-1256",
        1257 => "Windows-1257",
        1258 => "Windows-1258",
        65_001 => "UTF-8",
        20_127 => "US-ASCII",
        20_850 => "IBM850",
        28_591 => "ISO-8859-1",
      }.freeze

      # Hash view of DESTINATION_CONTROL_WORDS so the per-control-word
      # membership test does not scan the array.
      DESTINATION_LOOKUP = DESTINATION_CONTROL_WORDS.to_h { |word| [word, true] }.freeze
      # Two hex digits, as used by `\'hh`.
      HEX_PAIR = /\A[0-9a-fA-F]{2}\z/
      # UTF-16 surrogate ranges; `\uN` is 16-bit so non-BMP characters arrive
      # as a high half followed by a low half.
      HIGH_SURROGATES = (0xD800..0xDBFF)
      LOW_SURROGATES = (0xDC00..0xDFFF)
      # Longest incomplete multibyte sequence we are willing to buffer.
      MAX_PENDING_BYTES = 4
      private_constant :DESTINATION_LOOKUP,
                       :HEX_PAIR,
                       :HIGH_SURROGATES,
                       :LOW_SURROGATES,
                       :MAX_PENDING_BYTES

      # Raised when the document nests groups deeper than MAX_GROUP_DEPTH.
      class ParseLimitError < StandardError
      end

      # Convenience wrapper around `new(path).convert`.
      #
      # @param path [String] path of the RTF file
      # @return [String] extracted, whitespace-normalized UTF-8 text
      # @raise [ParseLimitError] when groups are nested too deeply
      def self.convert(path)
        new(path).convert
      end

      # @param path [String] path of the RTF file
      def initialize(path)
        @path = path
      end

      # Reads the file, tokenizes it and returns the extracted text.
      # All parser state is reset on every call.
      #
      # @return [String] extracted, whitespace-normalized UTF-8 text
      # @raise [ParseLimitError] when groups are nested too deeply
      def convert
        @input = read_input
        @index = 0
        @output = String.new(encoding: Encoding::UTF_8)
        # Character count of @output, kept incrementally so the limit checks do
        # not rescan a non-ASCII buffer on every byte.
        @output_chars = 0
        @group_stack = [GroupState.new(skip: false, uc: 1)]
        @source_encoding = find_encoding("Windows-1252")
        @fallback_chars_to_skip = 0
        @pending_bytes = []
        @pending_high_surrogate = nil

        parse
        normalize_document_text(@output)
      end

      private

      attr_reader :path

      # Reads at most MAX_INPUT_BYTES as a mutable binary string.
      # `File.binread` with a length returns nil for an empty file, and
      # `nil.to_s` is frozen, so `String#b` is used to obtain a fresh copy.
      def read_input
        File.binread(path, MAX_INPUT_BYTES).to_s.b
      end

      # Main loop: dispatches on the current byte until the input is exhausted
      # or the output limit is exceeded.
      def parse
        while @index < @input.bytesize && @output_chars <= MAX_EXTRACTED_TEXT_CHARS
          byte = current_byte

          if @fallback_chars_to_skip.positive?
            if byte == 123 || byte == 125 # { or }
              # A group boundary ends fallback skipping; the brace itself must
              # still be processed so the group stack stays balanced.
              @fallback_chars_to_skip = 0
            elsif byte == 10 || byte == 13
              # Raw line breaks are not text, so they do not count as fallback.
              @index += 1
              next
            else
              skip_fallback_char
              @fallback_chars_to_skip -= 1
              next
            end
          end

          case byte
          when 123 # {
            push_group
          when 125 # }
            pop_group
          when 92 # \
            parse_control
          when 10, 13
            @index += 1
          else
            append_encoded_byte(byte) if !current_group.skip
            @index += 1
          end
        end
      end

      # Opens a group that inherits `skip` and `uc` from its parent.
      def push_group
        if @group_stack.length >= MAX_GROUP_DEPTH
          raise ParseLimitError, "RTF group nesting is too deep"
        end

        @group_stack << GroupState.new(skip: current_group.skip, uc: current_group.uc)
        @index += 1
      end

      # Closes the current group; the root state is never popped, so stray
      # closing braces are tolerated.
      def pop_group
        @group_stack.pop if @group_stack.length > 1
        @index += 1
      end

      def current_group
        @group_stack.last
      end

      # @return [Integer, nil] byte at the cursor, nil past the end of input
      def current_byte
        @input.getbyte(@index)
      end

      # Handles whatever follows a backslash: control word, `\'hh` or symbol.
      def parse_control
        @index += 1
        return if @index >= @input.bytesize

        byte = current_byte

        if letter?(byte)
          parse_control_word
        elsif byte == 39 # '
          parse_hex_escape
        else
          parse_control_symbol(byte)
        end
      end

      def parse_control_word
        word = read_control_word
        param = read_control_parameter
        skip_control_space

        handle_control_word(word, param)
      end

      # Consumes a run of ASCII letters and returns it lower-cased, keeping at
      # most MAX_CONTROL_WORD_CHARS of them.
      def read_control_word
        stored_length = 0
        word = +""

        while @index < @input.bytesize && letter?(current_byte)
          word << current_byte.chr if stored_length < MAX_CONTROL_WORD_CHARS
          stored_length += 1
          @index += 1
        end

        word.downcase
      end

      # Reads an optional signed decimal parameter.
      #
      # A `-` only counts as a sign when a digit follows it; otherwise it is
      # left in place so it is emitted as text instead of being dropped.
      #
      # @return [Integer, nil] the parameter, or nil when there is none
      def read_control_parameter
        sign = 1
        if current_byte == 45 && digit?(@input.getbyte(@index + 1)) # -
          sign = -1
          @index += 1
        end

        start = @index
        @index += 1 while @index < @input.bytesize && digit?(current_byte)
        return if start == @index

        @input.byteslice(start, @index - start).to_i * sign
      end

      # A single space after a control word is its delimiter, not text.
      def skip_control_space
        @index += 1 if current_byte == 32
      end

      # Applies one control word: destinations mark the group as skipped,
      # charset words select the source encoding, `\uc`/`\bin` adjust parser
      # state, and `\u` / CONTROL_TEXT words emit text in non-skipped groups.
      def handle_control_word(word, param)
        if DESTINATION_LOOKUP.key?(word)
          current_group.skip = true
          return
        end

        case word
        when "ansi"
          @source_encoding = find_encoding("Windows-1252")
        when "mac"
          @source_encoding = find_encoding("MacRoman")
        when "pc"
          @source_encoding = find_encoding("IBM437")
        when "pca"
          @source_encoding = find_encoding("IBM850")
        when "ansicpg"
          @source_encoding = encoding_for_codepage(param) if param
        when "uc"
          current_group.uc = param.clamp(0, 10) if param
        when "bin"
          skip_binary_payload(param)
        end

        return if current_group.skip

        if word == "u"
          append_unicode(param)
          @fallback_chars_to_skip = current_group.uc.to_i
        elsif (text = CONTROL_TEXT[word])
          append_text(text)
        end
      end

      # Jumps over the raw bytes announced by `\binN`, clamped to the input.
      def skip_binary_payload(param)
        count = param.to_i
        @index += [count, @input.bytesize - @index].min if count.positive?
      end

      # Decodes `\'hh`. When the two characters are not hex digits only the
      # apostrophe is consumed and the rest is parsed as ordinary input.
      def parse_hex_escape
        @index += 1
        hex = @input.byteslice(@index, 2)
        if hex&.match?(HEX_PAIR)
          append_encoded_byte(hex.to_i(16)) if !current_group.skip
          @index += 2
        end
      end

      # Handles a backslash followed by a non-letter.
      def parse_control_symbol(byte)
        @index += 1

        case byte
        when 42 # *
          current_group.skip = true
        when 10, 13 # backslash + line break is a paragraph break
          append_text("\n") if !current_group.skip
        when 45, 95 # -, _
          append_text("-") if !current_group.skip
        when 92, 123, 125 # \, {, }
          append_text(byte.chr) if !current_group.skip
        when 126 # ~
          append_text(" ") if !current_group.skip
        end
      end

      # Consumes one fallback "character" after `\uN`: a plain byte, a `\'hh`
      # escape, a whole control word (including a `\bin` payload) or a control
      # symbol. Braces and raw line breaks are filtered out by the caller.
      def skip_fallback_char
        return if @index >= @input.bytesize

        unless current_byte == 92 # \
          @index += 1
          return
        end

        @index += 1
        return if @index >= @input.bytesize

        if current_byte == 39 # '
          @index += 1
          @index += 2 if @input.byteslice(@index, 2)&.match?(HEX_PAIR)
        elsif letter?(current_byte)
          word = read_control_word
          param = read_control_parameter
          skip_control_space
          skip_binary_payload(param) if word == "bin"
        else
          @index += 1
        end
      end

      # Appends the character for `\uN`.
      #
      # Negative parameters are signed 16-bit values. A high surrogate is held
      # back and combined with the following low surrogate; unpaired halves
      # are dropped rather than written as invalid UTF-8.
      def append_unicode(param)
        return if param.nil?

        codepoint = param.negative? ? param + 65_536 : param

        if HIGH_SURROGATES.cover?(codepoint)
          @pending_high_surrogate = codepoint
          return
        end

        high = @pending_high_surrogate
        @pending_high_surrogate = nil

        if LOW_SURROGATES.cover?(codepoint)
          return if high.nil?
          codepoint = 0x10000 + ((high - 0xD800) << 10) + (codepoint - 0xDC00)
        end

        append_text([codepoint].pack("U")) if codepoint.between?(0, 0x10FFFF)
      end

      # Decodes one source byte (raw or from `\'hh`) with the document encoding.
      #
      # Plain ASCII is appended directly. A byte that does not yet form a
      # valid character in the source encoding (for example a lead byte in
      # Windows-31J, GBK, Big5 or UTF-8) is buffered and retried together with
      # the bytes that follow, up to MAX_PENDING_BYTES.
      def append_encoded_byte(byte)
        return if byte.nil? || byte.zero?
        return append_text(byte.chr) if byte < 128 && @pending_bytes.empty?

        encoding = @source_encoding || Encoding::Windows_1252
        candidate = (@pending_bytes + [byte]).pack("C*").force_encoding(encoding)

        if candidate.valid_encoding?
          @pending_bytes = []
          append_text(candidate.encode("UTF-8", invalid: :replace, undef: :replace, replace: ""))
        elsif byte < 128
          # The buffered bytes were an orphan; drop them and keep the ASCII char.
          @pending_bytes = []
          append_text(byte.chr)
        elsif candidate.bytesize >= MAX_PENDING_BYTES
          @pending_bytes = []
        else
          @pending_bytes << byte
        end
      end

      # Appends text unless the output limit has already been exceeded.
      def append_text(text)
        return if text.nil? || text.empty? || @output_chars > MAX_EXTRACTED_TEXT_CHARS

        @output << text
        @output_chars += text.length
      end

      # Maps an `\ansicpgN` value to an encoding, keeping the current encoding
      # when the code page is unknown or unsupported.
      def encoding_for_codepage(codepage)
        find_encoding(ENCODING_BY_CODEPAGE[codepage.to_i]) || @source_encoding
      end

      # @param name [String, nil] encoding name
      # @return [Encoding, nil] nil when the name is nil or unknown
      def find_encoding(name)
        # Encoding.find(nil) raises TypeError, which the rescue does not cover.
        return if name.nil?

        Encoding.find(name)
      rescue ArgumentError
        nil
      end

      # @return [Boolean] true for ASCII A-Z / a-z; false for nil
      def letter?(byte)
        return false if byte.nil?

        byte.between?(65, 90) || byte.between?(97, 122)
      end

      # @return [Boolean] true for ASCII 0-9; false for nil
      def digit?(byte)
        !byte.nil? && byte.between?(48, 57)
      end

      # Scrubs invalid bytes, turns non-breaking spaces into spaces, unifies
      # line endings, trims whitespace around line breaks, collapses runs of
      # three or more newlines to two, and strips the result.
      def normalize_document_text(text)
        text
          .to_s
          .encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
          .gsub("\u00A0", " ")
          .gsub(/\r\n?/, "\n")
          .gsub(/[ \t]+\n/, "\n")
          .gsub(/\n[ \t]+/, "\n")
          .gsub(/\n{3,}/, "\n\n")
          .strip
      end
    end
  end
end
|