mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-06-19 04:25:50 +08:00
Document attachments (doc, docx, xls, xlsx, rtf, csv, md, txt) are now converted to text before being included in LLM prompts, instead of being forwarded as raw base64 payloads. PDFs remain the only format sent as a raw upload, capped at 10MB. New converters under lib/completions: - DocToText shells out to antiword - DocxToText parses OOXML directly with size and depth limits - XlsToText shells out to xls2csv - XlsxToText parses OOXML and shared strings into CSV-style text - RtfToText is a custom RTF tokenizer with destination/group handling Plain text formats (csv, md, txt) are read with a 1MB byte cap and UTF-8 normalization. Extracted text is truncated to 100k characters, with a preamble noting the original filename and size. Dialect trimming now uses token-aware truncation against a per-message budget so large extracted documents collapse cleanly under the prompt limit, rather than the previous step-based slicing of raw content. Other changes: - LlmModel.normalize_attachment_types is shared with UploadEncoder and collapses "markdown" to "md" so the canonical extension is consistent across model config, UI defaults, and encoder output - ai-llm-attachment-types adds csv, xls, xlsx to the default choices - Locale strings clarify that vision controls images and allowed_attachment_types controls documents --------- Co-authored-by: Rafael Silva <xfalcox@gmail.com>
193 lines
5.6 KiB
Ruby
Vendored
193 lines
5.6 KiB
Ruby
Vendored
# frozen_string_literal: true
|
|
|
|
module DiscourseAi
|
|
module Completions
|
|
module Dialects
|
|
class ChatGpt < Dialect
|
|
class << self
|
|
def can_translate?(llm_model)
|
|
return false if llm_model.url.to_s.include?("/v1/responses")
|
|
|
|
llm_model.provider == "open_router" || llm_model.provider == "open_ai" ||
|
|
llm_model.provider == "azure"
|
|
end
|
|
end
|
|
|
|
VALID_ID_REGEX = /\A[a-zA-Z0-9_]+\z/
|
|
def native_tool_support?
|
|
!disable_native_tools?
|
|
end
|
|
|
|
def embed_user_ids?
|
|
return @embed_user_ids if defined?(@embed_user_ids)
|
|
|
|
@embed_user_ids ||=
|
|
prompt.messages.any? do |m|
|
|
m[:id] && m[:type] == :user && !m[:id].to_s.match?(VALID_ID_REGEX)
|
|
end
|
|
end
|
|
|
|
def max_prompt_tokens
|
|
# provide a buffer of 120 tokens - our function counting is not
|
|
# 100% accurate and getting numbers to align exactly is very hard
|
|
buffer = (opts[:max_tokens] || 2500) + 50
|
|
|
|
if tools.present?
|
|
# note this is about 100 tokens over, OpenAI have a more optimal representation
|
|
@function_size ||= llm_model.tokenizer_class.size(tools.to_json.to_s)
|
|
buffer += @function_size
|
|
end
|
|
|
|
llm_model.max_prompt_tokens - buffer
|
|
end
|
|
|
|
private
|
|
|
|
def disable_native_tools?
|
|
return @disable_native_tools if defined?(@disable_native_tools)
|
|
!!@disable_native_tools = llm_model.lookup_custom_param("disable_native_tools")
|
|
end
|
|
|
|
def tools_dialect
|
|
if native_tool_support?
|
|
@tools_dialect ||= DiscourseAi::Completions::Dialects::OpenAiTools.new(prompt.tools)
|
|
else
|
|
super
|
|
end
|
|
end
|
|
|
|
# developer messages are preferred on recent reasoning models
|
|
def supports_developer_messages?
|
|
!legacy_reasoning_model? && llm_model.provider == "open_ai" &&
|
|
(
|
|
llm_model.name.start_with?("o1") || llm_model.name.start_with?("o3") ||
|
|
llm_model.name.start_with?("gpt-5")
|
|
)
|
|
end
|
|
|
|
def legacy_reasoning_model?
|
|
llm_model.provider == "open_ai" &&
|
|
(llm_model.name.start_with?("o1-preview") || llm_model.name.start_with?("o1-mini"))
|
|
end
|
|
|
|
def system_msg(msg)
|
|
content = msg[:content]
|
|
if !native_tool_support? && tools_dialect.instructions.present?
|
|
content = content + "\n\n" + tools_dialect.instructions
|
|
end
|
|
|
|
if supports_developer_messages?
|
|
{ role: "developer", content: content }
|
|
elsif legacy_reasoning_model?
|
|
{ role: "user", content: content }
|
|
else
|
|
{ role: "system", content: content }
|
|
end
|
|
end
|
|
|
|
def model_msg(msg)
|
|
message_for_role("assistant", msg)
|
|
end
|
|
|
|
def tool_call_msg(msg)
|
|
if native_tool_support?
|
|
tools_dialect.from_raw_tool_call(msg)
|
|
else
|
|
super
|
|
end
|
|
end
|
|
|
|
def tool_msg(msg)
|
|
if native_tool_support?
|
|
tools_dialect.from_raw_tool(msg)
|
|
else
|
|
super
|
|
end
|
|
end
|
|
|
|
def user_msg(msg)
|
|
message_for_role("user", msg)
|
|
end
|
|
|
|
def message_for_role(role, msg)
|
|
content_array = []
|
|
|
|
user_message = { role: }
|
|
|
|
if msg[:id]
|
|
if embed_user_ids?
|
|
content_array << "#{msg[:id]}: "
|
|
else
|
|
user_message[:name] = msg[:id]
|
|
end
|
|
end
|
|
|
|
content_array << msg[:content]
|
|
|
|
allow_images = vision_support?
|
|
|
|
content_array =
|
|
to_encoded_content_array(
|
|
content: content_array.flatten,
|
|
upload_encoder: ->(details) { upload_node(details, role) },
|
|
text_encoder: ->(text) { text_node(text, role) },
|
|
other_encoder: ->(hash) { hash },
|
|
allow_images:,
|
|
allow_documents: true,
|
|
allowed_attachment_types: llm_model.allowed_attachment_types,
|
|
upload_filter: ->(encoded) { document_allowed?(encoded) },
|
|
)
|
|
|
|
user_message[:content] = no_array_if_only_text(content_array)
|
|
user_message
|
|
end
|
|
|
|
def no_array_if_only_text(content_array)
|
|
if content_array.size == 1 && content_array.first[:type] == "text"
|
|
content_array.first[:text]
|
|
else
|
|
content_array
|
|
end
|
|
end
|
|
|
|
def text_node(text, role)
|
|
{ type: "text", text: text }
|
|
end
|
|
|
|
def upload_node(details, role)
|
|
return text_node(details[:text], role) if details[:text].present?
|
|
|
|
if details[:mime_type] == "application/pdf" || details[:kind] == :document
|
|
file_node(details)
|
|
else
|
|
image_node(details)
|
|
end
|
|
end
|
|
|
|
def image_node(details)
|
|
encoded_image = "data:#{details[:mime_type]};base64,#{details[:base64]}"
|
|
{ type: "image_url", image_url: { url: encoded_image } }
|
|
end
|
|
|
|
def file_node(details)
|
|
{
|
|
type: "file",
|
|
file: {
|
|
filename: details[:filename] || "document.pdf",
|
|
file_data: "data:#{details[:mime_type]};base64,#{details[:base64]}",
|
|
},
|
|
}
|
|
end
|
|
|
|
def per_message_overhead
|
|
# open ai defines about 4 tokens per message of overhead
|
|
4
|
|
end
|
|
|
|
def calculate_message_token(context)
|
|
llm_model.tokenizer_class.size(context[:content].to_s + context[:name].to_s)
|
|
end
|
|
end
|
|
end
|
|
end
|
|
end
|