discourse/Gemfile
Sam fa54f62348
FEATURE: extract text from document uploads for LLM prompts (#39634)
Document attachments (doc, docx, xls, xlsx, rtf, csv, md, txt) are now
converted to text before being included in LLM prompts, instead of
being forwarded as raw base64 payloads. PDFs remain the only format
sent as a raw upload, capped at 10MB.

New converters under lib/completions:

- DocToText shells out to antiword
- DocxToText parses OOXML directly with size and depth limits
- XlsToText shells out to xls2csv
- XlsxToText parses OOXML and shared strings into CSV-style text
- RtfToText is a custom RTF tokenizer with destination/group handling

Plain text formats (csv, md, txt) are read with a 1MB byte cap and
UTF-8 normalization. Extracted text is truncated to 100k characters,
with a preamble noting the original filename and size.

Dialect trimming now uses token-aware truncation against a per-message
budget so large extracted documents collapse cleanly under the prompt
limit, rather than the previous step-based slicing of raw content.

Other changes:

- LlmModel.normalize_attachment_types is shared with UploadEncoder and
  collapses "markdown" to "md" so the canonical extension is consistent
  across model config, UI defaults, and encoder output
- ai-llm-attachment-types adds csv, xls, xlsx to the default choices
- Locale strings clarify that vision controls images and
  allowed_attachment_types controls documents

---------

Co-authored-by: Rafael Silva <xfalcox@gmail.com>
2026-05-05 08:16:23 +10:00

325 lines
7.9 KiB
Ruby
Vendored

# frozen_string_literal: true
# Supported interpreter: "~> 3.4" allows Ruby >= 3.4 and < 4.0.
ruby "~> 3.4"
# Default source that gems are resolved from.
source "https://rubygems.org"
# if there is a super emergency and rubygems is playing up, try
#source 'http://production.cf.rubygems.org'
# MRI only; require: false keeps Bundler.require from loading it automatically.
gem "bootsnap", require: false, platform: :mri
# Individual Rails components, each held to the 8.0.x series
# ("~> 8.0.0" allows >= 8.0.0 and < 8.1).
gem "actionmailer", "~> 8.0.0"
gem "actionpack", "~> 8.0.0"
gem "actionview", "~> 8.0.0"
gem "activemodel", "~> 8.0.0"
gem "activerecord", "~> 8.0.0"
gem "activesupport", "~> 8.0.0"
gem "railties", "~> 8.0.0"
gem "propshaft"
gem "json"
# this will eventually be added to rails,
# allows us to precompile all our templates in the app server master
gem "actionview_precompiler", require: false
gem "discourse-seed-fu"
gem "mail"
gem "mini_mime"
gem "mini_suffix"
# NOTE: hiredis-client is recommended for high performance use of Redis
# however a recent attempt at an upgrade led to https://meta.discourse.org/t/rebuild-error/375387
# for now we are sticking with the socket-based implementation that is not sensitive to this issue
# gem "hiredis-client"
gem "redis"
# This is explicitly used by Sidekiq and is an optional dependency.
# We tell Sidekiq to use the namespace "sidekiq" which triggers this
# gem to be used. There is no explicit dependency in Sidekiq because
# Redis namespace support is optional.
# We already namespace stuff in DiscourseRedis, so we should consider
# just using a single implementation in core vs having 2 namespace implementations
gem "redis-namespace"
# NOTE: AM serializer gets a lot slower with recent updates
# we used an old branch which is the fastest one out there
# our long-term goal here is to fork this gem so we have a
# better maintained living fork
# "~> 0.8.3" keeps the gem on the 0.8.x series (>= 0.8.3, < 0.9).
gem "active_model_serializers", "~> 0.8.3"
# Gems marked require: false are installed but not loaded by Bundler.require;
# code that needs them has to require them itself.
gem "http_accept_language", require: false
# The require path is spelled out because it differs from the gem name
# (underscore instead of hyphen).
gem "discourse-fonts", require: "discourse_fonts"
gem "discourse-emojis", require: "discourse_emojis"
gem "discourse_math_bundle"
gem "message_bus"
gem "rails_multisite"
gem "fastimage"
# AWS SDK service gems; none of them are auto-required.
gem "aws-sdk-s3", require: false
gem "aws-sdk-sns", require: false
gem "aws-sdk-sts", require: false
gem "aws-sdk-mediaconvert", require: false
gem "aws-sdk-bedrockruntime", require: false
gem "excon"
gem "unf", require: false
gem "email_reply_trimmer"
gem "image_optim"
gem "multi_json"
gem "mustache"
gem "nokogiri"
gem "loofah"
gem "css_parser", require: false
# OmniAuth plus provider-specific strategy gems.
# omniauth-oauth2 is the only one here that is not auto-required.
gem "omniauth"
gem "omniauth-facebook"
gem "omniauth-twitter"
gem "omniauth-github"
gem "omniauth-oauth2", require: false
gem "omniauth-google-oauth2"
gem "oj"
gem "pg"
gem "mini_sql"
gem "pry-rails", require: false
gem "rtlcss", require: false
gem "messageformat-wrapper", require: false
gem "rake"
gem "thor", require: false
gem "diffy", require: false
gem "rinku"
gem "sidekiq", ">= 7.3.10" # ensuring it won't get downgraded to accommodate a connection_pool upgrade
gem "mini_scheduler"
gem "mini_racer"
gem "highline", require: false
# Rack is capped below 3.0 for now.
# TODO: upgrade to Rack 3 now that Unicorn has been removed
gem "rack", "< 3"
gem "rack-protection" # security
gem "cbor", require: false
gem "cose", require: false
gem "addressable"
gem "json_schemer"
gem "net-smtp", require: false
gem "net-pop", require: false
gem "digest", require: false
gem "goldiloader"
# Gems that belong only to the :test group.
# Most are require: false, so they are loaded explicitly where needed
# instead of by Bundler.require.
group :test do
gem "capybara", require: false
gem "webmock", require: false
gem "simplecov", require: false
gem "test-prof"
gem "rails-dom-testing", require: false
gem "minio_runner", require: false
gem "capybara-playwright-driver"
gem "puma", require: false
end
# Gems shared by the :test and :development groups.
group :test, :development do
gem "rspec"
gem "listen", require: false
gem "certified", require: false
gem "fabrication", require: false
gem "mocha", require: false
# Auto-required only when RUBY_PLATFORM matches /darwin/i; on every other
# platform the require option evaluates to false.
gem "rb-fsevent", require: RUBY_PLATFORM =~ /darwin/i ? "rb-fsevent" : false
gem "rspec-rails"
gem "shoulda-matchers", require: false
gem "rspec-html-matchers"
# Bundler.require loads "debug/prelude" rather than the gem's default "debug" path.
gem "debug", ">= 1.0.0", require: "debug/prelude"
gem "rubocop-discourse", require: false
gem "parallel_tests"
gem "rswag-specs"
gem "annotaterb"
gem "syntax_tree"
gem "rspec-multi-mock"
end
# Gems that belong only to the :development group.
group :development do
gem "ruby-prof", require: false, platform: :mri
# bullet and better_errors are auto-required only when the BULLET /
# BETTER_ERRORS environment variable is set (!!ENV[...] is true for any set value).
gem "bullet", require: !!ENV["BULLET"]
gem "better_errors", platform: :mri, require: !!ENV["BETTER_ERRORS"]
gem "yaml-lint"
gem "yard"
gem "ruby-lsp", require: false
gem "ruby-lsp-rails", require: false
gem "ruby-lsp-rspec", require: false
end
# With ALLOW_DEV_POPULATE=1 these two gems are declared outside any group,
# so they are not limited to particular environments; otherwise they are
# restricted to the :development and :test groups.
if ENV["ALLOW_DEV_POPULATE"] == "1"
gem "discourse_dev_assets"
gem "faker"
else
group :development, :test do
gem "discourse_dev_assets"
gem "faker"
end
end
# this is an optional gem, it provides a high performance replacement
# to String#blank? a method that is called quite frequently in current
# ActiveRecord, this may change in the future
gem "fast_blank", platform: :ruby
# this provides a very efficient lru cache
gem "lru_redux"
gem "htmlentities", require: false
# IMPORTANT: mini profiler monkey patches, so it better be required last
# If you want to amend mini profiler to do the monkey patches in the railties
# we are open to it. by deferring require to the initializer we can configure discourse installs without it
# Only "enable_rails_patches" is required here, not the gem's default entry point.
gem "rack-mini-profiler", require: ["enable_rails_patches"]
gem "pitchfork", require: false
# Used by discourse-prometheus to collect socket queue stats.
# Was previously a transitive dependency of the unicorn gem.
gem "raindrops", require: false, platform: :ruby
gem "rbtrace", require: false, platform: :mri
# required for feed importing and embedding
gem "ruby-readability", require: false
# rss gem is a bundled gem from Ruby 3 onwards
gem "rss", require: false
gem "stackprof", require: false, platform: :mri
gem "memory_profiler", require: false, platform: :mri
gem "cppjieba_rb", require: false
gem "lograge", require: false
gem "logstash-event", require: false
gem "logster"
# A fork of sassc with dart-sass support
gem "sassc-embedded"
gem "rotp", require: false
gem "rqrcode"
gem "rubyzip", require: false
# install_if: the gem is installed only when the lambda returns a truthy
# value, i.e. when RUBY_PLATFORM contains "linux".
install_if -> { RUBY_PLATFORM.include?("linux") } do
gem "landlock", require: false
end
gem "sshkey", require: false
gem "rchardet", require: false
gem "lz4-ruby", require: false, platform: :ruby
gem "sanitize"
# Extra gems that are declared only when the Gemfile is evaluated with IMPORT=1.
# NOTE(review): csv, sqlite3 and redcarpet are also declared elsewhere in this
# file (csv with require: false, sqlite3 without a version constraint) —
# confirm how Bundler reconciles the duplicate declarations when IMPORT=1.
if ENV["IMPORT"] == "1"
gem "mysql2"
gem "redcarpet"
# NOTE: in import mode the version of sqlite can matter a lot, so we stick it to a specific one
# ("~> 1.3", ">= 1.3.13" together allow >= 1.3.13 and < 2.0)
gem "sqlite3", "~> 1.3", ">= 1.3.13"
# Fetched from the git repository below instead of the default gem source.
gem "ruby-bbcode-to-md", git: "https://github.com/nlalonde/ruby-bbcode-to-md"
gem "reverse_markdown"
gem "tiny_tds"
gem "csv"
end
# Optional group: Bundler skips optional groups unless they are explicitly
# enabled (e.g. with `bundle config set with generic_import`).
group :generic_import, optional: true do
gem "sqlite3"
gem "redcarpet"
end
gem "web-push"
gem "colored2", require: false
gem "maxminddb"
gem "rails_failover", require: false
gem "faraday"
gem "faraday-retry"
# workaround for faraday-net_http, see
# https://github.com/ruby/net-imap/issues/16#issuecomment-803086765
gem "net-http"
# Workaround until Ruby ships with cgi version 0.3.6 or higher.
gem "cgi", ">= 0.3.6", require: false
gem "tzinfo-data"
# Not auto-required; code that needs CSV has to require it explicitly.
gem "csv", require: false
# dependencies for the automation plugin
gem "iso8601"
gem "rrule"
# Optional group: Bundler skips optional groups unless they are explicitly
# enabled (e.g. with `bundle config set with migrations`).
group :migrations, optional: true do
# The gem is named extralite-bundle but is required as "extralite".
gem "extralite-bundle", require: "extralite"
# auto-loading
gem "zeitwerk"
# databases
gem "trilogy"
# CLI
gem "ruby-progressbar"
# non-cryptographic hashing algorithm for generating placeholder IDs
gem "digest-xxhash"
end
# "~> 3.1" allows dry-initializer >= 3.1 and < 4.0.
gem "dry-initializer", "~> 3.1"
gem "parallel"
gem "tty-prompt", require: false
# Dependencies of specific plugins follow. Nearly all are require: false,
# so they are installed but only loaded when something requires them.
# for discourse-zendesk-plugin
gem "inflection", require: false
gem "multipart-post", require: false
gem "faraday-multipart", require: false
gem "zendesk_api", require: false
# for discourse-subscriptions
gem "stripe", require: false
# for discourse-github
gem "sawyer", require: false
gem "octokit", require: false
# for discourse-ai
gem "tokenizers", require: false
gem "tiktoken_ruby", require: false
gem "discourse_ai-tokenizers", require: false
gem "ed25519" # TODO: remove this as existing ssl gem should handle this
# NOTE(review): the remaining gems look like pdf-reader and its supporting
# libraries — confirm against pdf-reader's gemspec.
gem "Ascii85", require: false
gem "ruby-rc4", require: false
gem "hashery", require: false
gem "ttfunk", require: false
gem "afm", require: false
gem "pdf-reader", require: false