discourse/plugins/discourse-ai/lib/completions/ods_to_text.rb
Rafael dos Santos Silva 2e3b64fa74
FEATURE: extract text from ODT and ODS document uploads (#39711)
## Summary

Follow-up to #39634. Adds `OdtToText` and `OdsToText` converters so
OpenDocument text (`.odt`) and spreadsheet (`.ods`) attachments can be
embedded as text in LLM prompts, in line with the newly added DOCX/XLSX
support. Both formats are zip archives whose document body lives in a single `content.xml` entry, so
they reuse `Compression::SafeZipReader` and the bounded Nokogiri parsing
pattern from #39634 — no new external binaries.

- `OdtToText` walks the body's block-level children (paragraphs,
headings, lists, tables, frames, sections) and renders nested lists with
depth-aware bullet prefixes. Tables become tab-separated rows.
- `OdsToText` iterates sheets and rows, expanding
`table:number-columns-repeated` up to `MAX_COLUMNS` to avoid expansion
bombs from sparse trailing cells, and falls back to `office:value` /
`office:date-value` / `office:boolean-value` when no inline `<text:p>`
is present.
- `UploadEncoder.attachment_type_for` and `encode_document` dispatch
gain `odt` and `ods` cases.
- `ai-llm-attachment-types` `DEFAULT_CHOICES` lists `odt` next to `docx`
and `ods` next to `xlsx`.

## Test plan

- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/odt_to_text_spec.rb` — 6 cases
- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/ods_to_text_spec.rb` — 6 cases
- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/upload_encoder_spec.rb` — full
encoder suite incl. 4 new ODT/ODS integration cases
- [x] `bin/lint` clean across all touched files
- [ ] Manual smoke: upload a real `.odt` and `.ods` to a topic, assign
an LLM with the new attachment types allowed, and verify the extracted
text appears in the prompt
2026-05-05 12:04:13 -03:00

189 lines
4.8 KiB
Ruby
Vendored

# frozen_string_literal: true
require "nokogiri"
require "compression/safe_zip_reader"
module DiscourseAi
  module Completions
    # Converts an OpenDocument spreadsheet (.ods) into plain text.
    #
    # The archive's `content.xml` is parsed with Nokogiri (namespaces removed)
    # and every sheet is rendered as a "Sheet: <name>" heading followed by one
    # line per non-blank row, with cells separated by tabs.
    class OdsToText
      # Limits handed to Compression::SafeZipReader when opening/reading.
      MAX_ENTRY_XML_BYTES = 10 * 1024 * 1024
      MAX_TOTAL_XML_BYTES = 30 * 1024 * 1024
      MAX_ZIP_ENTRIES = 1_000
      # Soft budget: extraction stops *after* the row/sheet that crosses it,
      # so the returned text may be somewhat longer than this value.
      MAX_EXTRACTED_TEXT_CHARS = 100_000
      MAX_SHEETS = 50
      MAX_ROWS_PER_SHEET = 10_000
      # Hard cap on the number of columns emitted per row, which also bounds
      # the expansion of `number-columns-repeated`.
      MAX_COLUMNS = 200
      CONTENT_PATH = "content.xml"

      # Raised when `content.xml` is larger than MAX_ENTRY_XML_BYTES.
      class EntryTooLargeError < StandardError
      end

      # Convenience wrapper around `new(path).convert`.
      #
      # @param path [String] filesystem path of the .ods file
      # @return [String] extracted text
      def self.convert(path)
        new(path).convert
      end

      # @param path [String] filesystem path of the .ods file
      def initialize(path)
        @path = path
      end

      # Opens the archive and renders its sheets as text.
      #
      # @return [String] normalized text, or "" when `content.xml` is missing
      #   or empty
      # @raise [EntryTooLargeError] when `content.xml` exceeds the entry limit
      def convert
        # NOTE(review): relies on SafeZipReader.open returning the block's
        # value - confirm against that helper.
        Compression::SafeZipReader.open(
          path,
          max_entries: MAX_ZIP_ENTRIES,
          max_total_bytes: MAX_TOTAL_XML_BYTES,
        ) do |zip_file|
          xml = read_xml_entry(zip_file, CONTENT_PATH)
          # `return` inside the block exits #convert itself.
          return "" if xml.blank?

          normalize_document_text(extract_sheets(xml))
        end
      end

      private

      attr_reader :path

      # Reads one archive entry, translating the zip reader's size error into
      # this class's own EntryTooLargeError (same message).
      #
      # @return [String, nil] entry contents, or nil when the entry is absent
      #   or is a directory
      def read_xml_entry(zip_file, name)
        entry = zip_file.find_entry(name)
        return if entry.blank? || entry.directory?

        zip_file.read_entry(entry, max_bytes: MAX_ENTRY_XML_BYTES)
      rescue Compression::SafeZipReader::EntryTooLargeError => e
        raise EntryTooLargeError, e.message
      end

      # Parses the XML and renders up to MAX_SHEETS sheets, separated by a
      # blank line. Sheets without any non-blank row are skipped.
      #
      # @param xml [String] raw contents of `content.xml`
      # @return [String]
      def extract_sheets(xml)
        # recover: tolerate malformed XML; nonet: no network access on parse.
        doc = Nokogiri.XML(force_utf8(xml)) { |config| config.recover.nonet }
        # Lets the XPath below and the attribute lookups use bare names.
        doc.remove_namespaces!

        sheets = doc.xpath("//body/spreadsheet/table")
        return "" if sheets.blank?

        sheet_texts = []
        extracted_chars = 0

        sheets
          .first(MAX_SHEETS)
          .each_with_index do |sheet, index|
            name = sheet["name"].presence || "Sheet#{index + 1}"
            rows_text = sheet_rows(sheet)
            next if rows_text.blank?

            block = "Sheet: #{name}\n\n#{rows_text}"
            sheet_texts << block
            # + 2 accounts for the "\n\n" separator added by the join below.
            extracted_chars += block.length + 2
            break if extracted_chars > MAX_EXTRACTED_TEXT_CHARS
          end

        sheet_texts.join("\n\n")
      end

      # Renders the direct `table-row` children of a sheet, one line per
      # non-blank row, considering at most MAX_ROWS_PER_SHEET rows.
      #
      # @return [String, nil] newline-joined rows, or nil when the sheet has
      #   no rows at all
      def sheet_rows(sheet)
        rows = sheet.xpath("./table-row")
        return if rows.blank?

        row_texts = []
        extracted_chars = 0

        rows
          .first(MAX_ROWS_PER_SHEET)
          .each do |row|
            line = row_line(row)
            next if line.blank?

            row_texts << line
            # + 1 accounts for the "\n" separator added by the join below.
            extracted_chars += line.length + 1
            break if extracted_chars > MAX_EXTRACTED_TEXT_CHARS
          end

        row_texts.join("\n")
      end

      # Renders one row as tab-separated cell values.
      #
      # `number-columns-repeated` is honoured for every cell, including blank
      # and covered ones, so a value that follows a run of empty cells stays
      # in its real column. Expansion never goes past MAX_COLUMNS, and
      # trailing blank columns are dropped afterwards.
      #
      # @return [String] "" when every cell in the row is blank
      def row_line(row)
        values = []

        row
          .xpath("./table-cell | ./covered-table-cell")
          .each do |cell|
            remaining = MAX_COLUMNS - values.length
            break if remaining <= 0

            # A missing or non-numeric attribute gives 0 from to_i; use 1.
            repeat = [cell["number-columns-repeated"].to_i, 1].max
            text = cell.name == "covered-table-cell" ? "" : cell_text(cell)
            text = "" if text.blank?

            [repeat, remaining].min.times { values << text }
          end

        # `values.last&.blank?` is nil once the array is empty, ending the loop.
        values.pop while values.last&.blank?
        values.join("\t")
      end

      # Returns the display text of a cell.
      #
      # Direct `<p>` children win when present; otherwise the typed value
      # attribute selected by `value-type` is used.
      #
      # @return [String]
      def cell_text(cell)
        paragraphs = cell.xpath("./p")
        if paragraphs.any?
          return normalize_inline(paragraphs.map { |p| inline_text(p) }.join("\n"))
        end

        case cell["value-type"]
        when "boolean"
          cell["boolean-value"].to_s == "true" ? "TRUE" : "FALSE"
        when "date"
          normalize_inline(cell["date-value"])
        when "time"
          normalize_inline(cell["time-value"])
        when "string"
          # String cells may carry their content in `string-value`.
          normalize_inline(cell["string-value"] || cell["value"])
        else
          normalize_inline(cell["value"])
        end
      end

      # Collects the text below +node+ into a fresh mutable string.
      def inline_text(node)
        out = +""
        collect_inline(node, out)
        out
      end

      # Depth-first walk appending text to +out+. `tab`, `s` (a run of
      # spaces, count in the `c` attribute) and `line-break` elements are
      # turned into the whitespace they stand for; any other element is
      # descended into.
      def collect_inline(node, out)
        node.children.each do |child|
          if child.text?
            out << child.content
          elsif child.element?
            case child.name
            when "tab"
              out << "\t"
            when "s"
              count = child["c"].to_i
              count = 1 if count < 1
              out << (" " * count)
            when "line-break"
              out << "\n"
            else
              collect_inline(child, out)
            end
          end
        end
      end

      # Single-line normalization for cell content: NBSP becomes a space and
      # every run of spaces, tabs and newlines collapses to one space.
      def normalize_inline(text)
        force_utf8(text).gsub("\u00A0", " ").gsub(/[ \t\r\n]+/, " ").strip
      end

      # Whole-document normalization: NBSP becomes a space, CRLF/CR become
      # LF, trailing blanks before a newline are removed, and three or more
      # consecutive newlines collapse to two.
      def normalize_document_text(text)
        force_utf8(text)
          .gsub("\u00A0", " ")
          .gsub(/\r\n?/, "\n")
          .gsub(/[ \t]+\n/, "\n")
          .gsub(/\n{3,}/, "\n\n")
          .strip
      end

      # Coerces +text+ (nil allowed) to a UTF-8 string, dropping bytes that
      # are invalid or have no UTF-8 mapping.
      def force_utf8(text)
        text.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
      end
    end
  end
end