discourse/plugins/discourse-ai/spec/lib/completions/ods_to_text_spec.rb
Rafael dos Santos Silva 2e3b64fa74
FEATURE: extract text from ODT and ODS document uploads (#39711)
## Summary

Follow-up to #39634. Adds `OdtToText` and `OdsToText` converters so
OpenDocument text (`.odt`) and spreadsheet (`.ods`) attachments can be
embedded as text in LLM prompts, in line with the newly added DOCX/XLSX
support. Both formats are zip archives with a single `content.xml`, so
they reuse `Compression::SafeZipReader` and the bounded Nokogiri parsing
pattern from #39634 — no new external binaries.

- `OdtToText` walks the body's block-level children (paragraphs,
headings, lists, tables, frames, sections) and renders nested lists with
depth-aware bullet prefixes. Tables become tab-separated rows.
- `OdsToText` iterates sheets and rows, expanding
`table:number-columns-repeated` up to `MAX_COLUMNS` to avoid expansion
bombs from sparse trailing cells, and falls back to `office:value` /
`office:date-value` / `office:boolean-value` when no inline `<text:p>`
is present.
- `UploadEncoder.attachment_type_for` and `encode_document` dispatch
gain `odt` and `ods` cases.
- `ai-llm-attachment-types` `DEFAULT_CHOICES` lists `odt` next to `docx`
and `ods` next to `xlsx`.

## Test plan

- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/odt_to_text_spec.rb` — 6 cases
- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/ods_to_text_spec.rb` — 6 cases
- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/upload_encoder_spec.rb` — full
encoder suite incl. 4 new ODT/ODS integration cases
- [x] `bin/lint` clean across all touched files
- [ ] Manual smoke: upload a real `.odt` and `.ods` to a topic, assign
an LLM with the new attachment types allowed, and verify the extracted
text appears in the prompt
2026-05-05 12:04:13 -03:00

122 lines
4.4 KiB
Ruby
Vendored

# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::OdsToText do
# Builds a throwaway ".ods" zip archive from +entries+ and yields its path.
#
# entries - entry name => entry content pairs; each pair becomes one file
#           inside the archive.
#
# Yields the filesystem path of the archive and returns the block's value.
# The archive is deleted again once the block finishes, even if it raises.
def with_ods(entries)
  scratch = Tempfile.new(%w[spreadsheet .ods])
  path = scratch.path

  # The Tempfile only serves to obtain a unique path: the placeholder is closed
  # and removed before Zip::File is asked to create the archive there
  # (presumably so it starts from a fresh file rather than an empty one).
  scratch.close
  FileUtils.rm_f(path)

  ::Zip::File.open(path, create: true) do |archive|
    entries.each do |entry_name, payload|
      archive.get_output_stream(entry_name) { |out| out.write(payload) }
    end
  end

  yield path
ensure
  # Reached on success and on failure alike.
  scratch&.close
  FileUtils.rm_f(path) if path
end
# Wraps +body+ in a document-content / body / spreadsheet envelope that declares
# the office, text and table namespaces used by the fixtures.
#
# body - XML markup placed inside <office:spreadsheet>; it is followed by a
#        newline before the closing tags.
#
# Returns the complete document as a String.
def ods_content(body)
  header = <<~XML
    <?xml version="1.0" encoding="UTF-8"?>
    <office:document-content
    xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
    xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
    xmlns:table="urn:oasis:names:tc:opendocument:xmlns:table:1.0">
    <office:body>
    <office:spreadsheet>
  XML
  footer = <<~XML
    </office:spreadsheet>
    </office:body>
    </office:document-content>
  XML

  "#{header}#{body}\n#{footer}"
end
# Two named sheets: each is expected under its own "Sheet: <name>" heading,
# with cells joined by tabs, rows by newlines and a blank line between sheets.
it "extracts cells from each named sheet" do
  workbook = <<~XML
    <table:table table:name="Summary">
    <table:table-row>
    <table:table-cell office:value-type="string"><text:p>Name</text:p></table:table-cell>
    <table:table-cell office:value-type="string"><text:p>Value</text:p></table:table-cell>
    </table:table-row>
    <table:table-row>
    <table:table-cell office:value-type="string"><text:p>Alice</text:p></table:table-cell>
    <table:table-cell office:value-type="float" office:value="1"><text:p>1</text:p></table:table-cell>
    </table:table-row>
    </table:table>
    <table:table table:name="Notes">
    <table:table-row>
    <table:table-cell office:value-type="string"><text:p>hello</text:p></table:table-cell>
    </table:table-row>
    </table:table>
  XML
  expected_text = "Sheet: Summary\n\nName\tValue\nAlice\t1\n\nSheet: Notes\n\nhello"

  with_ods("content.xml" => ods_content(workbook)) do |ods_path|
    expect(described_class.convert(ods_path)).to eq(expected_text)
  end
end
# A table without a table:name attribute is expected to be labelled "Sheet1".
it "uses Sheet1, Sheet2 ... when no name is set" do
  unnamed_sheet = <<~XML
    <table:table>
    <table:table-row>
    <table:table-cell office:value-type="string"><text:p>only</text:p></table:table-cell>
    </table:table-row>
    </table:table>
  XML

  with_ods("content.xml" => ods_content(unnamed_sheet)) do |ods_path|
    converted = described_class.convert(ods_path)
    expect(converted).to eq("Sheet: Sheet1\n\nonly")
  end
end
# The cells below carry only office:*-value attributes and no <text:p> child;
# the boolean is expected as "TRUE", the date and the float as their raw values.
it "renders typed values when no inline paragraph is present" do
  typed_cells = <<~XML
    <table:table table:name="Types">
    <table:table-row>
    <table:table-cell office:value-type="boolean" office:boolean-value="true"/>
    <table:table-cell office:value-type="date" office:date-value="2026-05-04"/>
    <table:table-cell office:value-type="float" office:value="42"/>
    </table:table-row>
    </table:table>
  XML

  with_ods("content.xml" => ods_content(typed_cells)) do |ods_path|
    converted = described_class.convert(ods_path)
    expect(converted).to eq("Sheet: Types\n\nTRUE\t2026-05-04\t42")
  end
end
# A non-empty cell repeated three times is expected to appear as three columns.
# NOTE(review): this fixture repeats only 3 times, so the MAX_COLUMNS cap named
# in the title is not itself reached here.
it "expands a non-empty number-columns-repeated up to MAX_COLUMNS" do
  repeated_cells = <<~XML
    <table:table table:name="Repeats">
    <table:table-row>
    <table:table-cell office:value-type="string" table:number-columns-repeated="3">
    <text:p>x</text:p>
    </table:table-cell>
    <table:table-cell office:value-type="string"><text:p>y</text:p></table:table-cell>
    </table:table-row>
    </table:table>
  XML

  with_ods("content.xml" => ods_content(repeated_cells)) do |ods_path|
    converted = described_class.convert(ods_path)
    expect(converted).to eq("Sheet: Repeats\n\nx\tx\tx\ty")
  end
end
# The row holds one text cell, a covered cell and an empty cell; only the text
# cell is expected in the output, with no trailing tabs.
it "ignores covered (merge-continuation) cells and trims trailing empties" do
  merged_row = <<~XML
    <table:table table:name="Merge">
    <table:table-row>
    <table:table-cell office:value-type="string"><text:p>head</text:p></table:table-cell>
    <table:covered-table-cell/>
    <table:table-cell/>
    </table:table-row>
    </table:table>
  XML

  with_ods("content.xml" => ods_content(merged_row)) do |ods_path|
    converted = described_class.convert(ods_path)
    expect(converted).to eq("Sheet: Merge\n\nhead")
  end
end
# The archive contains only a manifest entry and no content.xml; the converter
# is expected to return an empty string.
it "returns blank text when content.xml is missing" do
  archive_entries = { "META-INF/manifest.xml" => "<manifest/>" }

  with_ods(archive_entries) do |ods_path|
    converted = described_class.convert(ods_path)
    expect(converted).to eq("")
  end
end
end