discourse/plugins/discourse-ai/spec/lib/completions/odt_to_text_spec.rb
Rafael dos Santos Silva 2e3b64fa74
FEATURE: extract text from ODT and ODS document uploads (#39711)
## Summary

Follow-up to #39634. Adds `OdtToText` and `OdsToText` converters so
OpenDocument text (`.odt`) and spreadsheet (`.ods`) attachments can be
embedded as text in LLM prompts, in line with the newly added DOCX/XLSX
support. Both formats are zip archives with a single `content.xml`, so
they reuse `Compression::SafeZipReader` and the bounded Nokogiri parsing
pattern from #39634 — no new external binaries.

- `OdtToText` walks the body's block-level children (paragraphs,
headings, lists, tables, frames, sections) and renders nested lists with
depth-aware bullet prefixes. Tables become tab-separated rows.
- `OdsToText` iterates sheets and rows, expanding
`table:number-columns-repeated` up to `MAX_COLUMNS` to avoid expansion
bombs from sparse trailing cells, and falls back to `office:value` /
`office:date-value` / `office:boolean-value` when no inline `<text:p>`
is present.
- `UploadEncoder.attachment_type_for` and `encode_document` dispatch
gain `odt` and `ods` cases.
- `ai-llm-attachment-types` `DEFAULT_CHOICES` lists `odt` next to `docx`
and `ods` next to `xlsx`.

## Test plan

- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/odt_to_text_spec.rb` — 6 cases
- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/ods_to_text_spec.rb` — 6 cases
- [x] `bin/rspec
plugins/discourse-ai/spec/lib/completions/upload_encoder_spec.rb` — full
encoder suite incl. 4 new ODT/ODS integration cases
- [x] `bin/lint` clean across all touched files
- [ ] Manual smoke: upload a real `.odt` and `.ods` to a topic, assign
an LLM with the new attachment types allowed, and verify the extracted
text appears in the prompt
2026-05-05 12:04:13 -03:00

113 lines
3.8 KiB
Ruby
Vendored

# frozen_string_literal: true
RSpec.describe DiscourseAi::Completions::OdtToText do
def with_odt(entries)
tempfile = Tempfile.new(%w[document .odt])
path = tempfile.path
tempfile.close
FileUtils.rm_f(path)
::Zip::File.open(path, create: true) do |zip_file|
entries.each do |name, content|
zip_file.get_output_stream(name) { |stream| stream.write(content) }
end
end
yield path
ensure
tempfile&.close
FileUtils.rm_f(path) if path
end
def odt_content(body)
<<~XML
<?xml version="1.0" encoding="UTF-8"?>
<office:document-content
xmlns:office="urn:oasis:names:tc:opendocument:xmlns:office:1.0"
xmlns:text="urn:oasis:names:tc:opendocument:xmlns:text:1.0"
xmlns:table="urn:oasis:names:tc:opendocument:xmlns:table:1.0"
xmlns:draw="urn:oasis:names:tc:opendocument:xmlns:drawing:1.0">
<office:body>
<office:text>
#{body}
</office:text>
</office:body>
</office:document-content>
XML
end
it "extracts headings, paragraphs, tabs, line breaks, and spans" do
with_odt("content.xml" => odt_content(<<~XML)) do |path|
<text:h text:outline-level="1">Title</text:h>
<text:p>Hello<text:tab/>world<text:line-break/>second line</text:p>
<text:p>Run with <text:span>nested <text:span>span</text:span></text:span> text</text:p>
XML
expect(described_class.convert(path)).to eq(
"Title\nHello\tworld\nsecond line\nRun with nested span text",
)
end
end
it "expands repeated spaces from text:s c=N" do
with_odt("content.xml" => odt_content(<<~XML)) do |path|
<text:p>A<text:s text:c="3"/>B</text:p>
XML
expect(described_class.convert(path)).to eq("A B")
end
end
it "renders nested lists with depth-aware bullet prefixes" do
with_odt("content.xml" => odt_content(<<~XML)) do |path|
<text:list>
<text:list-item><text:p>One</text:p></text:list-item>
<text:list-item>
<text:p>Two</text:p>
<text:list>
<text:list-item><text:p>Two.a</text:p></text:list-item>
<text:list-item><text:p>Two.b</text:p></text:list-item>
</text:list>
</text:list-item>
<text:list-item><text:p>Three</text:p></text:list-item>
</text:list>
XML
expect(described_class.convert(path)).to eq("- One\n- Two\n - Two.a\n - Two.b\n- Three")
end
end
it "renders tables as tab-separated rows" do
with_odt("content.xml" => odt_content(<<~XML)) do |path|
<table:table>
<table:table-row>
<table:table-cell><text:p>Name</text:p></table:table-cell>
<table:table-cell><text:p>Value</text:p></table:table-cell>
</table:table-row>
<table:table-row>
<table:table-cell><text:p>Alice</text:p></table:table-cell>
<table:table-cell><text:p>1</text:p></table:table-cell>
</table:table-row>
</table:table>
XML
expect(described_class.convert(path)).to eq("Name\tValue\nAlice\t1")
end
end
it "walks into frames and text-boxes for callout text" do
with_odt("content.xml" => odt_content(<<~XML)) do |path|
<text:p>Body before</text:p>
<draw:frame>
<draw:text-box>
<text:p>Callout</text:p>
</draw:text-box>
</draw:frame>
<text:p>Body after</text:p>
XML
expect(described_class.convert(path)).to eq("Body before\nCallout\nBody after")
end
end
it "returns blank text when content.xml is missing" do
with_odt("META-INF/manifest.xml" => "<manifest/>") do |path|
expect(described_class.convert(path)).to eq("")
end
end
end