mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-06-19 07:43:46 +08:00
Document attachments (doc, docx, xls, xlsx, rtf, csv, md, txt) are now converted to text before being included in LLM prompts, instead of being forwarded as raw base64 payloads. PDFs remain the only format sent as a raw upload, capped at 10MB.

New converters under lib/completions:
- DocToText shells out to antiword
- DocxToText parses OOXML directly with size and depth limits
- XlsToText shells out to xls2csv
- XlsxToText parses OOXML and shared strings into CSV-style text
- RtfToText is a custom RTF tokenizer with destination/group handling

Plain text formats (csv, md, txt) are read with a 1MB byte cap and UTF-8 normalization. Extracted text is truncated to 100k characters, with a preamble noting the original filename and size.

Dialect trimming now uses token-aware truncation against a per-message budget so large extracted documents collapse cleanly under the prompt limit, rather than the previous step-based slicing of raw content.

Other changes:
- LlmModel.normalize_attachment_types is shared with UploadEncoder and collapses "markdown" to "md" so the canonical extension is consistent across model config, UI defaults, and encoder output
- ai-llm-attachment-types adds csv, xls, xlsx to the default choices
- Locale strings clarify that vision controls images and allowed_attachment_types controls documents

---------

Co-authored-by: Rafael Silva <xfalcox@gmail.com>
158 lines
4.8 KiB
Ruby
Vendored
158 lines
4.8 KiB
Ruby
Vendored
# frozen_string_literal: true
|
|
|
|
RSpec.describe DiscourseAi::Completions::DocxToText do
|
|
# Builds a throwaway .docx (zip) archive from +entries+ and yields its path.
#
# @param entries [Hash{String => String}] archive member name => member content
# @yieldparam path [String] filesystem path of the generated archive
# @return [Object] the value returned by the block
def with_docx(entries)
  # The Tempfile only reserves a unique "document*.docx" path; the placeholder
  # file is removed so the archive is created from scratch (create: true).
  tempfile = Tempfile.new(%w[document .docx])
  path = tempfile.path
  tempfile.close
  FileUtils.rm_f(path)

  ::Zip::File.open(path, create: true) do |zip_file|
    entries.each do |name, content|
      zip_file.get_output_stream(name) { |stream| stream.write(content) }
    end
  end

  yield path
ensure
  # Runs even when the block raises, so the archive never outlives the example.
  tempfile&.close
  FileUtils.rm_f(path) if path
end
|
|
|
|
# Wraps +body+ in a <w:document> root bound to the WordprocessingML namespace.
#
# @param body [String] XML fragment interpolated verbatim (no escaping is applied)
# @return [String] XML document text starting with the XML declaration
def word_xml(body)
  <<~XML
    <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    <w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
      #{body}
    </w:document>
  XML
end
|
|
|
|
# Builds a <w:p> paragraph carrying list-numbering properties (<w:numPr>).
#
# @param text [String] run text, interpolated verbatim (no escaping is applied)
# @param num_id [Integer, String] value written to <w:numId w:val="...">
# @param ilvl [Integer, String] value written to <w:ilvl w:val="...">; defaults to 0
# @return [String] the paragraph XML fragment
def numbered_paragraph(text, num_id:, ilvl: 0)
  <<~XML
    <w:p>
      <w:pPr>
        <w:numPr>
          <w:ilvl w:val="#{ilvl}"/>
          <w:numId w:val="#{num_id}"/>
        </w:numPr>
      </w:pPr>
      <w:r><w:t>#{text}</w:t></w:r>
    </w:p>
  XML
end
|
|
|
|
# Fixture content for the "word/numbering.xml" archive member.
#
# Defines two list definitions:
# - numId 7 -> abstractNum 1: level 0 is decimal with text "%1.",
#   level 1 is lowerLetter with text "%2)"
# - numId 8 -> abstractNum 2: level 0 is a bullet with text "•"
#
# @return [String] XML document text starting with the XML declaration
def numbering_xml
  <<~XML
    <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    <w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
      <w:abstractNum w:abstractNumId="1">
        <w:lvl w:ilvl="0">
          <w:start w:val="1"/>
          <w:numFmt w:val="decimal"/>
          <w:lvlText w:val="%1."/>
        </w:lvl>
        <w:lvl w:ilvl="1">
          <w:start w:val="1"/>
          <w:numFmt w:val="lowerLetter"/>
          <w:lvlText w:val="%2)"/>
        </w:lvl>
      </w:abstractNum>
      <w:num w:numId="7">
        <w:abstractNumId w:val="1"/>
      </w:num>
      <w:abstractNum w:abstractNumId="2">
        <w:lvl w:ilvl="0">
          <w:numFmt w:val="bullet"/>
          <w:lvlText w:val="•"/>
        </w:lvl>
      </w:abstractNum>
      <w:num w:numId="8">
        <w:abstractNumId w:val="2"/>
      </w:num>
    </w:numbering>
  XML
end
|
|
|
|
# One paragraph mixes text runs with <w:tab/> and <w:br/>; the expectation
# pins them to "\t" and "\n", with a "\n" between the two paragraphs.
it "extracts paragraph text from the main document" do
  with_docx("word/document.xml" => word_xml(<<~XML)) do |path|
    <w:body>
      <w:p>
        <w:r><w:t>Hello</w:t></w:r>
        <w:r><w:tab/></w:r>
        <w:r><w:t>world</w:t></w:r>
        <w:r><w:br/></w:r>
        <w:r><w:t>with a break</w:t></w:r>
      </w:p>
      <w:p><w:r><w:t>Second paragraph</w:t></w:r></w:p>
    </w:body>
  XML
    expect(described_class.convert(path)).to eq("Hello\tworld\nwith a break\nSecond paragraph")
  end
end
|
|
|
|
# Archive members are written deliberately out of order; the expectation pins
# the output order to body, headers (1 then 2), footer, footnotes, endnotes,
# comments, with a blank line between parts.
it "extracts supported text parts in a stable order" do
  part = ->(text) { word_xml("<w:p><w:r><w:t>#{text}</w:t></w:r></w:p>") }

  with_docx(
    "word/footer1.xml" => part.call("Footer"),
    "word/header2.xml" => part.call("Header two"),
    "word/document.xml" => part.call("Body"),
    "word/comments.xml" => part.call("Comment"),
    "word/header1.xml" => part.call("Header one"),
    "word/endnotes.xml" => part.call("Endnote"),
    "word/footnotes.xml" => part.call("Footnote"),
  ) do |path|
    expect(described_class.convert(path)).to eq(
      "Body\n\nHeader one\n\nHeader two\n\nFooter\n\nFootnote\n\nEndnote\n\nComment",
    )
  end
end
|
|
|
|
# A drawing's <wp:docPr> descr and title attributes are expected to be
# rendered in place as "[Image: <descr> - <title>]" between the text runs.
it "extracts image alt text inline" do
  with_docx("word/document.xml" => word_xml(<<~XML)) do |path|
    <w:body>
      <w:p>
        <w:r><w:t>See </w:t></w:r>
        <w:r>
          <w:drawing>
            <wp:inline xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing">
              <wp:docPr id="1" name="Picture 1" descr="a diagram of the upload flow" title="Upload flow"/>
            </wp:inline>
          </w:drawing>
        </w:r>
        <w:r><w:t> for details</w:t></w:r>
      </w:p>
    </w:body>
  XML
    expect(described_class.convert(path)).to eq(
      "See [Image: a diagram of the upload flow - Upload flow] for details",
    )
  end
end
|
|
|
|
# numId 7 is the decimal / lowerLetter list and numId 8 the bullet list from
# numbering_xml. The expectation pins the top-level counter continuing ("3.")
# after a nested item.
# NOTE(review): the leading whitespace before "a) Nested" is kept exactly as
# found in this copy of the file; confirm it against the converter's
# per-level indentation.
it "adds prefixes for numbered and bulleted lists" do
  with_docx(
    "word/numbering.xml" => numbering_xml,
    "word/document.xml" => word_xml(<<~XML),
      <w:body>
        #{numbered_paragraph("First", num_id: 7)}
        #{numbered_paragraph("Second", num_id: 7)}
        #{numbered_paragraph("Nested", num_id: 7, ilvl: 1)}
        #{numbered_paragraph("Third", num_id: 7)}
        #{numbered_paragraph("Bullet", num_id: 8)}
      </w:body>
    XML
  ) do |path|
    expect(described_class.convert(path)).to eq(
      "1. First\n2. Second\n a) Nested\n3. Third\n• Bullet",
    )
  end
end
|
|
|
|
# The archive holds only "docProps/core.xml" and no "word/..." member, so the
# converter is expected to return an empty string.
it "returns blank text when the docx has no supported text parts" do
  with_docx("docProps/core.xml" => "<properties />") do |path|
    expect(described_class.convert(path)).to eq("")
  end
end
|
|
end
|