FEATURE: convert incoming emails in HTML to markdown

- remove incoming_email_prefer_html site setting - remove HtmlCleaner class
2025-08-21 19:11:18 +08:00 · 2017-04-26 16:49:06 +02:00 · 2017-04-26 16:49:06 +02:00 · b76674f640
commit b76674f640
parent e155cb6db1
6 changed files with 10 additions and 157 deletions
--- a/config/locales/server.en.yml
+++ b/config/locales/server.en.yml
@ -1306,7 +1306,6 @@ en:
    reply_by_email_enabled: "Enable replying to topics via email."
    reply_by_email_address: "Template for reply by email incoming email address, for example: %{reply_key}@reply.example.com or replies+%{reply_key}@example.com"
    alternative_reply_by_email_addresses: "List of alternative templates for reply by email incoming email addresses. Example: %{reply_key}@reply.example.com|replies+%{reply_key}@example.com"
    incoming_email_prefer_html: "Use the HTML instead of the text for incoming email. May cause unexpected formatting issues!"
    disable_emails: "Prevent Discourse from sending any kind of emails"
--- a/config/site_settings.yml
+++ b/config/site_settings.yml
@ -646,8 +646,6 @@ email:
  pop3_polling_username: ''
  pop3_polling_password: ''
  log_mail_processing_failures: false
  incoming_email_prefer_html:
    default: false
  email_in:
    default: false
    client: true
--- a/lib/email/html_cleaner.rb
+++ b/lib/email/html_cleaner.rb
@ -1,132 +0,0 @@
 module Email
  # HtmlCleaner cleans up the extremely dirty HTML that many email clients
  # generate by stripping out any excess divs or spans, removing styling in
  # the process (which also makes the html more suitable to be parsed as
  # Markdown).
  class HtmlCleaner
    # Elements to hoist all children out of
    HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
    # Node types to always delete
    HTML_DELETE_ELEMENT_TYPES = [
      Nokogiri::XML::Node::DTD_NODE,
      Nokogiri::XML::Node::COMMENT_NODE,
    ]
    # Private variables:
    #   @doc - nokogiri document
    #   @out - same as @doc, but only if trimming has occured
    def initialize(html)
      if String === html
        @doc = Nokogiri::HTML(html)
      else
        @doc = html
      end
    end
    class << self
      # Email::HtmlCleaner.trim(inp, opts={})
      #
      # Arguments:
      #   inp - Either a HTML string or a Nokogiri document.
      # Options:
      #   :return => :doc, :string
      #     Specify the desired return type.
      #     Defaults to the type of the input.
      #     A value of :string is equivalent to calling get_document_text()
      #     on the returned document.
      def trim(inp, opts={})
        cleaner = HtmlCleaner.new(inp)
        opts[:return] ||= ((String === inp) ? :string : :doc)
        if opts[:return] == :string
          cleaner.output_html
        else
          cleaner.output_document
        end
      end
      # Email::HtmlCleaner.get_document_text(doc)
      #
      # Get the body portion of the document, including html, as a string.
      def get_document_text(doc)
        body = doc.xpath('//body')
        if body
          body.inner_html
        else
          doc.inner_html
        end
      end
    end
    def output_document
      @out ||= begin
                 doc = @doc
                 trim_process_node doc
                 add_newlines doc
                 doc
      end
    end
    def output_html
      HtmlCleaner.get_document_text(output_document)
    end
    private
    def add_newlines(doc)
      # Replace <br> tags with a markdown \n
      doc.xpath('//br').each do |br|
        br.replace(new_linebreak_node doc, 2)
      end
      # Surround <p> tags with newlines, to help with line-wise postprocessing
      # and ensure markdown paragraphs
      doc.xpath('//p').each do |p|
        p.before(new_linebreak_node doc)
        p.after(new_linebreak_node doc, 2)
      end
    end
    def new_linebreak_node(doc, count=1)
      Nokogiri::XML::Text.new("\n" * count, doc)
    end
    def trim_process_node(node)
      if should_hoist?(node)
        hoisted = trim_hoist_element node
        hoisted.each { |child| trim_process_node child }
      elsif should_delete?(node)
        node.remove
      else
        if children = node.children
          children.each { |child| trim_process_node child }
        end
      end
      node
    end
    def trim_hoist_element(element)
      hoisted = []
      element.children.each do |child|
        element.before(child)
        hoisted << child
      end
      element.remove
      hoisted
    end
    def should_hoist?(node)
      return false unless node.element?
      HTML_HOIST_ELEMENTS.include? node.name
    end
    def should_delete?(node)
      return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
      return true if node.element? && node.name == 'head'
      return true if node.text? && node.text.strip.blank?
      false
    end
  end
 end
--- a/lib/email/receiver.rb
+++ b/lib/email/receiver.rb
@ -1,7 +1,7 @@
 require "digest"
 require_dependency "new_post_manager"
 require_dependency "post_action_creator"
-require_dependency "email/html_cleaner"
+require_dependency "html_to_markdown"
 module Email
@ -188,18 +188,18 @@ module Email
        text = fix_charset(@mail)
      end
      if html.present? && (SiteSetting.incoming_email_prefer_html || text.blank?)
        html = Email::HtmlCleaner.new(html).output_html
        html = trim_discourse_markers(html)
        html, elided = EmailReplyTrimmer.trim(html, true)
        return [html, elided]
      end
      if text.present?
        text = trim_discourse_markers(text)
        text, elided = EmailReplyTrimmer.trim(text, true)
        return [text, elided]
      end
      if html.present?
        markdown = HtmlToMarkdown.new(html).to_markdown
        markdown = trim_discourse_markers(markdown)
        markdown, elided = EmailReplyTrimmer.trim(markdown, true)
        return [markdown, elided]
      end
    end
    def fix_charset(mail_part)
--- a/lib/html_to_markdown.rb
+++ b/lib/html_to_markdown.rb
@ -115,7 +115,7 @@ class HtmlToMarkdown
    RUBY
  end
-  WHITELISTED ||= %w{del ins kbd s small strike sub sup table tbody td tfoot th thead tr}
+  WHITELISTED ||= %w{del ins kbd s small strike sub sup}
  WHITELISTED.each do |tag|
    class_eval <<-RUBY
      def visit_#{tag}(node)
--- a/spec/components/email/receiver_spec.rb
+++ b/spec/components/email/receiver_spec.rb
@ -157,7 +157,7 @@ describe Email::Receiver do
      expect(topic.posts.last.cooked).not_to match(/<br/)
      expect { process(:html_reply) }.to change { topic.posts.count }
-      expect(topic.posts.last.raw).to eq("This is a <b>HTML</b> reply ;)")
+      expect(topic.posts.last.raw).to eq("This is a **HTML** reply ;)")
      expect { process(:hebrew_reply) }.to change { topic.posts.count }
      expect(topic.posts.last.raw).to eq("שלום! מה שלומך היום?")
@ -174,18 +174,6 @@ describe Email::Receiver do
      expect(topic.posts.last.raw).to eq("This is the *text* part.")
    end
    it "prefers html over text when site setting is enabled" do
      SiteSetting.incoming_email_prefer_html = true
      expect { process(:text_and_html_reply) }.to change { topic.posts.count }
      expect(topic.posts.last.raw).to eq('This is the <b>html</b> part.')
    end
    it "uses text when prefer_html site setting is enabled but no html is available" do
      SiteSetting.incoming_email_prefer_html = true
      expect { process(:text_reply) }.to change { topic.posts.count }
      expect(topic.posts.last.raw).to eq("This is a text reply :)")
    end
    it "removes the 'on <date>, <contact> wrote' quoting line" do
      expect { process(:on_date_contact_wrote) }.to change { topic.posts.count }
      expect(topic.posts.last.raw).to eq("This is the actual reply.")