FEATURE: convert incoming emails in HTML to markdown

- remove incoming_email_prefer_html site setting - remove HtmlCleaner class
2025-09-05 08:59:27 +08:00 · 2017-04-26 16:49:06 +02:00 · 2017-04-26 16:49:06 +02:00 · b76674f640
commit b76674f640
parent e155cb6db1
6 changed files with 10 additions and 157 deletions
--- a/lib/email/html_cleaner.rb
+++ b/lib/email/html_cleaner.rb
@ -1,132 +0,0 @@
-module Email
-  # HtmlCleaner cleans up the extremely dirty HTML that many email clients
-  # generate by stripping out any excess divs or spans, removing styling in
-  # the process (which also makes the html more suitable to be parsed as
-  # Markdown).
-  class HtmlCleaner
-    # Elements to hoist all children out of
-    HTML_HOIST_ELEMENTS = %w(div span font table tbody th tr td)
-    # Node types to always delete
-    HTML_DELETE_ELEMENT_TYPES = [
-      Nokogiri::XML::Node::DTD_NODE,
-      Nokogiri::XML::Node::COMMENT_NODE,
-    ]
-
-    # Private variables:
-    #   @doc - nokogiri document
-    #   @out - same as @doc, but only if trimming has occured
-    def initialize(html)
-      if String === html
-        @doc = Nokogiri::HTML(html)
-      else
-        @doc = html
-      end
-    end
-
-    class << self
-      # Email::HtmlCleaner.trim(inp, opts={})
-      #
-      # Arguments:
-      #   inp - Either a HTML string or a Nokogiri document.
-      # Options:
-      #   :return => :doc, :string
-      #     Specify the desired return type.
-      #     Defaults to the type of the input.
-      #     A value of :string is equivalent to calling get_document_text()
-      #     on the returned document.
-      def trim(inp, opts={})
-        cleaner = HtmlCleaner.new(inp)
-
-        opts[:return] ||= ((String === inp) ? :string : :doc)
-
-        if opts[:return] == :string
-          cleaner.output_html
-        else
-          cleaner.output_document
-        end
-      end
-
-      # Email::HtmlCleaner.get_document_text(doc)
-      #
-      # Get the body portion of the document, including html, as a string.
-      def get_document_text(doc)
-        body = doc.xpath('//body')
-        if body
-          body.inner_html
-        else
-          doc.inner_html
-        end
-      end
-    end
-
-    def output_document
-      @out ||= begin
-                 doc = @doc
-                 trim_process_node doc
-                 add_newlines doc
-                 doc
-      end
-    end
-
-    def output_html
-      HtmlCleaner.get_document_text(output_document)
-    end
-
-    private
-
-    def add_newlines(doc)
-      # Replace <br> tags with a markdown \n
-      doc.xpath('//br').each do |br|
-        br.replace(new_linebreak_node doc, 2)
-      end
-      # Surround <p> tags with newlines, to help with line-wise postprocessing
-      # and ensure markdown paragraphs
-      doc.xpath('//p').each do |p|
-        p.before(new_linebreak_node doc)
-        p.after(new_linebreak_node doc, 2)
-      end
-    end
-
-    def new_linebreak_node(doc, count=1)
-      Nokogiri::XML::Text.new("\n" * count, doc)
-    end
-
-    def trim_process_node(node)
-      if should_hoist?(node)
-        hoisted = trim_hoist_element node
-        hoisted.each { |child| trim_process_node child }
-      elsif should_delete?(node)
-        node.remove
-      else
-        if children = node.children
-          children.each { |child| trim_process_node child }
-        end
-      end
-
-      node
-    end
-
-    def trim_hoist_element(element)
-      hoisted = []
-      element.children.each do |child|
-        element.before(child)
-        hoisted << child
-      end
-      element.remove
-      hoisted
-    end
-
-    def should_hoist?(node)
-      return false unless node.element?
-      HTML_HOIST_ELEMENTS.include? node.name
-    end
-
-    def should_delete?(node)
-      return true if HTML_DELETE_ELEMENT_TYPES.include? node.type
-      return true if node.element? && node.name == 'head'
-      return true if node.text? && node.text.strip.blank?
-
-      false
-    end
-  end
-end
--- a/lib/email/receiver.rb
+++ b/lib/email/receiver.rb
@ -1,7 +1,7 @@
 require "digest"
 require_dependency "new_post_manager"
 require_dependency "post_action_creator"
-require_dependency "email/html_cleaner"
+require_dependency "html_to_markdown"

 module Email

@ -188,18 +188,18 @@ module Email
        text = fix_charset(@mail)
      end

-      if html.present? && (SiteSetting.incoming_email_prefer_html || text.blank?)
-        html = Email::HtmlCleaner.new(html).output_html
-        html = trim_discourse_markers(html)
-        html, elided = EmailReplyTrimmer.trim(html, true)
-        return [html, elided]
-      end
-
      if text.present?
        text = trim_discourse_markers(text)
        text, elided = EmailReplyTrimmer.trim(text, true)
        return [text, elided]
      end
+
+      if html.present?
+        markdown = HtmlToMarkdown.new(html).to_markdown
+        markdown = trim_discourse_markers(markdown)
+        markdown, elided = EmailReplyTrimmer.trim(markdown, true)
+        return [markdown, elided]
+      end
    end

    def fix_charset(mail_part)