diff --git a/lib/html_to_markdown.rb b/lib/html_to_markdown.rb index 30c394648d6..5cfbc9b6495 100644 --- a/lib/html_to_markdown.rb +++ b/lib/html_to_markdown.rb @@ -136,7 +136,7 @@ class HtmlToMarkdown end def visit_img(node) - if is_valid_url?(node["src"]) && is_visible_img?(node) + if is_valid_src?(node["src"]) && is_visible_img?(node) if @opts[:keep_img_tags] @stack[-1].markdown << node.to_html else @@ -147,7 +147,7 @@ class HtmlToMarkdown end def visit_a(node) - if is_valid_url?(node["href"]) + if is_valid_href?(node["href"]) @stack[-1].markdown << "[" traverse(node) @stack[-1].markdown << "](#{node["href"]})" @@ -206,14 +206,20 @@ class HtmlToMarkdown (lines + [""]).join("\n") end - def is_valid_url?(url) - url.present? && (url.start_with?("http") || url.start_with?("www.")) + def is_valid_href?(href) + href.present? && (href.start_with?("http") || href.start_with?("www.")) + end + + def is_valid_src?(src) + return false if src.blank? + return true if @opts[:keep_cid_imgs] && src.start_with?("cid:") + src.start_with?("http") || src.start_with?("www.") end def is_visible_img?(img) - return false if img["width"].present? && img["width"].to_i == 0 + return false if img["width"].present? && img["width"].to_i == 0 return false if img["height"].present? && img["height"].to_i == 0 - return false if img["style"].present? && img["style"][/(width|height)\s*:\s*0/] + return false if img["style"].present? && img["style"][/(width|height)\s*:\s*0/] true end diff --git a/spec/components/html_to_markdown_spec.rb b/spec/components/html_to_markdown_spec.rb index 091bc79ee80..8a4a8077f8a 100644 --- a/spec/components/html_to_markdown_spec.rb +++ b/spec/components/html_to_markdown_spec.rb @@ -3,8 +3,8 @@ require 'html_to_markdown' describe HtmlToMarkdown do - def html_to_markdown(html) - HtmlToMarkdown.new(html).to_markdown + def html_to_markdown(html, opts={}) + HtmlToMarkdown.new(html, opts).to_markdown end it "remove whitespaces" do @@ -55,14 +55,15 @@ describe HtmlToMarkdown do expect(html_to_markdown(%Q{Discourse})).to eq("Discourse") end - HTML_WITH_IMG ||= %Q{Discourse Logo} + HTML_WITH_IMG ||= %Q{Discourse Logo} + HTML_WITH_CID_IMG ||= %Q{Discourse Logo} it "converts " do expect(html_to_markdown(HTML_WITH_IMG)).to eq("![Discourse Logo](https://www.discourse.org/logo.svg)") end it "keeps with 'keep_img_tags'" do - expect(HtmlToMarkdown.new(HTML_WITH_IMG, keep_img_tags: true).to_markdown).to eq(HTML_WITH_IMG) + expect(html_to_markdown(HTML_WITH_IMG, keep_img_tags: true)).to eq(HTML_WITH_IMG) end it "removes empty & invalid " do @@ -71,6 +72,11 @@ describe HtmlToMarkdown do expect(html_to_markdown(%Q{})).to eq("") end + it "keeps with src='cid:' whith 'keep_cid_imgs'" do + expect(html_to_markdown(HTML_WITH_CID_IMG, keep_cid_imgs: true)).to eq("![Discourse Logo](cid:ii_1525434659ddb4cb)") + expect(html_to_markdown(HTML_WITH_CID_IMG, keep_img_tags: true, keep_cid_imgs: true)).to eq("\"Discourse") + end + it "skips hidden " do expect(html_to_markdown(%Q{})).to eq("") expect(html_to_markdown(%Q{})).to eq("")