From 4c2d5158c58f04c2043ca90f1761a70717792ffc Mon Sep 17 00:00:00 2001 From: Roman Rizzi Date: Fri, 1 Oct 2021 12:48:21 -0300 Subject: [PATCH] FIX: Follow the canonical URL when importing a remote topic. (#14489) FinalDestination now supports the `follow_canonical` option, which will perform an initial GET request, parse the canonical link if present, and perform a HEAD request to it. We use this mode during embeds to avoid treating URLs with different query parameters as different topics. --- app/models/topic_embed.rb | 3 +- lib/final_destination.rb | 31 ++++++- spec/components/final_destination_spec.rb | 40 +++++++++ spec/models/topic_embed_spec.rb | 101 +++++++++++----------- 4 files changed, 122 insertions(+), 53 deletions(-) diff --git a/app/models/topic_embed.rb b/app/models/topic_embed.rb index 61554f6f5cf..1e9f06a3223 100644 --- a/app/models/topic_embed.rb +++ b/app/models/topic_embed.rb @@ -113,7 +113,8 @@ class TopicEmbed < ActiveRecord::Base fd = FinalDestination.new( url, validate_uri: true, - max_redirects: 5 + max_redirects: 5, + follow_canonical: true, ) url = fd.resolve diff --git a/lib/final_destination.rb b/lib/final_destination.rb index 3831abff43a..e3805479c83 100644 --- a/lib/final_destination.rb +++ b/lib/final_destination.rb @@ -63,7 +63,8 @@ class FinalDestination end @status = :ready - @http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head + @follow_canonical = @opts[:follow_canonical] + @http_verb = http_verb(@force_get_hosts, @follow_canonical) @cookie = nil @limited_ips = [] @verbose = @opts[:verbose] || false @@ -77,6 +78,14 @@ class FinalDestination 20 end + def http_verb(force_get_hosts, follow_canonical) + if follow_canonical || force_get_hosts.any? { |host| hostname_matches?(host) } + :get + else + :head + end + end + def timeout @timeout || FinalDestination.connection_timeout end @@ -215,6 +224,18 @@ class FinalDestination end end + if @follow_canonical + next_url = uri(fetch_canonical_url(response.body)) + + if next_url.to_s.present? && next_url != @uri + @follow_canonical = false + @uri = next_url + @http_verb = http_verb(@force_get_hosts, @follow_canonical) + + return resolve + end + end + @content_type = response.headers['Content-Type'] if response.headers.has_key?('Content-Type') @status = :resolved return @uri @@ -458,4 +479,12 @@ class FinalDestination end end + def fetch_canonical_url(body) + return if body.blank? + canonical_link = Nokogiri::HTML5(body).at("link[rel='canonical']") + + return if canonical_link.nil? + + canonical_link['href'] + end end diff --git a/spec/components/final_destination_spec.rb b/spec/components/final_destination_spec.rb index 8c626eb6575..8b781324bd5 100644 --- a/spec/components/final_destination_spec.rb +++ b/spec/components/final_destination_spec.rb @@ -49,6 +49,13 @@ describe FinalDestination do } end + def canonical_follow(from, dest) + stub_request(:get, from).to_return( + status: 200, + body: "" + ) + end + def redirect_response(from, dest) stub_request(:head, from).to_return( status: 302, @@ -175,6 +182,39 @@ describe FinalDestination do end end + context 'follows canonical links' do + it 'resolves the canonical link as the final destination' do + canonical_follow("https://eviltrout.com", "https://codinghorror.com/blog") + stub_request(:head, "https://codinghorror.com/blog").to_return(doc_response) + + final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true)) + + expect(final.resolve.to_s).to eq("https://codinghorror.com/blog") + expect(final.redirected?).to eq(false) + expect(final.status).to eq(:resolved) + end + + it "does not follow the canonical link if it's the same as the current URL" do + canonical_follow("https://eviltrout.com", "https://eviltrout.com") + + final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true)) + + expect(final.resolve.to_s).to eq("https://eviltrout.com") + expect(final.redirected?).to eq(false) + expect(final.status).to eq(:resolved) + end + + it "does not follow the canonical link if it's invalid" do + canonical_follow("https://eviltrout.com", "") + + final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true)) + + expect(final.resolve.to_s).to eq("https://eviltrout.com") + expect(final.redirected?).to eq(false) + expect(final.status).to eq(:resolved) + end + end + context "GET can be forced" do before do stub_request(:head, 'https://force.get.com/posts?page=4') diff --git a/spec/models/topic_embed_spec.rb b/spec/models/topic_embed_spec.rb index 694ad884248..8659eb7d569 100644 --- a/spec/models/topic_embed_spec.rb +++ b/spec/models/topic_embed_spec.rb @@ -190,18 +190,20 @@ describe TopicEmbed do end describe '.find_remote' do + fab!(:embeddable_host) { Fabricate(:embeddable_host) } + let(:file) { StringIO.new } + + before do + TopicEmbed.stubs(:open).returns file + end context ".title_scrub" do - let(:url) { 'http://eviltrout.com/123' } let(:contents) { "Through the Looking Glass - Classic Bookssome content here" } - fab!(:embeddable_host) { Fabricate(:embeddable_host) } - let!(:file) { StringIO.new } before do file.stubs(:read).returns contents - TopicEmbed.stubs(:open).returns file - stub_request(:head, url) + stub_request(:get, url) end it "doesn't scrub the title by default" do @@ -214,44 +216,38 @@ describe TopicEmbed do response = TopicEmbed.find_remote(url) expect(response.title).to eq("Through the Looking Glass") end - end context 'post with allowed classes "foo" and "emoji"' do fab!(:user) { Fabricate(:user) } let(:url) { 'http://eviltrout.com/123' } let(:contents) { "my normal size emoji

Hi

" } - fab!(:embeddable_host) { Fabricate(:embeddable_host) } - let!(:file) { StringIO.new } - - response = nil before do SiteSetting.allowed_embed_classnames = 'emoji, foo' file.stubs(:read).returns contents - TopicEmbed.stubs(:open).returns file - stub_request(:head, url) - response = TopicEmbed.find_remote(url) + stub_request(:get, url) + @response = TopicEmbed.find_remote(url) end it "has no author tag" do - expect(response.author).to be_blank + expect(@response.author).to be_blank end it 'img node has emoji class' do - expect(response.body).to have_tag('img', with: { class: 'emoji' }) + expect(@response.body).to have_tag('img', with: { class: 'emoji' }) end it 'img node has foo class' do - expect(response.body).to have_tag('img', with: { class: 'foo' }) + expect(@response.body).to have_tag('img', with: { class: 'foo' }) end it 'p node has foo class' do - expect(response.body).to have_tag('p', with: { class: 'foo' }) + expect(@response.body).to have_tag('p', with: { class: 'foo' }) end it 'nodes removes classes other than emoji' do - expect(response.body).to have_tag('img', without: { class: 'other' }) + expect(@response.body).to have_tag('img', without: { class: 'other' }) end end @@ -259,67 +255,56 @@ describe TopicEmbed do fab!(:user) { Fabricate(:user, username: 'eviltrout') } let(:url) { 'http://eviltrout.com/321' } let(:contents) { 'rich and morty' } - fab!(:embeddable_host) { Fabricate(:embeddable_host) } - let!(:file) { StringIO.new } - - response = nil before(:each) do file.stubs(:read).returns contents TopicEmbed.stubs(:open).returns file - stub_request(:head, url) - response = TopicEmbed.find_remote(url) + stub_request(:get, url) end it "has no author tag" do + response = TopicEmbed.find_remote(url) + expect(response.author).to eq(user) end end context 'post with no allowed classes' do - fab!(:user) { Fabricate(:user) } let(:url) { 'http://eviltrout.com/123' } let(:contents) { "my normal size emoji

Hi

" } - fab!(:embeddable_host) { Fabricate(:embeddable_host) } - let!(:file) { StringIO.new } - - response = nil before(:each) do SiteSetting.allowed_embed_classnames = '' file.stubs(:read).returns contents - TopicEmbed.stubs(:open).returns file - stub_request(:head, url) - response = TopicEmbed.find_remote(url) + stub_request(:get, url) + @response = TopicEmbed.find_remote(url) end it 'img node doesn\'t have emoji class' do - expect(response.body).to have_tag('img', without: { class: 'emoji' }) + expect(@response.body).to have_tag('img', without: { class: 'emoji' }) end it 'img node doesn\'t have foo class' do - expect(response.body).to have_tag('img', without: { class: 'foo' }) + expect(@response.body).to have_tag('img', without: { class: 'foo' }) end it 'p node doesn\'t foo class' do - expect(response.body).to have_tag('p', without: { class: 'foo' }) + expect(@response.body).to have_tag('p', without: { class: 'foo' }) end it 'img node doesn\'t have other class' do - expect(response.body).to have_tag('img', without: { class: 'other' }) + expect(@response.body).to have_tag('img', without: { class: 'other' }) end end context "non-ascii URL" do let(:url) { 'http://eviltrout.com/test/ماهی' } let(:contents) { "سلاماین یک پاراگراف آزمون است." } - fab!(:embeddable_host) { Fabricate(:embeddable_host) } - let!(:file) { StringIO.new } before do - stub_request(:head, url) - stub_request(:get, url).to_return(body: contents).then.to_raise + stub_request(:get, url) + file.stubs(:read).returns contents end it "doesn't throw an error" do @@ -331,12 +316,10 @@ describe TopicEmbed do context "encoded URL" do let(:url) { 'http://example.com/hello%20world' } let(:contents) { "Hello World!" } - fab!(:embeddable_host) { Fabricate(:embeddable_host) } - let!(:file) { StringIO.new } before do - stub_request(:head, url) - stub_request(:get, url).to_return(body: contents).then.to_raise + stub_request(:get, url) + file.stubs(:read).returns contents end it "doesn't throw an error" do @@ -356,17 +339,15 @@ describe TopicEmbed do context "emails" do let(:url) { 'http://example.com/foo' } let(:contents) { '

URL encoded @ symbol

normal mailto link

' } - fab!(:embeddable_host) { Fabricate(:embeddable_host) } - let!(:file) { StringIO.new } before do file.stubs(:read).returns contents - TopicEmbed.stubs(:open).returns file + stub_request(:get, url) end it "handles mailto links" do - stub_request(:head, url) response = TopicEmbed.find_remote(url) + expect(response.body).to have_tag('a', with: { href: 'mailto:foo%40example.com' }) expect(response.body).to have_tag('a', with: { href: 'mailto:bar@example.com' }) end @@ -375,18 +356,36 @@ describe TopicEmbed do context "malformed href" do let(:url) { 'http://example.com/foo' } let(:contents) { '

Baz

' } - let!(:file) { StringIO.new } before do file.stubs(:read).returns contents - TopicEmbed.stubs(:open).returns file + stub_request(:get, url) end it "doesn’t raise an exception" do - stub_request(:head, url) expect { TopicEmbed.find_remote(url) }.not_to raise_error end end + + context 'canonical links' do + let(:url) { 'http://eviltrout.com/123?asd' } + let(:canonical_url) { 'http://eviltrout.com/123' } + let(:content) { "" } + let(:canonical_content) { "Canonical" } + + before do + file.stubs(:read).returns canonical_content + stub_request(:get, url) + stub_request(:head, canonical_url) + end + + it 'a' do + response = TopicEmbed.find_remote(url) + + expect(response.title).to eq("Canonical") + end + + end end describe '.absolutize_urls' do