2
0
Fork 0
mirror of https://github.com/discourse/discourse.git synced 2025-09-06 10:50:21 +08:00

FIX: Follow the canonical URL when importing a remote topic. (#14489)

FinalDestination now supports the `follow_canonical` option, which will perform an initial GET request, parse the canonical link if present, and perform a HEAD request to it.

We use this mode during embeds to avoid treating URLs with different query parameters as different topics.
This commit is contained in:
Roman Rizzi 2021-10-01 12:48:21 -03:00 committed by GitHub
parent 0359adc0b8
commit 4c2d5158c5
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 122 additions and 53 deletions

View file

@ -113,7 +113,8 @@ class TopicEmbed < ActiveRecord::Base
fd = FinalDestination.new( fd = FinalDestination.new(
url, url,
validate_uri: true, validate_uri: true,
max_redirects: 5 max_redirects: 5,
follow_canonical: true,
) )
url = fd.resolve url = fd.resolve

View file

@ -63,7 +63,8 @@ class FinalDestination
end end
@status = :ready @status = :ready
@http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head @follow_canonical = @opts[:follow_canonical]
@http_verb = http_verb(@force_get_hosts, @follow_canonical)
@cookie = nil @cookie = nil
@limited_ips = [] @limited_ips = []
@verbose = @opts[:verbose] || false @verbose = @opts[:verbose] || false
@ -77,6 +78,14 @@ class FinalDestination
20 20
end end
def http_verb(force_get_hosts, follow_canonical)
if follow_canonical || force_get_hosts.any? { |host| hostname_matches?(host) }
:get
else
:head
end
end
def timeout def timeout
@timeout || FinalDestination.connection_timeout @timeout || FinalDestination.connection_timeout
end end
@ -215,6 +224,18 @@ class FinalDestination
end end
end end
if @follow_canonical
next_url = uri(fetch_canonical_url(response.body))
if next_url.to_s.present? && next_url != @uri
@follow_canonical = false
@uri = next_url
@http_verb = http_verb(@force_get_hosts, @follow_canonical)
return resolve
end
end
@content_type = response.headers['Content-Type'] if response.headers.has_key?('Content-Type') @content_type = response.headers['Content-Type'] if response.headers.has_key?('Content-Type')
@status = :resolved @status = :resolved
return @uri return @uri
@ -458,4 +479,12 @@ class FinalDestination
end end
end end
def fetch_canonical_url(body)
return if body.blank?
canonical_link = Nokogiri::HTML5(body).at("link[rel='canonical']")
return if canonical_link.nil?
canonical_link['href']
end
end end

View file

@ -49,6 +49,13 @@ describe FinalDestination do
} }
end end
def canonical_follow(from, dest)
stub_request(:get, from).to_return(
status: 200,
body: "<head><link rel=\"canonical\" href=\"#{dest}\"></head>"
)
end
def redirect_response(from, dest) def redirect_response(from, dest)
stub_request(:head, from).to_return( stub_request(:head, from).to_return(
status: 302, status: 302,
@ -175,6 +182,39 @@ describe FinalDestination do
end end
end end
context 'follows canonical links' do
it 'resolves the canonical link as the final destination' do
canonical_follow("https://eviltrout.com", "https://codinghorror.com/blog")
stub_request(:head, "https://codinghorror.com/blog").to_return(doc_response)
final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
expect(final.resolve.to_s).to eq("https://codinghorror.com/blog")
expect(final.redirected?).to eq(false)
expect(final.status).to eq(:resolved)
end
it "does not follow the canonical link if it's the same as the current URL" do
canonical_follow("https://eviltrout.com", "https://eviltrout.com")
final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
expect(final.resolve.to_s).to eq("https://eviltrout.com")
expect(final.redirected?).to eq(false)
expect(final.status).to eq(:resolved)
end
it "does not follow the canonical link if it's invalid" do
canonical_follow("https://eviltrout.com", "")
final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
expect(final.resolve.to_s).to eq("https://eviltrout.com")
expect(final.redirected?).to eq(false)
expect(final.status).to eq(:resolved)
end
end
context "GET can be forced" do context "GET can be forced" do
before do before do
stub_request(:head, 'https://force.get.com/posts?page=4') stub_request(:head, 'https://force.get.com/posts?page=4')

View file

@ -190,18 +190,20 @@ describe TopicEmbed do
end end
describe '.find_remote' do describe '.find_remote' do
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
let(:file) { StringIO.new }
before do
TopicEmbed.stubs(:open).returns file
end
context ".title_scrub" do context ".title_scrub" do
let(:url) { 'http://eviltrout.com/123' } let(:url) { 'http://eviltrout.com/123' }
let(:contents) { "<title>Through the Looking Glass - Classic Books</title><body>some content here</body>" } let(:contents) { "<title>Through the Looking Glass - Classic Books</title><body>some content here</body>" }
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
before do before do
file.stubs(:read).returns contents file.stubs(:read).returns contents
TopicEmbed.stubs(:open).returns file stub_request(:get, url)
stub_request(:head, url)
end end
it "doesn't scrub the title by default" do it "doesn't scrub the title by default" do
@ -214,44 +216,38 @@ describe TopicEmbed do
response = TopicEmbed.find_remote(url) response = TopicEmbed.find_remote(url)
expect(response.title).to eq("Through the Looking Glass") expect(response.title).to eq("Through the Looking Glass")
end end
end end
context 'post with allowed classes "foo" and "emoji"' do context 'post with allowed classes "foo" and "emoji"' do
fab!(:user) { Fabricate(:user) } fab!(:user) { Fabricate(:user) }
let(:url) { 'http://eviltrout.com/123' } let(:url) { 'http://eviltrout.com/123' }
let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" } let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" }
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
response = nil
before do before do
SiteSetting.allowed_embed_classnames = 'emoji, foo' SiteSetting.allowed_embed_classnames = 'emoji, foo'
file.stubs(:read).returns contents file.stubs(:read).returns contents
TopicEmbed.stubs(:open).returns file stub_request(:get, url)
stub_request(:head, url) @response = TopicEmbed.find_remote(url)
response = TopicEmbed.find_remote(url)
end end
it "has no author tag" do it "has no author tag" do
expect(response.author).to be_blank expect(@response.author).to be_blank
end end
it 'img node has emoji class' do it 'img node has emoji class' do
expect(response.body).to have_tag('img', with: { class: 'emoji' }) expect(@response.body).to have_tag('img', with: { class: 'emoji' })
end end
it 'img node has foo class' do it 'img node has foo class' do
expect(response.body).to have_tag('img', with: { class: 'foo' }) expect(@response.body).to have_tag('img', with: { class: 'foo' })
end end
it 'p node has foo class' do it 'p node has foo class' do
expect(response.body).to have_tag('p', with: { class: 'foo' }) expect(@response.body).to have_tag('p', with: { class: 'foo' })
end end
it 'nodes removes classes other than emoji' do it 'nodes removes classes other than emoji' do
expect(response.body).to have_tag('img', without: { class: 'other' }) expect(@response.body).to have_tag('img', without: { class: 'other' })
end end
end end
@ -259,67 +255,56 @@ describe TopicEmbed do
fab!(:user) { Fabricate(:user, username: 'eviltrout') } fab!(:user) { Fabricate(:user, username: 'eviltrout') }
let(:url) { 'http://eviltrout.com/321' } let(:url) { 'http://eviltrout.com/321' }
let(:contents) { '<html><head><meta name="author" content="eviltrout"></head><body>rich and morty</body></html>' } let(:contents) { '<html><head><meta name="author" content="eviltrout"></head><body>rich and morty</body></html>' }
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
response = nil
before(:each) do before(:each) do
file.stubs(:read).returns contents file.stubs(:read).returns contents
TopicEmbed.stubs(:open).returns file TopicEmbed.stubs(:open).returns file
stub_request(:head, url) stub_request(:get, url)
response = TopicEmbed.find_remote(url)
end end
it "has no author tag" do it "has no author tag" do
response = TopicEmbed.find_remote(url)
expect(response.author).to eq(user) expect(response.author).to eq(user)
end end
end end
context 'post with no allowed classes' do context 'post with no allowed classes' do
fab!(:user) { Fabricate(:user) } fab!(:user) { Fabricate(:user) }
let(:url) { 'http://eviltrout.com/123' } let(:url) { 'http://eviltrout.com/123' }
let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" } let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" }
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
response = nil
before(:each) do before(:each) do
SiteSetting.allowed_embed_classnames = '' SiteSetting.allowed_embed_classnames = ''
file.stubs(:read).returns contents file.stubs(:read).returns contents
TopicEmbed.stubs(:open).returns file stub_request(:get, url)
stub_request(:head, url) @response = TopicEmbed.find_remote(url)
response = TopicEmbed.find_remote(url)
end end
it 'img node doesn\'t have emoji class' do it 'img node doesn\'t have emoji class' do
expect(response.body).to have_tag('img', without: { class: 'emoji' }) expect(@response.body).to have_tag('img', without: { class: 'emoji' })
end end
it 'img node doesn\'t have foo class' do it 'img node doesn\'t have foo class' do
expect(response.body).to have_tag('img', without: { class: 'foo' }) expect(@response.body).to have_tag('img', without: { class: 'foo' })
end end
it 'p node doesn\'t foo class' do it 'p node doesn\'t foo class' do
expect(response.body).to have_tag('p', without: { class: 'foo' }) expect(@response.body).to have_tag('p', without: { class: 'foo' })
end end
it 'img node doesn\'t have other class' do it 'img node doesn\'t have other class' do
expect(response.body).to have_tag('img', without: { class: 'other' }) expect(@response.body).to have_tag('img', without: { class: 'other' })
end end
end end
context "non-ascii URL" do context "non-ascii URL" do
let(:url) { 'http://eviltrout.com/test/ماهی' } let(:url) { 'http://eviltrout.com/test/ماهی' }
let(:contents) { "<title>سلام</title><body>این یک پاراگراف آزمون است.</body>" } let(:contents) { "<title>سلام</title><body>این یک پاراگراف آزمون است.</body>" }
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
before do before do
stub_request(:head, url) stub_request(:get, url)
stub_request(:get, url).to_return(body: contents).then.to_raise file.stubs(:read).returns contents
end end
it "doesn't throw an error" do it "doesn't throw an error" do
@ -331,12 +316,10 @@ describe TopicEmbed do
context "encoded URL" do context "encoded URL" do
let(:url) { 'http://example.com/hello%20world' } let(:url) { 'http://example.com/hello%20world' }
let(:contents) { "<title>Hello World!</title><body></body>" } let(:contents) { "<title>Hello World!</title><body></body>" }
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
before do before do
stub_request(:head, url) stub_request(:get, url)
stub_request(:get, url).to_return(body: contents).then.to_raise file.stubs(:read).returns contents
end end
it "doesn't throw an error" do it "doesn't throw an error" do
@ -356,17 +339,15 @@ describe TopicEmbed do
context "emails" do context "emails" do
let(:url) { 'http://example.com/foo' } let(:url) { 'http://example.com/foo' }
let(:contents) { '<p><a href="mailto:foo%40example.com">URL encoded @ symbol</a></p><p><a href="mailto:bar@example.com">normal mailto link</a></p>' } let(:contents) { '<p><a href="mailto:foo%40example.com">URL encoded @ symbol</a></p><p><a href="mailto:bar@example.com">normal mailto link</a></p>' }
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
let!(:file) { StringIO.new }
before do before do
file.stubs(:read).returns contents file.stubs(:read).returns contents
TopicEmbed.stubs(:open).returns file stub_request(:get, url)
end end
it "handles mailto links" do it "handles mailto links" do
stub_request(:head, url)
response = TopicEmbed.find_remote(url) response = TopicEmbed.find_remote(url)
expect(response.body).to have_tag('a', with: { href: 'mailto:foo%40example.com' }) expect(response.body).to have_tag('a', with: { href: 'mailto:foo%40example.com' })
expect(response.body).to have_tag('a', with: { href: 'mailto:bar@example.com' }) expect(response.body).to have_tag('a', with: { href: 'mailto:bar@example.com' })
end end
@ -375,18 +356,36 @@ describe TopicEmbed do
context "malformed href" do context "malformed href" do
let(:url) { 'http://example.com/foo' } let(:url) { 'http://example.com/foo' }
let(:contents) { '<p><a href="(http://foo.bar)">Baz</a></p>' } let(:contents) { '<p><a href="(http://foo.bar)">Baz</a></p>' }
let!(:file) { StringIO.new }
before do before do
file.stubs(:read).returns contents file.stubs(:read).returns contents
TopicEmbed.stubs(:open).returns file stub_request(:get, url)
end end
it "doesnt raise an exception" do it "doesnt raise an exception" do
stub_request(:head, url)
expect { TopicEmbed.find_remote(url) }.not_to raise_error expect { TopicEmbed.find_remote(url) }.not_to raise_error
end end
end end
context 'canonical links' do
let(:url) { 'http://eviltrout.com/123?asd' }
let(:canonical_url) { 'http://eviltrout.com/123' }
let(:content) { "<head><link rel=\"canonical\" href=\"#{canonical_url}\"></head>" }
let(:canonical_content) { "<title>Canonical</title><body></body>" }
before do
file.stubs(:read).returns canonical_content
stub_request(:get, url)
stub_request(:head, canonical_url)
end
it 'a' do
response = TopicEmbed.find_remote(url)
expect(response.title).to eq("Canonical")
end
end
end end
describe '.absolutize_urls' do describe '.absolutize_urls' do