mirror of
https://github.com/discourse/discourse.git
synced 2025-09-07 12:02:53 +08:00
FIX: Follow the canonical URL when importing a remote topic. (#14489)
FinalDestination now supports the `follow_canonical` option, which will perform an initial GET request, parse the canonical link if present, and perform a HEAD request to it. We use this mode during embeds to avoid treating URLs with different query parameters as different topics.
This commit is contained in:
parent
0359adc0b8
commit
4c2d5158c5
4 changed files with 122 additions and 53 deletions
|
@ -113,7 +113,8 @@ class TopicEmbed < ActiveRecord::Base
|
||||||
fd = FinalDestination.new(
|
fd = FinalDestination.new(
|
||||||
url,
|
url,
|
||||||
validate_uri: true,
|
validate_uri: true,
|
||||||
max_redirects: 5
|
max_redirects: 5,
|
||||||
|
follow_canonical: true,
|
||||||
)
|
)
|
||||||
|
|
||||||
url = fd.resolve
|
url = fd.resolve
|
||||||
|
|
|
@ -63,7 +63,8 @@ class FinalDestination
|
||||||
end
|
end
|
||||||
|
|
||||||
@status = :ready
|
@status = :ready
|
||||||
@http_verb = @force_get_hosts.any? { |host| hostname_matches?(host) } ? :get : :head
|
@follow_canonical = @opts[:follow_canonical]
|
||||||
|
@http_verb = http_verb(@force_get_hosts, @follow_canonical)
|
||||||
@cookie = nil
|
@cookie = nil
|
||||||
@limited_ips = []
|
@limited_ips = []
|
||||||
@verbose = @opts[:verbose] || false
|
@verbose = @opts[:verbose] || false
|
||||||
|
@ -77,6 +78,14 @@ class FinalDestination
|
||||||
20
|
20
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def http_verb(force_get_hosts, follow_canonical)
|
||||||
|
if follow_canonical || force_get_hosts.any? { |host| hostname_matches?(host) }
|
||||||
|
:get
|
||||||
|
else
|
||||||
|
:head
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
def timeout
|
def timeout
|
||||||
@timeout || FinalDestination.connection_timeout
|
@timeout || FinalDestination.connection_timeout
|
||||||
end
|
end
|
||||||
|
@ -215,6 +224,18 @@ class FinalDestination
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
if @follow_canonical
|
||||||
|
next_url = uri(fetch_canonical_url(response.body))
|
||||||
|
|
||||||
|
if next_url.to_s.present? && next_url != @uri
|
||||||
|
@follow_canonical = false
|
||||||
|
@uri = next_url
|
||||||
|
@http_verb = http_verb(@force_get_hosts, @follow_canonical)
|
||||||
|
|
||||||
|
return resolve
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
@content_type = response.headers['Content-Type'] if response.headers.has_key?('Content-Type')
|
@content_type = response.headers['Content-Type'] if response.headers.has_key?('Content-Type')
|
||||||
@status = :resolved
|
@status = :resolved
|
||||||
return @uri
|
return @uri
|
||||||
|
@ -458,4 +479,12 @@ class FinalDestination
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def fetch_canonical_url(body)
|
||||||
|
return if body.blank?
|
||||||
|
canonical_link = Nokogiri::HTML5(body).at("link[rel='canonical']")
|
||||||
|
|
||||||
|
return if canonical_link.nil?
|
||||||
|
|
||||||
|
canonical_link['href']
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -49,6 +49,13 @@ describe FinalDestination do
|
||||||
}
|
}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def canonical_follow(from, dest)
|
||||||
|
stub_request(:get, from).to_return(
|
||||||
|
status: 200,
|
||||||
|
body: "<head><link rel=\"canonical\" href=\"#{dest}\"></head>"
|
||||||
|
)
|
||||||
|
end
|
||||||
|
|
||||||
def redirect_response(from, dest)
|
def redirect_response(from, dest)
|
||||||
stub_request(:head, from).to_return(
|
stub_request(:head, from).to_return(
|
||||||
status: 302,
|
status: 302,
|
||||||
|
@ -175,6 +182,39 @@ describe FinalDestination do
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context 'follows canonical links' do
|
||||||
|
it 'resolves the canonical link as the final destination' do
|
||||||
|
canonical_follow("https://eviltrout.com", "https://codinghorror.com/blog")
|
||||||
|
stub_request(:head, "https://codinghorror.com/blog").to_return(doc_response)
|
||||||
|
|
||||||
|
final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
|
||||||
|
|
||||||
|
expect(final.resolve.to_s).to eq("https://codinghorror.com/blog")
|
||||||
|
expect(final.redirected?).to eq(false)
|
||||||
|
expect(final.status).to eq(:resolved)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does not follow the canonical link if it's the same as the current URL" do
|
||||||
|
canonical_follow("https://eviltrout.com", "https://eviltrout.com")
|
||||||
|
|
||||||
|
final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
|
||||||
|
|
||||||
|
expect(final.resolve.to_s).to eq("https://eviltrout.com")
|
||||||
|
expect(final.redirected?).to eq(false)
|
||||||
|
expect(final.status).to eq(:resolved)
|
||||||
|
end
|
||||||
|
|
||||||
|
it "does not follow the canonical link if it's invalid" do
|
||||||
|
canonical_follow("https://eviltrout.com", "")
|
||||||
|
|
||||||
|
final = FinalDestination.new('https://eviltrout.com', opts.merge(follow_canonical: true))
|
||||||
|
|
||||||
|
expect(final.resolve.to_s).to eq("https://eviltrout.com")
|
||||||
|
expect(final.redirected?).to eq(false)
|
||||||
|
expect(final.status).to eq(:resolved)
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
context "GET can be forced" do
|
context "GET can be forced" do
|
||||||
before do
|
before do
|
||||||
stub_request(:head, 'https://force.get.com/posts?page=4')
|
stub_request(:head, 'https://force.get.com/posts?page=4')
|
||||||
|
|
|
@ -190,18 +190,20 @@ describe TopicEmbed do
|
||||||
end
|
end
|
||||||
|
|
||||||
describe '.find_remote' do
|
describe '.find_remote' do
|
||||||
|
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
|
||||||
|
let(:file) { StringIO.new }
|
||||||
|
|
||||||
|
before do
|
||||||
|
TopicEmbed.stubs(:open).returns file
|
||||||
|
end
|
||||||
|
|
||||||
context ".title_scrub" do
|
context ".title_scrub" do
|
||||||
|
|
||||||
let(:url) { 'http://eviltrout.com/123' }
|
let(:url) { 'http://eviltrout.com/123' }
|
||||||
let(:contents) { "<title>Through the Looking Glass - Classic Books</title><body>some content here</body>" }
|
let(:contents) { "<title>Through the Looking Glass - Classic Books</title><body>some content here</body>" }
|
||||||
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
|
|
||||||
let!(:file) { StringIO.new }
|
|
||||||
|
|
||||||
before do
|
before do
|
||||||
file.stubs(:read).returns contents
|
file.stubs(:read).returns contents
|
||||||
TopicEmbed.stubs(:open).returns file
|
stub_request(:get, url)
|
||||||
stub_request(:head, url)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
it "doesn't scrub the title by default" do
|
it "doesn't scrub the title by default" do
|
||||||
|
@ -214,44 +216,38 @@ describe TopicEmbed do
|
||||||
response = TopicEmbed.find_remote(url)
|
response = TopicEmbed.find_remote(url)
|
||||||
expect(response.title).to eq("Through the Looking Glass")
|
expect(response.title).to eq("Through the Looking Glass")
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
||||||
context 'post with allowed classes "foo" and "emoji"' do
|
context 'post with allowed classes "foo" and "emoji"' do
|
||||||
fab!(:user) { Fabricate(:user) }
|
fab!(:user) { Fabricate(:user) }
|
||||||
let(:url) { 'http://eviltrout.com/123' }
|
let(:url) { 'http://eviltrout.com/123' }
|
||||||
let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" }
|
let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" }
|
||||||
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
|
|
||||||
let!(:file) { StringIO.new }
|
|
||||||
|
|
||||||
response = nil
|
|
||||||
|
|
||||||
before do
|
before do
|
||||||
SiteSetting.allowed_embed_classnames = 'emoji, foo'
|
SiteSetting.allowed_embed_classnames = 'emoji, foo'
|
||||||
file.stubs(:read).returns contents
|
file.stubs(:read).returns contents
|
||||||
TopicEmbed.stubs(:open).returns file
|
stub_request(:get, url)
|
||||||
stub_request(:head, url)
|
@response = TopicEmbed.find_remote(url)
|
||||||
response = TopicEmbed.find_remote(url)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
it "has no author tag" do
|
it "has no author tag" do
|
||||||
expect(response.author).to be_blank
|
expect(@response.author).to be_blank
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'img node has emoji class' do
|
it 'img node has emoji class' do
|
||||||
expect(response.body).to have_tag('img', with: { class: 'emoji' })
|
expect(@response.body).to have_tag('img', with: { class: 'emoji' })
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'img node has foo class' do
|
it 'img node has foo class' do
|
||||||
expect(response.body).to have_tag('img', with: { class: 'foo' })
|
expect(@response.body).to have_tag('img', with: { class: 'foo' })
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'p node has foo class' do
|
it 'p node has foo class' do
|
||||||
expect(response.body).to have_tag('p', with: { class: 'foo' })
|
expect(@response.body).to have_tag('p', with: { class: 'foo' })
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'nodes removes classes other than emoji' do
|
it 'nodes removes classes other than emoji' do
|
||||||
expect(response.body).to have_tag('img', without: { class: 'other' })
|
expect(@response.body).to have_tag('img', without: { class: 'other' })
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -259,67 +255,56 @@ describe TopicEmbed do
|
||||||
fab!(:user) { Fabricate(:user, username: 'eviltrout') }
|
fab!(:user) { Fabricate(:user, username: 'eviltrout') }
|
||||||
let(:url) { 'http://eviltrout.com/321' }
|
let(:url) { 'http://eviltrout.com/321' }
|
||||||
let(:contents) { '<html><head><meta name="author" content="eviltrout"></head><body>rich and morty</body></html>' }
|
let(:contents) { '<html><head><meta name="author" content="eviltrout"></head><body>rich and morty</body></html>' }
|
||||||
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
|
|
||||||
let!(:file) { StringIO.new }
|
|
||||||
|
|
||||||
response = nil
|
|
||||||
|
|
||||||
before(:each) do
|
before(:each) do
|
||||||
file.stubs(:read).returns contents
|
file.stubs(:read).returns contents
|
||||||
TopicEmbed.stubs(:open).returns file
|
TopicEmbed.stubs(:open).returns file
|
||||||
stub_request(:head, url)
|
stub_request(:get, url)
|
||||||
response = TopicEmbed.find_remote(url)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
it "has no author tag" do
|
it "has no author tag" do
|
||||||
|
response = TopicEmbed.find_remote(url)
|
||||||
|
|
||||||
expect(response.author).to eq(user)
|
expect(response.author).to eq(user)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
context 'post with no allowed classes' do
|
context 'post with no allowed classes' do
|
||||||
|
|
||||||
fab!(:user) { Fabricate(:user) }
|
fab!(:user) { Fabricate(:user) }
|
||||||
let(:url) { 'http://eviltrout.com/123' }
|
let(:url) { 'http://eviltrout.com/123' }
|
||||||
let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" }
|
let(:contents) { "my normal size emoji <p class='foo'>Hi</p> <img class='emoji other foo' src='/images/smiley.jpg'>" }
|
||||||
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
|
|
||||||
let!(:file) { StringIO.new }
|
|
||||||
|
|
||||||
response = nil
|
|
||||||
|
|
||||||
before(:each) do
|
before(:each) do
|
||||||
SiteSetting.allowed_embed_classnames = ''
|
SiteSetting.allowed_embed_classnames = ''
|
||||||
file.stubs(:read).returns contents
|
file.stubs(:read).returns contents
|
||||||
TopicEmbed.stubs(:open).returns file
|
stub_request(:get, url)
|
||||||
stub_request(:head, url)
|
@response = TopicEmbed.find_remote(url)
|
||||||
response = TopicEmbed.find_remote(url)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'img node doesn\'t have emoji class' do
|
it 'img node doesn\'t have emoji class' do
|
||||||
expect(response.body).to have_tag('img', without: { class: 'emoji' })
|
expect(@response.body).to have_tag('img', without: { class: 'emoji' })
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'img node doesn\'t have foo class' do
|
it 'img node doesn\'t have foo class' do
|
||||||
expect(response.body).to have_tag('img', without: { class: 'foo' })
|
expect(@response.body).to have_tag('img', without: { class: 'foo' })
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'p node doesn\'t foo class' do
|
it 'p node doesn\'t foo class' do
|
||||||
expect(response.body).to have_tag('p', without: { class: 'foo' })
|
expect(@response.body).to have_tag('p', without: { class: 'foo' })
|
||||||
end
|
end
|
||||||
|
|
||||||
it 'img node doesn\'t have other class' do
|
it 'img node doesn\'t have other class' do
|
||||||
expect(response.body).to have_tag('img', without: { class: 'other' })
|
expect(@response.body).to have_tag('img', without: { class: 'other' })
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
context "non-ascii URL" do
|
context "non-ascii URL" do
|
||||||
let(:url) { 'http://eviltrout.com/test/ماهی' }
|
let(:url) { 'http://eviltrout.com/test/ماهی' }
|
||||||
let(:contents) { "<title>سلام</title><body>این یک پاراگراف آزمون است.</body>" }
|
let(:contents) { "<title>سلام</title><body>این یک پاراگراف آزمون است.</body>" }
|
||||||
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
|
|
||||||
let!(:file) { StringIO.new }
|
|
||||||
|
|
||||||
before do
|
before do
|
||||||
stub_request(:head, url)
|
stub_request(:get, url)
|
||||||
stub_request(:get, url).to_return(body: contents).then.to_raise
|
file.stubs(:read).returns contents
|
||||||
end
|
end
|
||||||
|
|
||||||
it "doesn't throw an error" do
|
it "doesn't throw an error" do
|
||||||
|
@ -331,12 +316,10 @@ describe TopicEmbed do
|
||||||
context "encoded URL" do
|
context "encoded URL" do
|
||||||
let(:url) { 'http://example.com/hello%20world' }
|
let(:url) { 'http://example.com/hello%20world' }
|
||||||
let(:contents) { "<title>Hello World!</title><body></body>" }
|
let(:contents) { "<title>Hello World!</title><body></body>" }
|
||||||
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
|
|
||||||
let!(:file) { StringIO.new }
|
|
||||||
|
|
||||||
before do
|
before do
|
||||||
stub_request(:head, url)
|
stub_request(:get, url)
|
||||||
stub_request(:get, url).to_return(body: contents).then.to_raise
|
file.stubs(:read).returns contents
|
||||||
end
|
end
|
||||||
|
|
||||||
it "doesn't throw an error" do
|
it "doesn't throw an error" do
|
||||||
|
@ -356,17 +339,15 @@ describe TopicEmbed do
|
||||||
context "emails" do
|
context "emails" do
|
||||||
let(:url) { 'http://example.com/foo' }
|
let(:url) { 'http://example.com/foo' }
|
||||||
let(:contents) { '<p><a href="mailto:foo%40example.com">URL encoded @ symbol</a></p><p><a href="mailto:bar@example.com">normal mailto link</a></p>' }
|
let(:contents) { '<p><a href="mailto:foo%40example.com">URL encoded @ symbol</a></p><p><a href="mailto:bar@example.com">normal mailto link</a></p>' }
|
||||||
fab!(:embeddable_host) { Fabricate(:embeddable_host) }
|
|
||||||
let!(:file) { StringIO.new }
|
|
||||||
|
|
||||||
before do
|
before do
|
||||||
file.stubs(:read).returns contents
|
file.stubs(:read).returns contents
|
||||||
TopicEmbed.stubs(:open).returns file
|
stub_request(:get, url)
|
||||||
end
|
end
|
||||||
|
|
||||||
it "handles mailto links" do
|
it "handles mailto links" do
|
||||||
stub_request(:head, url)
|
|
||||||
response = TopicEmbed.find_remote(url)
|
response = TopicEmbed.find_remote(url)
|
||||||
|
|
||||||
expect(response.body).to have_tag('a', with: { href: 'mailto:foo%40example.com' })
|
expect(response.body).to have_tag('a', with: { href: 'mailto:foo%40example.com' })
|
||||||
expect(response.body).to have_tag('a', with: { href: 'mailto:bar@example.com' })
|
expect(response.body).to have_tag('a', with: { href: 'mailto:bar@example.com' })
|
||||||
end
|
end
|
||||||
|
@ -375,18 +356,36 @@ describe TopicEmbed do
|
||||||
context "malformed href" do
|
context "malformed href" do
|
||||||
let(:url) { 'http://example.com/foo' }
|
let(:url) { 'http://example.com/foo' }
|
||||||
let(:contents) { '<p><a href="(http://foo.bar)">Baz</a></p>' }
|
let(:contents) { '<p><a href="(http://foo.bar)">Baz</a></p>' }
|
||||||
let!(:file) { StringIO.new }
|
|
||||||
|
|
||||||
before do
|
before do
|
||||||
file.stubs(:read).returns contents
|
file.stubs(:read).returns contents
|
||||||
TopicEmbed.stubs(:open).returns file
|
stub_request(:get, url)
|
||||||
end
|
end
|
||||||
|
|
||||||
it "doesn’t raise an exception" do
|
it "doesn’t raise an exception" do
|
||||||
stub_request(:head, url)
|
|
||||||
expect { TopicEmbed.find_remote(url) }.not_to raise_error
|
expect { TopicEmbed.find_remote(url) }.not_to raise_error
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
context 'canonical links' do
|
||||||
|
let(:url) { 'http://eviltrout.com/123?asd' }
|
||||||
|
let(:canonical_url) { 'http://eviltrout.com/123' }
|
||||||
|
let(:content) { "<head><link rel=\"canonical\" href=\"#{canonical_url}\"></head>" }
|
||||||
|
let(:canonical_content) { "<title>Canonical</title><body></body>" }
|
||||||
|
|
||||||
|
before do
|
||||||
|
file.stubs(:read).returns canonical_content
|
||||||
|
stub_request(:get, url)
|
||||||
|
stub_request(:head, canonical_url)
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'a' do
|
||||||
|
response = TopicEmbed.find_remote(url)
|
||||||
|
|
||||||
|
expect(response.title).to eq("Canonical")
|
||||||
|
end
|
||||||
|
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
describe '.absolutize_urls' do
|
describe '.absolutize_urls' do
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue