mirror of
https://gh.wpcy.net/https://github.com/discourse/discourse.git
synced 2026-05-25 16:42:28 +08:00
Previously, direct upload requests with crawler localization parameters could still be parsed as HTML when the response carried crawler HTML headers, raising Nokogiri document depth errors. This change skips localization rewriting for non-localizable request paths before parsing the response, including relative URL root installs.
391 lines
12 KiB
Ruby
Vendored
391 lines
12 KiB
Ruby
Vendored
# frozen_string_literal: true
|
|
|
|
describe Middleware::CrawlerHooks do
|
|
let(:crawler_user_agent) { "GoogleBot/2.1 (+https://www.google.com/bot.html)" }
|
|
let(:regular_user_agent) { "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" }
|
|
let(:html_response) do
|
|
html_arr = [
|
|
"<html><body><a href=\"/test\">Test</a><a href=\"https://discourse.org/page\">External</a></body></html>",
|
|
]
|
|
def html_arr.body
|
|
join("")
|
|
end
|
|
html_arr
|
|
end
|
|
|
|
let(:middleware) { Middleware::CrawlerHooks.new(app) }
|
|
let(:app) do
|
|
lambda do |env|
|
|
headers = { "Content-Type" => "text/html; charset=utf-8" }
|
|
headers["X-Discourse-Crawler-View"] = "true" if env["X-Discourse-Crawler-View"]
|
|
[200, headers, html_response]
|
|
end
|
|
end
|
|
let(:json_middleware) do
|
|
Middleware::CrawlerHooks.new(
|
|
lambda do |_|
|
|
[
|
|
200,
|
|
{
|
|
"Content-Type" => "application/json; charset=utf-8",
|
|
"X-Discourse-Crawler-View" => "true",
|
|
},
|
|
['{ "key": "value" }'],
|
|
]
|
|
end,
|
|
)
|
|
end
|
|
let(:binary_middleware) do
|
|
Middleware::CrawlerHooks.new(
|
|
lambda do |_|
|
|
[
|
|
200,
|
|
{
|
|
"Content-Type" => "application/zip",
|
|
"Content-Disposition" => "attachment; filename=\"file.zip\"",
|
|
"X-Discourse-Crawler-View" => "true",
|
|
},
|
|
["PK\x03\x04binarydata".b],
|
|
]
|
|
end,
|
|
)
|
|
end
|
|
let(:error_middleware) do
|
|
Middleware::CrawlerHooks.new(
|
|
lambda do |_|
|
|
[
|
|
404,
|
|
{ "Content-Type" => "text/html; charset=utf-8" },
|
|
["<html><body>Not found</body></html>"],
|
|
]
|
|
end,
|
|
)
|
|
end
|
|
|
|
def env(opts = {})
|
|
path = opts.delete(:path) || "https://discourse.site"
|
|
params = opts.delete(:params) || {}
|
|
Rack::MockRequest.env_for(path, params: params).merge(opts)
|
|
end
|
|
|
|
before do
|
|
SiteSetting.content_localization_enabled = false
|
|
SiteSetting.content_localization_crawler_param = false
|
|
end
|
|
|
|
describe "handling regular users" do
|
|
it "does not modify responses for non-crawler requests" do
|
|
status, headers, response = middleware.call(env("HTTP_USER_AGENT" => regular_user_agent))
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to include("text/html")
|
|
expect(response).to eq(html_response)
|
|
end
|
|
end
|
|
|
|
describe "handling crawler requests" do
|
|
it "does not modify responses without X-Discourse-Crawler-View header" do
|
|
status, headers, response = middleware.call(env("HTTP_USER_AGENT" => crawler_user_agent))
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to include("text/html")
|
|
expect(response).to eq(html_response)
|
|
end
|
|
|
|
it "does not modify responses for non-HTML content types" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = true
|
|
|
|
status, headers, response =
|
|
json_middleware.call(
|
|
env(
|
|
:path => "https://discourse.site",
|
|
:params => {
|
|
Discourse::LOCALE_PARAM => "fr",
|
|
},
|
|
"HTTP_USER_AGENT" => crawler_user_agent,
|
|
),
|
|
)
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to include("application/json")
|
|
expect(response).to eq(['{ "key": "value" }'])
|
|
end
|
|
|
|
it "does not parse binary file downloads as HTML" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = true
|
|
|
|
status, headers, response =
|
|
binary_middleware.call(
|
|
env(
|
|
:path => "https://discourse.site/uploads/short-url/abc.zip",
|
|
:params => {
|
|
Discourse::LOCALE_PARAM => "fr",
|
|
},
|
|
"HTTP_USER_AGENT" => crawler_user_agent,
|
|
),
|
|
)
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to eq("application/zip")
|
|
expect(response).to eq(["PK\x03\x04binarydata".b])
|
|
end
|
|
|
|
it "does not parse direct upload requests served with an HTML content type" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = true
|
|
|
|
response_body = ["<div>" * 500]
|
|
def response_body.body
|
|
join("")
|
|
end
|
|
|
|
middleware =
|
|
Middleware::CrawlerHooks.new(
|
|
lambda do |_|
|
|
[
|
|
200,
|
|
{
|
|
"Content-Type" => "text/html; charset=utf-8",
|
|
"X-Discourse-Crawler-View" => "true",
|
|
},
|
|
response_body,
|
|
]
|
|
end,
|
|
)
|
|
|
|
status, headers, response =
|
|
middleware.call(
|
|
env(
|
|
:path => "https://discourse.site/uploads/short-url/abc.zip",
|
|
:params => {
|
|
Discourse::LOCALE_PARAM => "fr",
|
|
},
|
|
"HTTP_USER_AGENT" => crawler_user_agent,
|
|
),
|
|
)
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to include("text/html")
|
|
expect(response).to eq(response_body)
|
|
end
|
|
|
|
it "does not parse direct upload requests served from a base path" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = true
|
|
Discourse.stubs(:base_path).returns("/forum")
|
|
|
|
response_body = ["<div>" * 500]
|
|
def response_body.body
|
|
join("")
|
|
end
|
|
|
|
middleware =
|
|
Middleware::CrawlerHooks.new(
|
|
lambda do |_|
|
|
[
|
|
200,
|
|
{
|
|
"Content-Type" => "text/html; charset=utf-8",
|
|
"X-Discourse-Crawler-View" => "true",
|
|
},
|
|
response_body,
|
|
]
|
|
end,
|
|
)
|
|
|
|
status, headers, response =
|
|
middleware.call(
|
|
env(
|
|
:path => "https://discourse.site/forum/uploads/short-url/abc.zip",
|
|
:params => {
|
|
Discourse::LOCALE_PARAM => "fr",
|
|
},
|
|
"HTTP_USER_AGENT" => crawler_user_agent,
|
|
),
|
|
)
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to include("text/html")
|
|
expect(response).to eq(response_body)
|
|
end
|
|
|
|
it "does not modify responses for non-200 status codes" do
|
|
status, headers, response =
|
|
error_middleware.call(env("HTTP_USER_AGENT" => crawler_user_agent))
|
|
|
|
expect(status).to eq(404)
|
|
expect(response).to eq(["<html><body>Not found</body></html>"])
|
|
end
|
|
|
|
it "does not modify HTML responses when content_localization is disabled" do
|
|
SiteSetting.content_localization_enabled = false
|
|
SiteSetting.content_localization_crawler_param = true
|
|
|
|
status, headers, response =
|
|
middleware.call(
|
|
env(
|
|
:path => "https://discourse.site",
|
|
:params => {
|
|
"locale" => "fr",
|
|
},
|
|
"HTTP_USER_AGENT" => crawler_user_agent,
|
|
"X-Discourse-Crawler-View" => true,
|
|
),
|
|
)
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to include("text/html")
|
|
expect(response).to eq(html_response)
|
|
end
|
|
|
|
it "does not modify HTML responses when crawler_param is disabled" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = false
|
|
|
|
status, headers, response =
|
|
middleware.call(
|
|
env(
|
|
:path => "https://discourse.site",
|
|
:params => {
|
|
"locale" => "fr",
|
|
},
|
|
"HTTP_USER_AGENT" => crawler_user_agent,
|
|
"X-Discourse-Crawler-View" => true,
|
|
),
|
|
)
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to include("text/html")
|
|
expect(response).to eq(html_response)
|
|
end
|
|
|
|
it "appends locale parameter to links in HTML responses when both settings are enabled and crawler header present" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = true
|
|
|
|
# Create a real request with parameters
|
|
test_env =
|
|
Rack::MockRequest.env_for(
|
|
"https://discourse.site/page",
|
|
params: {
|
|
Discourse::LOCALE_PARAM => "fr",
|
|
},
|
|
)
|
|
request = Rack::Request.new(test_env)
|
|
|
|
# Create our middleware and test response transformation
|
|
middleware_instance =
|
|
Middleware::CrawlerHooks.new(
|
|
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
|
|
)
|
|
response = html_response
|
|
|
|
transformed_response =
|
|
middleware_instance.send(:transform_response, request: request, response: response)
|
|
|
|
expect(transformed_response).not_to eq(html_response)
|
|
expect(transformed_response.first).to include("href=\"/test?#{Discourse::LOCALE_PARAM}=fr\"")
|
|
expect(transformed_response.first).to include("href=\"https://discourse.org/page\"")
|
|
expect(Nokogiri::HTML5.parse(transformed_response.first).css("a").size).to eq(2)
|
|
end
|
|
|
|
it "does not modify links in HTML responses when locale parameter is not present" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = true
|
|
|
|
status, headers, response =
|
|
middleware.call(
|
|
env(
|
|
:path => "https://discourse.site",
|
|
"HTTP_USER_AGENT" => crawler_user_agent,
|
|
"X-Discourse-Crawler-View" => true,
|
|
),
|
|
)
|
|
|
|
expect(status).to eq(200)
|
|
expect(headers["Content-Type"]).to include("text/html")
|
|
expect(response).to eq(html_response)
|
|
end
|
|
|
|
it "does not append the locale param to upload download links" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = true
|
|
|
|
base_url = Discourse.base_url
|
|
html = [
|
|
"<html><body>" \
|
|
"<a href=\"/t/topic-slug/123\">Topic</a>" \
|
|
"<a href=\"/uploads/short-url/abc.zip\">Download</a>" \
|
|
"<a href=\"#{base_url}/uploads/default/original/1X/abc.png\">Image</a>" \
|
|
"<a href=\"/secure-uploads/original/1X/def.pdf\">Secure</a>" \
|
|
"<a href=\"/secure-media-uploads/original/1X/ghi.pdf\">Legacy secure</a>" \
|
|
"</body></html>",
|
|
]
|
|
def html.body
|
|
join("")
|
|
end
|
|
|
|
test_env =
|
|
Rack::MockRequest.env_for(
|
|
"https://discourse.site/t/topic-slug/123",
|
|
params: {
|
|
Discourse::LOCALE_PARAM => "fr",
|
|
},
|
|
)
|
|
request = Rack::Request.new(test_env)
|
|
|
|
middleware_instance =
|
|
Middleware::CrawlerHooks.new(
|
|
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
|
|
)
|
|
|
|
transformed_response =
|
|
middleware_instance.send(:transform_response, request: request, response: html)
|
|
|
|
transformed = transformed_response.first
|
|
expect(transformed).to include("href=\"/t/topic-slug/123?#{Discourse::LOCALE_PARAM}=fr\"")
|
|
expect(transformed).to include("href=\"/uploads/short-url/abc.zip\"")
|
|
expect(transformed).to include("href=\"#{base_url}/uploads/default/original/1X/abc.png\"")
|
|
expect(transformed).to include("href=\"/secure-uploads/original/1X/def.pdf\"")
|
|
expect(transformed).to include("href=\"/secure-media-uploads/original/1X/ghi.pdf\"")
|
|
end
|
|
|
|
it "modifies external links that start with the base URL" do
|
|
SiteSetting.content_localization_enabled = true
|
|
SiteSetting.content_localization_crawler_param = true
|
|
|
|
base_url = Discourse.base_url
|
|
html_with_base_url = ["<html><body><a href=\"#{base_url}/cat\">Cats</a></body></html>"]
|
|
def html_with_base_url.body
|
|
join("")
|
|
end
|
|
|
|
test_env =
|
|
Rack::MockRequest.env_for(
|
|
"https://discourse.site",
|
|
params: {
|
|
Discourse::LOCALE_PARAM => "fr",
|
|
},
|
|
)
|
|
request = Rack::Request.new(test_env)
|
|
|
|
middleware_instance =
|
|
Middleware::CrawlerHooks.new(
|
|
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
|
|
)
|
|
|
|
transformed_response =
|
|
middleware_instance.send(
|
|
:transform_response,
|
|
request: request,
|
|
response: html_with_base_url,
|
|
)
|
|
|
|
expect(transformed_response.first).to include(
|
|
"href=\"#{base_url}/cat?#{Discourse::LOCALE_PARAM}=fr\"",
|
|
)
|
|
end
|
|
end
|
|
end
|