discourse/spec/lib/middleware/crawler_hooks_spec.rb
Rafael dos Santos Silva 48518e8d00
PERF: Skip locale param rewriting for upload paths in CrawlerHooks (#39611)
**Previously**, the `CrawlerHooks` middleware appended `?tl=<locale>` to
every internal `<a>` href in the crawler-rendered HTML, including links
to `/uploads/`, `/secure-uploads/`, and `/secure-media-uploads/` — which
are binary file downloads where the locale param has no effect, but it
created duplicate per-locale variants for crawlers to fetch.

**In this update**, paths under those prefixes are skipped during link
rewriting so each upload has a single canonical URL.
2026-04-28 13:31:52 -03:00

312 lines
10 KiB
Ruby

# frozen_string_literal: true
describe Middleware::CrawlerHooks do
let(:crawler_user_agent) { "GoogleBot/2.1 (+https://www.google.com/bot.html)" }
let(:regular_user_agent) { "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" }
let(:html_response) do
html_arr = [
"<html><body><a href=\"/test\">Test</a><a href=\"https://discourse.org/page\">External</a></body></html>",
]
def html_arr.body
join("")
end
html_arr
end
let(:middleware) { Middleware::CrawlerHooks.new(app) }
let(:app) do
lambda do |env|
headers = { "Content-Type" => "text/html; charset=utf-8" }
headers["X-Discourse-Crawler-View"] = "true" if env["X-Discourse-Crawler-View"]
[200, headers, html_response]
end
end
let(:json_middleware) do
Middleware::CrawlerHooks.new(
lambda do |_|
[
200,
{
"Content-Type" => "application/json; charset=utf-8",
"X-Discourse-Crawler-View" => "true",
},
['{ "key": "value" }'],
]
end,
)
end
let(:binary_middleware) do
Middleware::CrawlerHooks.new(
lambda do |_|
[
200,
{
"Content-Type" => "application/zip",
"Content-Disposition" => "attachment; filename=\"file.zip\"",
"X-Discourse-Crawler-View" => "true",
},
["PK\x03\x04binarydata".b],
]
end,
)
end
let(:error_middleware) do
Middleware::CrawlerHooks.new(
lambda do |_|
[
404,
{ "Content-Type" => "text/html; charset=utf-8" },
["<html><body>Not found</body></html>"],
]
end,
)
end
def env(opts = {})
path = opts.delete(:path) || "https://discourse.site"
params = opts.delete(:params) || {}
Rack::MockRequest.env_for(path, params: params).merge(opts)
end
before do
SiteSetting.content_localization_enabled = false
SiteSetting.content_localization_crawler_param = false
end
describe "handling regular users" do
it "does not modify responses for non-crawler requests" do
status, headers, response = middleware.call(env("HTTP_USER_AGENT" => regular_user_agent))
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
end
describe "handling crawler requests" do
it "does not modify responses without X-Discourse-Crawler-View header" do
status, headers, response = middleware.call(env("HTTP_USER_AGENT" => crawler_user_agent))
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
it "does not modify responses for non-HTML content types" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
status, headers, response =
json_middleware.call(
env(
:path => "https://discourse.site",
:params => {
Discourse::LOCALE_PARAM => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("application/json")
expect(response).to eq(['{ "key": "value" }'])
end
it "does not parse binary file downloads as HTML" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
status, headers, response =
binary_middleware.call(
env(
:path => "https://discourse.site/uploads/short-url/abc.zip",
:params => {
Discourse::LOCALE_PARAM => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to eq("application/zip")
expect(response).to eq(["PK\x03\x04binarydata".b])
end
it "does not modify responses for non-200 status codes" do
status, headers, response =
error_middleware.call(env("HTTP_USER_AGENT" => crawler_user_agent))
expect(status).to eq(404)
expect(response).to eq(["<html><body>Not found</body></html>"])
end
it "does not modify HTML responses when content_localization is disabled" do
SiteSetting.content_localization_enabled = false
SiteSetting.content_localization_crawler_param = true
status, headers, response =
middleware.call(
env(
:path => "https://discourse.site",
:params => {
"locale" => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
"X-Discourse-Crawler-View" => true,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
it "does not modify HTML responses when crawler_param is disabled" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = false
status, headers, response =
middleware.call(
env(
:path => "https://discourse.site",
:params => {
"locale" => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
"X-Discourse-Crawler-View" => true,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
it "appends locale parameter to links in HTML responses when both settings are enabled and crawler header present" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
# Create a real request with parameters
test_env =
Rack::MockRequest.env_for(
"https://discourse.site/page",
params: {
Discourse::LOCALE_PARAM => "fr",
},
)
request = Rack::Request.new(test_env)
# Create our middleware and test response transformation
middleware_instance =
Middleware::CrawlerHooks.new(
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
)
response = html_response
transformed_response =
middleware_instance.send(:transform_response, request: request, response: response)
expect(transformed_response).not_to eq(html_response)
expect(transformed_response.first).to include("href=\"/test?#{Discourse::LOCALE_PARAM}=fr\"")
expect(transformed_response.first).to include("href=\"https://discourse.org/page\"")
expect(Nokogiri::HTML5.parse(transformed_response.first).css("a").size).to eq(2)
end
it "does not modify links in HTML responses when locale parameter is not present" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
status, headers, response =
middleware.call(
env(
:path => "https://discourse.site",
"HTTP_USER_AGENT" => crawler_user_agent,
"X-Discourse-Crawler-View" => true,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
it "does not append the locale param to upload download links" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
base_url = Discourse.base_url
html = [
"<html><body>" \
"<a href=\"/t/topic-slug/123\">Topic</a>" \
"<a href=\"/uploads/short-url/abc.zip\">Download</a>" \
"<a href=\"#{base_url}/uploads/default/original/1X/abc.png\">Image</a>" \
"<a href=\"/secure-uploads/original/1X/def.pdf\">Secure</a>" \
"<a href=\"/secure-media-uploads/original/1X/ghi.pdf\">Legacy secure</a>" \
"</body></html>",
]
def html.body
join("")
end
test_env =
Rack::MockRequest.env_for(
"https://discourse.site/t/topic-slug/123",
params: {
Discourse::LOCALE_PARAM => "fr",
},
)
request = Rack::Request.new(test_env)
middleware_instance =
Middleware::CrawlerHooks.new(
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
)
transformed_response =
middleware_instance.send(:transform_response, request: request, response: html)
transformed = transformed_response.first
expect(transformed).to include("href=\"/t/topic-slug/123?#{Discourse::LOCALE_PARAM}=fr\"")
expect(transformed).to include("href=\"/uploads/short-url/abc.zip\"")
expect(transformed).to include("href=\"#{base_url}/uploads/default/original/1X/abc.png\"")
expect(transformed).to include("href=\"/secure-uploads/original/1X/def.pdf\"")
expect(transformed).to include("href=\"/secure-media-uploads/original/1X/ghi.pdf\"")
end
it "modifies external links that start with the base URL" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
base_url = Discourse.base_url
html_with_base_url = ["<html><body><a href=\"#{base_url}/cat\">Cats</a></body></html>"]
def html_with_base_url.body
join("")
end
test_env =
Rack::MockRequest.env_for(
"https://discourse.site",
params: {
Discourse::LOCALE_PARAM => "fr",
},
)
request = Rack::Request.new(test_env)
middleware_instance =
Middleware::CrawlerHooks.new(
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
)
transformed_response =
middleware_instance.send(
:transform_response,
request: request,
response: html_with_base_url,
)
expect(transformed_response.first).to include(
"href=\"#{base_url}/cat?#{Discourse::LOCALE_PARAM}=fr\"",
)
end
end
end