discourse/spec/lib/middleware/crawler_hooks_spec.rb
Rafael dos Santos Silva 410f75066b
FIX: Skip crawler localization for direct upload requests (#39849)
Previously, direct upload requests with crawler localization parameters
could still be parsed as HTML when the response carried crawler HTML
headers, raising Nokogiri document depth errors.

This change skips localization rewriting for non-localizable request
paths before parsing the response, including relative URL root installs.
2026-05-08 10:53:35 -03:00

391 lines
12 KiB
Ruby
Vendored

# frozen_string_literal: true
describe Middleware::CrawlerHooks do
let(:crawler_user_agent) { "GoogleBot/2.1 (+https://www.google.com/bot.html)" }
let(:regular_user_agent) { "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" }
let(:html_response) do
html_arr = [
"<html><body><a href=\"/test\">Test</a><a href=\"https://discourse.org/page\">External</a></body></html>",
]
def html_arr.body
join("")
end
html_arr
end
let(:middleware) { Middleware::CrawlerHooks.new(app) }
let(:app) do
lambda do |env|
headers = { "Content-Type" => "text/html; charset=utf-8" }
headers["X-Discourse-Crawler-View"] = "true" if env["X-Discourse-Crawler-View"]
[200, headers, html_response]
end
end
let(:json_middleware) do
Middleware::CrawlerHooks.new(
lambda do |_|
[
200,
{
"Content-Type" => "application/json; charset=utf-8",
"X-Discourse-Crawler-View" => "true",
},
['{ "key": "value" }'],
]
end,
)
end
let(:binary_middleware) do
Middleware::CrawlerHooks.new(
lambda do |_|
[
200,
{
"Content-Type" => "application/zip",
"Content-Disposition" => "attachment; filename=\"file.zip\"",
"X-Discourse-Crawler-View" => "true",
},
["PK\x03\x04binarydata".b],
]
end,
)
end
let(:error_middleware) do
Middleware::CrawlerHooks.new(
lambda do |_|
[
404,
{ "Content-Type" => "text/html; charset=utf-8" },
["<html><body>Not found</body></html>"],
]
end,
)
end
def env(opts = {})
path = opts.delete(:path) || "https://discourse.site"
params = opts.delete(:params) || {}
Rack::MockRequest.env_for(path, params: params).merge(opts)
end
before do
SiteSetting.content_localization_enabled = false
SiteSetting.content_localization_crawler_param = false
end
describe "handling regular users" do
it "does not modify responses for non-crawler requests" do
status, headers, response = middleware.call(env("HTTP_USER_AGENT" => regular_user_agent))
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
end
describe "handling crawler requests" do
it "does not modify responses without X-Discourse-Crawler-View header" do
status, headers, response = middleware.call(env("HTTP_USER_AGENT" => crawler_user_agent))
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
it "does not modify responses for non-HTML content types" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
status, headers, response =
json_middleware.call(
env(
:path => "https://discourse.site",
:params => {
Discourse::LOCALE_PARAM => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("application/json")
expect(response).to eq(['{ "key": "value" }'])
end
it "does not parse binary file downloads as HTML" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
status, headers, response =
binary_middleware.call(
env(
:path => "https://discourse.site/uploads/short-url/abc.zip",
:params => {
Discourse::LOCALE_PARAM => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to eq("application/zip")
expect(response).to eq(["PK\x03\x04binarydata".b])
end
it "does not parse direct upload requests served with an HTML content type" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
response_body = ["<div>" * 500]
def response_body.body
join("")
end
middleware =
Middleware::CrawlerHooks.new(
lambda do |_|
[
200,
{
"Content-Type" => "text/html; charset=utf-8",
"X-Discourse-Crawler-View" => "true",
},
response_body,
]
end,
)
status, headers, response =
middleware.call(
env(
:path => "https://discourse.site/uploads/short-url/abc.zip",
:params => {
Discourse::LOCALE_PARAM => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(response_body)
end
it "does not parse direct upload requests served from a base path" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
Discourse.stubs(:base_path).returns("/forum")
response_body = ["<div>" * 500]
def response_body.body
join("")
end
middleware =
Middleware::CrawlerHooks.new(
lambda do |_|
[
200,
{
"Content-Type" => "text/html; charset=utf-8",
"X-Discourse-Crawler-View" => "true",
},
response_body,
]
end,
)
status, headers, response =
middleware.call(
env(
:path => "https://discourse.site/forum/uploads/short-url/abc.zip",
:params => {
Discourse::LOCALE_PARAM => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(response_body)
end
it "does not modify responses for non-200 status codes" do
status, headers, response =
error_middleware.call(env("HTTP_USER_AGENT" => crawler_user_agent))
expect(status).to eq(404)
expect(response).to eq(["<html><body>Not found</body></html>"])
end
it "does not modify HTML responses when content_localization is disabled" do
SiteSetting.content_localization_enabled = false
SiteSetting.content_localization_crawler_param = true
status, headers, response =
middleware.call(
env(
:path => "https://discourse.site",
:params => {
"locale" => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
"X-Discourse-Crawler-View" => true,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
it "does not modify HTML responses when crawler_param is disabled" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = false
status, headers, response =
middleware.call(
env(
:path => "https://discourse.site",
:params => {
"locale" => "fr",
},
"HTTP_USER_AGENT" => crawler_user_agent,
"X-Discourse-Crawler-View" => true,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
it "appends locale parameter to links in HTML responses when both settings are enabled and crawler header present" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
# Create a real request with parameters
test_env =
Rack::MockRequest.env_for(
"https://discourse.site/page",
params: {
Discourse::LOCALE_PARAM => "fr",
},
)
request = Rack::Request.new(test_env)
# Create our middleware and test response transformation
middleware_instance =
Middleware::CrawlerHooks.new(
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
)
response = html_response
transformed_response =
middleware_instance.send(:transform_response, request: request, response: response)
expect(transformed_response).not_to eq(html_response)
expect(transformed_response.first).to include("href=\"/test?#{Discourse::LOCALE_PARAM}=fr\"")
expect(transformed_response.first).to include("href=\"https://discourse.org/page\"")
expect(Nokogiri::HTML5.parse(transformed_response.first).css("a").size).to eq(2)
end
it "does not modify links in HTML responses when locale parameter is not present" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
status, headers, response =
middleware.call(
env(
:path => "https://discourse.site",
"HTTP_USER_AGENT" => crawler_user_agent,
"X-Discourse-Crawler-View" => true,
),
)
expect(status).to eq(200)
expect(headers["Content-Type"]).to include("text/html")
expect(response).to eq(html_response)
end
it "does not append the locale param to upload download links" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
base_url = Discourse.base_url
html = [
"<html><body>" \
"<a href=\"/t/topic-slug/123\">Topic</a>" \
"<a href=\"/uploads/short-url/abc.zip\">Download</a>" \
"<a href=\"#{base_url}/uploads/default/original/1X/abc.png\">Image</a>" \
"<a href=\"/secure-uploads/original/1X/def.pdf\">Secure</a>" \
"<a href=\"/secure-media-uploads/original/1X/ghi.pdf\">Legacy secure</a>" \
"</body></html>",
]
def html.body
join("")
end
test_env =
Rack::MockRequest.env_for(
"https://discourse.site/t/topic-slug/123",
params: {
Discourse::LOCALE_PARAM => "fr",
},
)
request = Rack::Request.new(test_env)
middleware_instance =
Middleware::CrawlerHooks.new(
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
)
transformed_response =
middleware_instance.send(:transform_response, request: request, response: html)
transformed = transformed_response.first
expect(transformed).to include("href=\"/t/topic-slug/123?#{Discourse::LOCALE_PARAM}=fr\"")
expect(transformed).to include("href=\"/uploads/short-url/abc.zip\"")
expect(transformed).to include("href=\"#{base_url}/uploads/default/original/1X/abc.png\"")
expect(transformed).to include("href=\"/secure-uploads/original/1X/def.pdf\"")
expect(transformed).to include("href=\"/secure-media-uploads/original/1X/ghi.pdf\"")
end
it "modifies external links that start with the base URL" do
SiteSetting.content_localization_enabled = true
SiteSetting.content_localization_crawler_param = true
base_url = Discourse.base_url
html_with_base_url = ["<html><body><a href=\"#{base_url}/cat\">Cats</a></body></html>"]
def html_with_base_url.body
join("")
end
test_env =
Rack::MockRequest.env_for(
"https://discourse.site",
params: {
Discourse::LOCALE_PARAM => "fr",
},
)
request = Rack::Request.new(test_env)
middleware_instance =
Middleware::CrawlerHooks.new(
lambda { |_| [200, { "X-Discourse-Crawler-View" => "true" }, []] },
)
transformed_response =
middleware_instance.send(
:transform_response,
request: request,
response: html_with_base_url,
)
expect(transformed_response.first).to include(
"href=\"#{base_url}/cat?#{Discourse::LOCALE_PARAM}=fr\"",
)
end
end
end