2
0
Fork 0
mirror of https://github.com/discourse/discourse.git synced 2025-10-04 17:32:34 +08:00

FIX: Add ld+json VideoObject to each lazy video (#34678)
Some checks are pending
Licenses / run (push) Waiting to run
Linting / run (push) Waiting to run
Publish Assets / publish-assets (push) Waiting to run
Tests / core backend (push) Waiting to run
Tests / plugins backend (push) Waiting to run
Tests / core frontend (Chrome) (push) Waiting to run
Tests / plugins frontend (push) Waiting to run
Tests / themes frontend (push) Waiting to run
Tests / core system (push) Waiting to run
Tests / plugins system (push) Waiting to run
Tests / themes system (push) Waiting to run
Tests / core frontend (Firefox ESR) (push) Waiting to run
Tests / core frontend (Firefox Evergreen) (push) Waiting to run
Tests / chat system (push) Waiting to run
Tests / merge (push) Blocked by required conditions

Lazily loaded videos currently won't be picked up by search engines as
they do not contain an iframe or VideoObject.

This commit uses a new API `build_plugin_html
"server:topic-show-crawler-post-end"` to insert populated
[ld+json](https://developers.google.com/search/docs/appearance/structured-data/intro-structured-data#supported-formats)
into the end of each post which has a video.

Results:
-
https://search.google.com/test/rich-results/result?id=vegb8Qkz7HThsMbFaeN2AQ
-
https://search.google.com/test/rich-results/result?id=asumX41RneM2tMstchv45A
- (vimeo)
https://search.google.com/test/rich-results/result?id=EZWOMJCkDqEQIHfOLkSkcw

Note: 

There were originally two ways around this 
- add each VideoObject to the `post.cook` all the time agnostic of
crawler view
- or add them to the `<head>` with appropriate metadata, needing to
duplicate all post meta data within a `isPartOf` attribute.

I chose to create a new api so that we can satisfy both "only add the
meta data in crawler view" and prevent post metadata duplication.
This commit is contained in:
Natalie Tay 2025-09-03 22:01:14 +08:00 committed by GitHub
parent 7ca6fe119e
commit a9b1fb9b43
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 402 additions and 11 deletions

View file

@ -593,9 +593,9 @@ module ApplicationHelper
current_user&.user_option&.homepage || HomepageHelper.resolve(request, current_user)
end

def build_plugin_html(name)
def build_plugin_html(name, **kwargs)
return "" unless allow_plugins?
DiscoursePluginRegistry.build_html(name, controller) || ""
DiscoursePluginRegistry.build_html(name, controller, **kwargs) || ""
end

# If there is plugin HTML return that, otherwise yield to the template

View file

@ -120,6 +120,8 @@
<% end %>
</div>
<% end %>

<%= build_plugin_html "server:topic-show-crawler-post-end", post: post %>
</div>
<% end %>
<% end %>

View file

@ -229,9 +229,9 @@ class DiscoursePluginRegistry
html_builders[name] << block
end

def self.build_html(name, ctx = nil)
def self.build_html(name, ctx = nil, **kwargs)
builders = html_builders[name] || []
builders.map { |b| b.call(ctx) }.join("\n").html_safe
builders.map { |b| b.call(ctx, **kwargs) }.join("\n").html_safe
end

def self.seed_paths

View file

@ -14,6 +14,10 @@ module Onebox
path.match?(%r{^(/@.+/video/\d+|/v/\d+)(/\w+)?/?$})
end

def self.embed_url(video_id)
"https://www.tiktok.com/embed/v2/#{video_id}"
end

def placeholder_html
<<-HTML
<img
@ -33,7 +37,7 @@ module Onebox
<<-HTML
<iframe
class="tiktok-onebox"
src="https://www.tiktok.com/embed/v2/#{oembed_data.embed_product_id}"
src="#{self.class.embed_url(oembed_data.embed_product_id)}"
sandbox="allow-popups allow-popups-to-escape-sandbox allow-scripts allow-top-navigation allow-same-origin"
frameborder="0"
seamless="seamless"

View file

@ -14,13 +14,17 @@ module Onebox
path.match?(%r{^/\d+(/\w+)?/?$})
end

def self.embed_url(video_id)
"https://player.vimeo.com/video/#{video_id}"
end

def placeholder_html
::Onebox::Helpers.video_placeholder_html
end

def to_html
video_src = Nokogiri::HTML5.fragment(oembed_data[:html]).at_css("iframe")&.[]("src")
video_src = "https://player.vimeo.com/video/#{oembed_data[:video_id]}" if video_src.blank?
video_src = self.class.embed_url(oembed_data[:video_id]) if video_src.blank?
video_src = video_src.gsub("autoplay=1", "").chomp("?")

<<-HTML

View file

@ -17,11 +17,15 @@ module Onebox
WIDTH = 480
HEIGHT = 360

def self.embed_url(video_id)
"https://www.youtube.com/embed/#{video_id}"
end

def parse_embed_response
return unless video_id
return @parse_embed_response if defined?(@parse_embed_response)

embed_url = "https://www.youtube.com/embed/#{video_id}"
embed_url = self.class.embed_url(video_id)
@embed_doc ||= Onebox::Helpers.fetch_html_doc(embed_url)

begin
@ -60,7 +64,7 @@ module Onebox
if video_id
<<-HTML
<iframe
src="https://www.youtube.com/embed/#{video_id}?#{embed_params}"
src="#{self.class.embed_url(video_id)}?#{embed_params}"
width="#{WIDTH}"
height="#{HEIGHT}"
frameborder="0"

View file

@ -0,0 +1,102 @@
# frozen_string_literal: true

module DiscourseLazyVideos
class CrawlerPostEnd
attr_reader :controller, :post

PROVIDER_NAMES = { "youtube" => "YouTube", "vimeo" => "Vimeo", "tiktok" => "TikTok" }.freeze
SCRIPT_ESCAPE_REGEX = %r{</script}i.freeze
LAZY_VIDEO_CONTAINER = "lazy-video-container".freeze

def initialize(controller, post)
@controller = controller
@post = post
end

def html
return "" if !controller.instance_of?(TopicsController)
return "" if !SiteSetting.lazy_videos_enabled
return "" if !post
return "" if post.cooked.exclude?(LAZY_VIDEO_CONTAINER)

generate_video_schemas
end

private

def generate_video_schemas
videos = extract_videos_from_post(post)
return "" if videos.empty?

@post_excerpt ||= post.excerpt(200, strip_links: true, text_entities: true)

videos
.each_with_object([]) do |video, scripts|
schema = build_video_object(video, post, @post_excerpt)
next if !schema

scripts << build_json_script(schema)
end
.join("\n")
end

def extract_videos_from_post(post)
@parsed_doc ||= Nokogiri::HTML5.fragment(post.cooked)
videos = []

@parsed_doc
.css(".lazy-video-container")
.each do |container|
video_data = {
provider: container["data-provider-name"],
id: container["data-video-id"],
title: container["data-video-title"],
url: container.at_css("a")&.[]("href"),
thumbnail: container.at_css("img")&.[]("src"),
}

videos << video_data if video_data[:provider] && video_data[:id]
end

videos
end

def build_video_object(video, post, post_excerpt)
embed_url = get_embed_url(video[:provider], video[:id])
return nil if !embed_url

schema = {
"@context" => "https://schema.org",
"@type" => "VideoObject",
"name" => video[:title] || "#{PROVIDER_NAMES[video[:provider]] || video[:provider]} Video",
"embedUrl" => embed_url,
"url" => post.full_url,
"uploadDate" => post.created_at.iso8601,
}

schema["description"] = post_excerpt if post_excerpt.present?
schema["thumbnailUrl"] = video[:thumbnail] if video[:thumbnail]
schema["contentUrl"] = video[:url] if video[:url]

schema
end

def build_json_script(schema)
json = MultiJson.dump(schema).gsub(SCRIPT_ESCAPE_REGEX, "<\\/script")
"<script type=\"application/ld+json\">#{json}</script>"
end

def get_embed_url(provider, video_id)
case provider
when "youtube"
Onebox::Engine::YoutubeOnebox.embed_url(video_id)
when "vimeo"
Onebox::Engine::VimeoOnebox.embed_url(video_id)
when "tiktok"
Onebox::Engine::TiktokOnebox.embed_url(video_id)
else
nil
end
end
end
end

View file

@ -10,11 +10,16 @@ enabled_site_setting :lazy_videos_enabled

register_asset "stylesheets/lazy-videos.scss"

require_relative "lib/lazy-videos/lazy_youtube"
require_relative "lib/lazy-videos/lazy_vimeo"
require_relative "lib/lazy-videos/lazy_tiktok"
require_relative "lib/discourse_lazy_videos/lazy_youtube"
require_relative "lib/discourse_lazy_videos/lazy_vimeo"
require_relative "lib/discourse_lazy_videos/lazy_tiktok"
require_relative "lib/discourse_lazy_videos/crawler_post_end"

after_initialize do
register_html_builder("server:topic-show-crawler-post-end") do |controller, post:|
DiscourseLazyVideos::CrawlerPostEnd.new(controller, post).html
end

on(:reduce_cooked) do |fragment|
fragment
.css(".lazy-video-container")

View file

@ -0,0 +1,270 @@
# frozen_string_literal: true

describe DiscourseLazyVideos::CrawlerPostEnd do
fab!(:topic)
fab!(:post1) { Fabricate(:post, topic: topic, post_number: 1) }
fab!(:post2) { Fabricate(:post, topic: topic, post_number: 2) }

let(:controller) { TopicsController.new }
let(:topic_view) { TopicView.new(topic) }
let(:post_crawler_schema) { described_class.new(controller, post1) }

before do
SiteSetting.lazy_videos_enabled = true
controller.instance_variable_set(:@topic_view, topic_view)
end

describe "#html" do
context "when not on TopicsController" do
let(:controller) { ApplicationController.new }
let(:schema) { described_class.new(controller, post1) }

it "returns empty string" do
expect(schema.html).to eq("")
end
end

context "when lazy_videos_enabled is false" do
before { SiteSetting.lazy_videos_enabled = false }

it "returns empty string" do
expect(post_crawler_schema.html).to eq("")
end
end

context "when no post is set" do
let(:schema) { described_class.new(controller, nil) }

it "returns empty string" do
expect(schema.html).to eq("")
end
end

context "when post has no videos" do
it "returns empty string" do
post1.update!(cooked: "<p>Just some text without videos</p>")

expect(post_crawler_schema.html).to eq("")
end
end

context "when first post has a YouTube video" do
before { post1.update!(cooked: <<~HTML) }
<div class="youtube-onebox lazy-video-container"
data-video-id="dQw4w9WgXcQ"
data-video-title="Rick Astley - Never Gonna Give You Up"
data-provider-name="youtube">
<a href="https://www.youtube.com/watch?v=dQw4w9WgXcQ">
<img src="https://img.youtube.com/vi/dQw4w9WgXcQ/maxresdefault.jpg"
title="Rick Astley - Never Gonna Give You Up">
</a>
</div>
HTML

it "generates VideoObject schema" do
html = post_crawler_schema.html

expect(html).to include('<script type="application/ld+json">')
expect(html).to include('"@type":"VideoObject"')
expect(html).to include('"name":"Rick Astley - Never Gonna Give You Up"')
expect(html).to include('"embedUrl":"https://www.youtube.com/embed/dQw4w9WgXcQ"')
expect(html).to include(
'"thumbnailUrl":"https://img.youtube.com/vi/dQw4w9WgXcQ/maxresdefault.jpg"',
)
end

it "includes post URL and upload date" do
html = post_crawler_schema.html

expect(html).to include('"url":"' + post1.full_url)
expect(html).to include('"uploadDate":"' + post1.created_at.iso8601)
end
end

context "when non-first post has a video" do
before { post2.update!(cooked: <<~HTML) }
<div class="vimeo-onebox lazy-video-container"
data-video-id="123456"
data-video-title="Vimeo Video"
data-provider-name="vimeo">
<a href="https://vimeo.com/123456">
<img src="https://example.com/thumbnail.jpg">
</a>
</div>
HTML

it "uses the comment post URL" do
schema = described_class.new(controller, post2)
html = schema.html

expect(html).to include('"url":"' + post2.full_url)
end
end

context "when post has multiple videos" do
before { post1.update!(cooked: <<~HTML) }
<div class="youtube-onebox lazy-video-container"
data-video-id="video1"
data-video-title="Video 1"
data-provider-name="youtube">
<a href="https://www.youtube.com/watch?v=video1">
<img src="https://img.youtube.com/vi/video1/maxresdefault.jpg">
</a>
</div>
<div class="vimeo-onebox lazy-video-container"
data-video-id="123456"
data-video-title="Video 2"
data-provider-name="vimeo">
<a href="https://vimeo.com/123456">
<img src="https://example.com/thumbnail.jpg">
</a>
</div>
HTML

it "generates separate schemas for each video" do
html = post_crawler_schema.html

expect(html.scan('<script type="application/ld+json">').count).to eq(2)
expect(html).to include('"embedUrl":"https://www.youtube.com/embed/video1"')
expect(html).to include('"embedUrl":"https://player.vimeo.com/video/123456"')
end
end

context "when post has a TikTok video" do
before { post1.update!(cooked: <<~HTML) }
<div class="tiktok-onebox lazy-video-container"
data-video-id="7181751442041220378"
data-video-title="TikTok Video"
data-provider-name="tiktok">
<a href="https://www.tiktok.com/@user/video/7181751442041220378">
<img src="https://example.com/tiktok-thumbnail.jpg">
</a>
</div>
HTML

it "generates TikTok VideoObject schema" do
html = post_crawler_schema.html

expect(html).to include('"embedUrl":"https://www.tiktok.com/embed/v2/7181751442041220378"')
end
end

context "with escaping and security" do
before { post1.update!(cooked: <<~HTML) }
<div class="youtube-onebox lazy-video-container"
data-video-id="test123"
data-video-title="Title with &quot;quotes&quot; and &lt;tags&gt;"
data-provider-name="youtube">
<a href="https://youtube.com/watch?v=test123">
<img src="https://img.youtube.com/vi/test123/maxresdefault.jpg">
</a>
</div>
HTML

it "properly escapes special characters" do
html = post_crawler_schema.html

expect(html).to include('<script type="application/ld+json">')
expect(html).to include("</script>")

parsed = JSON.parse(html.match(%r{<script[^>]*>(.*?)</script>}m)[1])
expect(parsed["name"]).to eq('Title with "quotes" and <tags>')
end
end

context "with missing video data" do
before { post1.update!(cooked: <<~HTML) }
<div class="lazy-video-container"
data-video-id="test123">
<a href="https://youtube.com/watch?v=test123">
<img src="https://img.youtube.com/vi/test123/maxresdefault.jpg">
</a>
</div>
HTML

it "returns empty when provider is missing" do
expect(post_crawler_schema.html).to eq("")
end
end

context "with invalid provider" do
before { post1.update!(cooked: <<~HTML) }
<div class="lazy-video-container"
data-video-id="test123"
data-provider-name="invalid">
<a href="https://example.com/video">
<img src="https://example.com/thumb.jpg">
</a>
</div>
HTML

it "returns empty for unsupported providers" do
expect(post_crawler_schema.html).to eq("")
end
end

context "when contentUrl is provided" do
before { post1.update!(cooked: <<~HTML) }
<div class="youtube-onebox lazy-video-container"
data-video-id="test123"
data-video-title="Test Video"
data-provider-name="youtube">
<a href="https://www.youtube.com/watch?v=test123">
<img src="https://img.youtube.com/vi/test123/maxresdefault.jpg">
</a>
</div>
HTML

it "includes contentUrl from the anchor href" do
html = post_crawler_schema.html

expect(html).to include('"contentUrl":"https://www.youtube.com/watch?v=test123"')
end
end

context "when post has both text content and video" do
before do
post1.update!(
raw: "Check out this amazing video! It's a classic that everyone should watch.",
)

post1.update_columns(cooked: <<~HTML)
<p>Check out this amazing video! It's a classic that everyone should watch.</p>
<div class="youtube-onebox lazy-video-container"
data-video-id="test123"
data-video-title="Test Video"
data-provider-name="youtube">
<a href="https://www.youtube.com/watch?v=test123">
<img src="https://img.youtube.com/vi/test123/maxresdefault.jpg">
</a>
</div>
HTML

post1.reload
end

it "includes description from post excerpt" do
expect(post1.raw).to include("Check out this amazing video")
expect(post1.cooked).to include("lazy-video-container")

excerpt = post1.excerpt(200, strip_links: true, text_entities: true)
expect(excerpt).to be_present
expect(excerpt).to include("Check out")

html = post_crawler_schema.html

if html.empty?
puts "Controller class: #{controller.class}"
puts "Crawler post: #{controller.instance_variable_get(:@crawler_post)&.id}"
puts "Topic view: #{controller.instance_variable_get(:@topic_view)}"
puts "SiteSetting.lazy_videos_enabled: #{SiteSetting.lazy_videos_enabled}"
end

expect(html).not_to be_empty
expect(html).to include('"description":')
parsed = JSON.parse(html.match(%r{<script[^>]*>(.*?)</script>}m)[1])
expect(parsed["description"]).to include("Check out this amazing video")
end
end
end
end