Mirror of https://github.com/discourse/discourse.git
PERF: ability to crawl for titles without extra HEAD req
Also introduces a much more aggressive timeout for title crawling and requests gzip encoding for the crawled body.
commit fa5880e04f
parent 1f6adbea5c
4 changed files with 176 additions and 26 deletions
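
In practice this means a title crawl is now a single streaming GET: the caller passes FinalDestination#get a block, receives body chunks as they arrive, and throws :done once it has seen enough, instead of issuing a separate HEAD request first. A minimal sketch of that usage (the URL is a placeholder; FinalDestination#get and the timeout: option are the additions made in this commit):

    fd = FinalDestination.new("http://example.com", timeout: 1)

    final_url = fd.get do |_response, chunk, _uri|
      puts chunk[0..80]   # a piece of the response body as it streams in
      throw :done         # stop here; safe_get catches :done and aborts the read
    end

    puts final_url        # resolved destination URL as a string, or nil on failure

The spec added further down exercises the same flow against stubbed wikipedia.com responses.
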
@@ -51,12 +51,17 @@ class FinalDestination
     @cookie = nil
     @limited_ips = []
     @verbose = @opts[:verbose] || false
+    @timeout = @opts[:timeout] || nil
   end
 
   def self.connection_timeout
     20
   end
 
+  def timeout
+    @timeout || FinalDestination.connection_timeout
+  end
+
   def redirected?
     @limit < @opts[:max_redirects]
   end
@@ -75,12 +80,58 @@ class FinalDestination
 
   def small_get(headers)
     Net::HTTP.start(@uri.host, @uri.port, use_ssl: @uri.is_a?(URI::HTTPS)) do |http|
-      http.open_timeout = FinalDestination.connection_timeout
-      http.read_timeout = FinalDestination.connection_timeout
+      http.open_timeout = timeout
+      http.read_timeout = timeout
       http.request_get(@uri.request_uri, headers)
     end
   end
 
+  # this is a new interface for simply getting
+  # N bytes accounting for all internal logic
+  def get(uri = @uri, redirects = @limit, extra_headers: {}, &blk)
+    raise "Must specify block" unless block_given?
+
+    if uri && uri.port == 80 && FinalDestination.is_https_domain?(uri.hostname)
+      uri.scheme = "https"
+      uri = URI(uri.to_s)
+    end
+
+    unless validate_uri
+      return nil
+    end
+
+    result, (location, cookie) = safe_get(uri, &blk)
+
+    if result == :redirect && (redirects == 0 || !location)
+      return nil
+    end
+
+    if result == :redirect
+      old_port = uri.port
+
+      location = "#{uri.scheme}://#{uri.host}#{location}" if location[0] == "/"
+      uri = URI(location) rescue nil
+
+      # https redirect, so just cache that whole new domain is https
+      if old_port == 80 && uri&.port == 443 && (URI::HTTPS === uri)
+        FinalDestination.cache_https_domain(uri.hostname)
+      end
+
+      return nil if !uri
+
+      extra = nil
+      if cookie
+        extra = { 'Cookie' => cookie }
+      end
+
+      get(uri, redirects - 1, extra_headers: extra, &blk)
+    elsif result == :ok
+      uri.to_s
+    else
+      nil
+    end
+  end
+
   def resolve
     if @uri && @uri.port == 80 && FinalDestination.is_https_domain?(@uri.hostname)
       @uri.scheme = "https"
@@ -108,7 +159,7 @@ class FinalDestination
       headers = request_headers
       response = Excon.public_send(@http_verb,
         @uri.to_s,
-        read_timeout: FinalDestination.connection_timeout,
+        read_timeout: timeout,
         headers: headers
       )
 
@@ -135,7 +186,7 @@ class FinalDestination
         headers['set-cookie'] = cookie_val.join
       end
 
-      # TODO this is confusing why grap location for anything not
+      # TODO this is confusing why grab location for anything not
       # between 300-400 ?
       if location_val = get_response.get_fields('location')
         headers['location'] = location_val.join
@@ -279,4 +330,69 @@ class FinalDestination
     nil
   end
 
+  protected
+
+  def safe_get(uri)
+
+    result = nil
+    unsafe_close = false
+
+    safe_session(uri) do |http|
+
+      headers = request_headers.merge(
+        'Accept-Encoding' => 'gzip',
+        'Host' => uri.host
+      )
+
+      req = Net::HTTP::Get.new(uri.request_uri, headers)
+
+      http.request(req) do |resp|
+        if Net::HTTPRedirection === resp
+          result = :redirect, [resp['location'], resp['Set-Cookie']]
+        end
+
+        if Net::HTTPSuccess === resp
+
+          resp.decode_content = true
+          resp.read_body { |chunk|
+            read_next = true
+
+            catch(:done) do
+              if read_next
+                read_next = false
+                yield resp, chunk, uri
+                read_next = true
+              end
+            end
+
+            # no clean way of finishing abruptly cause
+            # response likes reading till the end
+            if !read_next
+              unsafe_close = true
+              http.finish
+              raise StandardError
+            end
+          }
+          result = :ok
+        end
+      end
+    end
+
+    result
+  rescue StandardError
+    if unsafe_close
+      :ok
+    else
+      raise
+    end
+  end
+
+  def safe_session(uri)
+    Net::HTTP.start(uri.host, uri.port, use_ssl: (uri.scheme == "https")) do |http|
+      http.read_timeout = timeout
+      http.open_timeout = timeout
+      yield http
+    end
+  end
+
 end
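
The early exit inside safe_get is built on Ruby's Kernel#catch / Kernel#throw rather than on exceptions (which the old RetrieveTitle code abused for flow control). A standalone sketch of the pattern, using made-up chunks rather than a real HTTP response:

    chunks = ["<html><head>", "<title>hi</title>", "<p>lots more markup...</p>"]

    collected = +""
    chunks.each do |chunk|
      done = catch(:done) do
        collected << chunk
        throw :done, true if collected.include?("</title>")
        false              # value of catch when nothing was thrown
      end
      break if done
    end

    puts collected         # => "<html><head><title>hi</title>"

In safe_get the catch(:done) wraps each yielded chunk inside resp.read_body; because Net::HTTP still wants to drain the rest of the response, the read_next / unsafe_close bookkeeping plus http.finish is what actually tears the connection down once the block has thrown.
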
@@ -1,10 +1,10 @@
 require_dependency 'final_destination'
 
 module RetrieveTitle
-  class ReadEnough < StandardError; end
+  CRAWL_TIMEOUT = 1
 
   def self.crawl(url)
-    extract_title(fetch_beginning(url))
+    fetch_title(url)
   rescue Exception
     # If there was a connection error, do nothing
   end
@@ -41,7 +41,7 @@ module RetrieveTitle
 
     # Amazon and YouTube leave the title until very late. Exceptions are bad
     # but these are large sites.
-    return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
+    return 500 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/
     return 300 if uri.host =~ /youtube\.com$/ || uri.host =~ /youtu.be/
 
     # default is 10k
@@ -49,27 +49,24 @@ module RetrieveTitle
   end
 
   # Fetch the beginning of a HTML document at a url
-  def self.fetch_beginning(url)
-    fd = FinalDestination.new(url)
-    uri = fd.resolve
-    return "" unless uri
+  def self.fetch_title(url)
+    fd = FinalDestination.new(url, timeout: CRAWL_TIMEOUT)
 
-    result = ""
-    streamer = lambda do |chunk, _, _|
-      result << chunk
+    current = nil
+    title = nil
 
-      # Using exceptions for flow control is really bad, but there really seems to
-      # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
-      # that matter!)
-      raise ReadEnough.new if result.size > (max_chunk_size(uri) * 1024)
+    fd.get do |_response, chunk, uri|
+
+      if current
+        current << chunk
+      else
+        current = chunk
+      end
+
+      max_size = max_chunk_size(uri) * 1024
+      title = extract_title(current)
+      throw :done if title || max_size < current.length
     end
-    Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: fd.request_headers)
-    result
-
-  rescue Excon::Errors::SocketError => ex
-    return result if ex.socket_error.is_a?(ReadEnough)
-    raise
-  rescue ReadEnough
-    result
+    return title || ""
   end
 end
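
fetch_title simply accumulates chunks and re-runs extract_title until either a title shows up or max_chunk_size is exceeded. extract_title itself is not part of this diff, so the sketch below substitutes a hypothetical regex-based stand-in purely to show the accumulate, check, stop loop:

    # hypothetical stand-in for RetrieveTitle.extract_title (not the real implementation)
    def title_from(html)
      html[/<title[^>]*>(.*?)<\/title>/im, 1]&.strip
    end

    buffer    = +""
    max_bytes = 10 * 1024   # the "default is 10k" limit referenced above

    ["<html><head>", "<title>very amazing</title>", "</head>"].each do |chunk|
      buffer << chunk
      break if title_from(buffer) || buffer.bytesize > max_bytes
    end

    puts title_from(buffer)   # => "very amazing"
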
@@ -207,6 +207,32 @@ describe FinalDestination do
     end
   end
 
+  describe '.get' do
+
+    it "can correctly stream with a redirect" do
+
+      FinalDestination.clear_https_cache!("wikipedia.com")
+
+      stub_request(:get, "http://wikipedia.com/").
+        to_return(status: 302, body: "", headers: { "location" => "https://wikipedia.com/" })
+
+      # webmock does not do chunks
+      stub_request(:get, "https://wikipedia.com/").
+        to_return(status: 200, body: "<html><head>", headers: {})
+
+      result = nil
+      chunk = nil
+
+      result = FinalDestination.new("http://wikipedia.com", opts).get do |resp, c|
+        chunk = c
+        throw :done
+      end
+
+      expect(result).to eq("https://wikipedia.com/")
+      expect(chunk).to eq("<html><head>")
+    end
+  end
+
   describe '.validate_uri' do
     context "host lookups" do
       it "works for various hosts" do

@@ -54,7 +54,18 @@ describe RetrieveTitle do
       )
       expect(title).to eq("Video Title")
     end
   end
 
+  context "crawl" do
+    it "can properly extract a title from a url" do
+      stub_request(:get, "https://brelksdjflaskfj.com/amazing")
+        .to_return(status: 200, body: "<html><title>very amazing</title>")
+
+      # we still resolve the IP address for every host
+      IPSocket.stubs(:getaddress).returns('100.2.3.4')
+
+      expect(RetrieveTitle.crawl("https://brelksdjflaskfj.com/amazing")).to eq("very amazing")
+    end
+  end
 
 end